267 files changed, 41961 insertions, 21381 deletions
diff --git a/src/mesa/drivers/common/driverfuncs.c b/src/mesa/drivers/common/driverfuncs.c
index 6a98c29a3d1..3b397fef7de 100644
--- a/src/mesa/drivers/common/driverfuncs.c
+++ b/src/mesa/drivers/common/driverfuncs.c
@@ -202,16 +202,11 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->GetIntegerv = NULL;
    driver->GetPointerv = NULL;
    
-#if FEATURE_ARB_vertex_buffer_object
-   driver->NewBufferObject = _mesa_new_buffer_object;
-   driver->DeleteBuffer = _mesa_delete_buffer_object;
-   driver->BindBuffer = NULL;
-   driver->BufferData = _mesa_buffer_data;
-   driver->BufferSubData = _mesa_buffer_subdata;
-   driver->GetBufferSubData = _mesa_buffer_get_subdata;
-   driver->MapBuffer = _mesa_buffer_map;
-   driver->UnmapBuffer = _mesa_buffer_unmap;
-#endif
+   /* buffer objects */
+   _mesa_init_buffer_object_functions(driver);
+
+   /* query objects */
+   _mesa_init_query_object_functions(driver);
 
 #if FEATURE_EXT_framebuffer_object
    driver->NewFramebuffer = _mesa_new_framebuffer;
@@ -225,14 +220,6 @@ _mesa_init_driver_functions(struct dd_function_table *driver)
    driver->BlitFramebuffer = _swrast_BlitFramebuffer;
 #endif
 
-   /* query objects */
-   driver->NewQueryObject = _mesa_new_query_object;
-   driver->DeleteQuery = _mesa_delete_query;
-   driver->BeginQuery = _mesa_begin_query;
-   driver->EndQuery = _mesa_end_query;
-   driver->WaitQuery = _mesa_wait_query;
-   driver->CheckQuery = _mesa_check_query;
-
    /* APPLE_vertex_array_object */
    driver->NewArrayObject = _mesa_new_array_object;
    driver->DeleteArrayObject = _mesa_delete_array_object;
diff --git a/src/mesa/drivers/dri/Makefile.template b/src/mesa/drivers/dri/Makefile.template
index 5c01d233c13..18dbeba24a8 100644
--- a/src/mesa/drivers/dri/Makefile.template
+++ b/src/mesa/drivers/dri/Makefile.template
@@ -11,7 +11,8 @@ COMMON_GALLIUM_SOURCES = \
 COMMON_SOURCES = $(COMMON_GALLIUM_SOURCES) \
         ../../common/driverfuncs.c \
         ../common/texmem.c \
-        ../common/drirenderbuffer.c
+        ../common/drirenderbuffer.c \
+	../common/dri_metaops.c
 
 ifeq ($(WINDOW_SYSTEM),dri)
 WINOBJ=
@@ -92,7 +93,7 @@ clean:
 
 install: $(LIBNAME)
 	$(INSTALL) -d $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
-	$(INSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
+	$(MINSTALL) -m 755 $(LIBNAME) $(DESTDIR)$(DRI_DRIVER_INSTALL_DIR)
 
 
 -include depend
diff --git a/src/mesa/drivers/dri/common/dri_metaops.c b/src/mesa/drivers/dri/common/dri_metaops.c
new file mode 100644
index 00000000000..fe183c2e870
--- /dev/null
+++ b/src/mesa/drivers/dri/common/dri_metaops.c
@@ -0,0 +1,541 @@
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "main/arrayobj.h"
+#include "main/attrib.h"
+#include "main/blend.h"
+#include "main/bufferobj.h"
+#include "main/buffers.h"
+#include "main/depth.h"
+#include "main/enable.h"
+#include "main/matrix.h"
+#include "main/macros.h"
+#include "main/polygon.h"
+#include "main/shaders.h"
+#include "main/stencil.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "main/viewport.h"
+#include "shader/arbprogram.h"
+#include "shader/program.h"
+#include "dri_metaops.h"
+
+void
+meta_set_passthrough_transform(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   meta->saved_vp_x = ctx->Viewport.X;
+   meta->saved_vp_y = ctx->Viewport.Y;
+   meta->saved_vp_width = ctx->Viewport.Width;
+   meta->saved_vp_height = ctx->Viewport.Height;
+   meta->saved_matrix_mode = ctx->Transform.MatrixMode;
+
+   meta->internal_viewport_call = GL_TRUE;
+   _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
+   meta->internal_viewport_call = GL_FALSE;
+
+   _mesa_MatrixMode(GL_PROJECTION);
+   _mesa_PushMatrix();
+   _mesa_LoadIdentity();
+   _mesa_Ortho(0, ctx->DrawBuffer->Width, 0, ctx->DrawBuffer->Height, 1, -1);
+
+   _mesa_MatrixMode(GL_MODELVIEW);
+   _mesa_PushMatrix();
+   _mesa_LoadIdentity();
+}
+
+void
+meta_restore_transform(struct dri_metaops *meta)
+{
+   _mesa_MatrixMode(GL_PROJECTION);
+   _mesa_PopMatrix();
+   _mesa_MatrixMode(GL_MODELVIEW);
+   _mesa_PopMatrix();
+
+   _mesa_MatrixMode(meta->saved_matrix_mode);
+
+   meta->internal_viewport_call = GL_TRUE;
+   _mesa_Viewport(meta->saved_vp_x, meta->saved_vp_y,
+		  meta->saved_vp_width, meta->saved_vp_height);
+   meta->internal_viewport_call = GL_FALSE;
+}
+
+
+/**
+ * Set up a vertex program to pass through the position and first texcoord
+ * for pixel path.
+ */
+void
+meta_set_passthrough_vertex_program(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+   static const char *vp =
+      "!!ARBvp1.0\n"
+      "TEMP vertexClip;\n"
+      "DP4 vertexClip.x, state.matrix.mvp.row[0], vertex.position;\n"
+      "DP4 vertexClip.y, state.matrix.mvp.row[1], vertex.position;\n"
+      "DP4 vertexClip.z, state.matrix.mvp.row[2], vertex.position;\n"
+      "DP4 vertexClip.w, state.matrix.mvp.row[3], vertex.position;\n"
+      "MOV result.position, vertexClip;\n"
+      "MOV result.texcoord[0], vertex.texcoord[0];\n"
+      "MOV result.color, vertex.color;\n"
+      "END\n";
+
+   assert(meta->saved_vp == NULL);
+
+   _mesa_reference_vertprog(ctx, &meta->saved_vp,
+			    ctx->VertexProgram.Current);
+   if (meta->passthrough_vp == NULL) {
+      GLuint prog_name;
+      _mesa_GenPrograms(1, &prog_name);
+      _mesa_BindProgram(GL_VERTEX_PROGRAM_ARB, prog_name);
+      _mesa_ProgramStringARB(GL_VERTEX_PROGRAM_ARB,
+			     GL_PROGRAM_FORMAT_ASCII_ARB,
+			     strlen(vp), (const GLubyte *)vp);
+      _mesa_reference_vertprog(ctx, &meta->passthrough_vp,
+			       ctx->VertexProgram.Current);
+      _mesa_DeletePrograms(1, &prog_name);
+   }
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
+			    meta->passthrough_vp);
+   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
+			   &meta->passthrough_vp->Base);
+
+   meta->saved_vp_enable = ctx->VertexProgram.Enabled;
+   _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
+}
+
+/**
+ * Restores the previous vertex program after
+ * meta_set_passthrough_vertex_program()
+ */
+void
+meta_restore_vertex_program(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
+			    meta->saved_vp);
+   _mesa_reference_vertprog(ctx, &meta->saved_vp, NULL);
+   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
+			   &ctx->VertexProgram.Current->Base);
+
+   if (!meta->saved_vp_enable)
+      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
+}
+
+/**
+ * Binds the given program string to GL_FRAGMENT_PROGRAM_ARB, caching the
+ * program object.
+ */
+void
+meta_set_fragment_program(struct dri_metaops *meta,
+			  struct gl_fragment_program **prog,
+			  const char *prog_string)
+{
+   GLcontext *ctx = meta->ctx;
+   assert(meta->saved_fp == NULL);
+
+   _mesa_reference_fragprog(ctx, &meta->saved_fp,
+			    ctx->FragmentProgram.Current);
+   if (*prog == NULL) {
+      GLuint prog_name;
+      _mesa_GenPrograms(1, &prog_name);
+      _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, prog_name);
+      _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB,
+			     GL_PROGRAM_FORMAT_ASCII_ARB,
+			     strlen(prog_string), (const GLubyte *)prog_string);
+      _mesa_reference_fragprog(ctx, prog, ctx->FragmentProgram.Current);
+      /* Note that DeletePrograms unbinds the program on us */
+      _mesa_DeletePrograms(1, &prog_name);
+   }
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, *prog);
+   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, &((*prog)->Base));
+
+   meta->saved_fp_enable = ctx->FragmentProgram.Enabled;
+   _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
+}
+
+/**
+ * Restores the previous fragment program after
+ * meta_set_fragment_program()
+ */
+void
+meta_restore_fragment_program(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
+   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current,
+			    meta->saved_fp);
+   _mesa_reference_fragprog(ctx, &meta->saved_fp, NULL);
+   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
+			   &ctx->FragmentProgram.Current->Base);
+
+   if (!meta->saved_fp_enable)
+      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
+}
+
+static const float default_texcoords[4][2] = { { 0.0, 0.0 },
+					       { 1.0, 0.0 },
+					       { 1.0, 1.0 },
+					       { 0.0, 1.0 } };
+
+void
+meta_set_default_texrect(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+   struct gl_client_array *old_texcoord_array;
+
+   meta->saved_active_texture = ctx->Texture.CurrentUnit;
+   if (meta->saved_array_vbo == NULL) {
+      _mesa_reference_buffer_object(ctx, &meta->saved_array_vbo,
+				    ctx->Array.ArrayBufferObj);
+   }
+
+   old_texcoord_array = &ctx->Array.ArrayObj->TexCoord[0];
+   meta->saved_texcoord_type = old_texcoord_array->Type;
+   meta->saved_texcoord_size = old_texcoord_array->Size;
+   meta->saved_texcoord_stride = old_texcoord_array->Stride;
+   meta->saved_texcoord_enable = old_texcoord_array->Enabled;
+   meta->saved_texcoord_ptr = old_texcoord_array->Ptr;
+   _mesa_reference_buffer_object(ctx, &meta->saved_texcoord_vbo,
+				 old_texcoord_array->BufferObj);
+
+   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
+
+   if (meta->texcoord_vbo == NULL) {
+      GLuint vbo_name;
+
+      _mesa_GenBuffersARB(1, &vbo_name);
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, vbo_name);
+      _mesa_BufferDataARB(GL_ARRAY_BUFFER_ARB, sizeof(default_texcoords),
+			  default_texcoords, GL_STATIC_DRAW_ARB);
+      _mesa_reference_buffer_object(ctx, &meta->texcoord_vbo,
+				    ctx->Array.ArrayBufferObj);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  meta->texcoord_vbo->Name);
+   }
+   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), NULL);
+
+   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+}
+
+void
+meta_restore_texcoords(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+
+   /* Restore the old TexCoordPointer */
+   if (meta->saved_texcoord_vbo) {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  meta->saved_texcoord_vbo->Name);
+      _mesa_reference_buffer_object(ctx, &meta->saved_texcoord_vbo, NULL);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   }
+
+   _mesa_TexCoordPointer(meta->saved_texcoord_size,
+			 meta->saved_texcoord_type,
+			 meta->saved_texcoord_stride,
+			 meta->saved_texcoord_ptr);
+   if (!meta->saved_texcoord_enable)
+      _mesa_Disable(GL_TEXTURE_COORD_ARRAY);
+
+   _mesa_ClientActiveTextureARB(GL_TEXTURE0 +
+				meta->saved_active_texture);
+
+   if (meta->saved_array_vbo) {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB,
+			  meta->saved_array_vbo->Name);
+      _mesa_reference_buffer_object(ctx, &meta->saved_array_vbo, NULL);
+   } else {
+      _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   }
+}
+
+
+/**
+ * Perform glClear where mask contains only color, depth, and/or stencil.
+ *
+ * The implementation is based on calling into Mesa to set GL state and
+ * performing normal triangle rendering.  The intent of this path is to
+ * have as generic a path as possible, so that any driver could make use of
+ * it.
+ */
+
+/**
+ * Per-context one-time init of things for intl_clear_tris().
+ * Basically set up a private array object for vertex/color arrays.
+ */
+static void
+meta_init_clear(struct dri_metaops *meta)
+{
+   GLcontext *ctx = meta->ctx;
+   struct gl_array_object *arraySave = NULL;
+   const GLuint arrayBuffer = ctx->Array.ArrayBufferObj->Name;
+   const GLuint elementBuffer = ctx->Array.ElementArrayBufferObj->Name;
+
+   /* create new array object */
+   meta->clear.arrayObj = _mesa_new_array_object(ctx, ~0);
+
+   /* save current array object, bind new one */
+   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
+   ctx->NewState |= _NEW_ARRAY;
+   ctx->Array.NewState |= _NEW_ARRAY_ALL;
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, meta->clear.arrayObj);
+
+   /* one-time setup of vertex arrays (pos, color) */
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
+   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, 0);
+   _mesa_ColorPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), meta->clear.color);
+   _mesa_VertexPointer(3, GL_FLOAT, 3 * sizeof(GLfloat), meta->clear.vertices);
+   _mesa_Enable(GL_COLOR_ARRAY);
+   _mesa_Enable(GL_VERTEX_ARRAY);
+
+   /* restore original array object */
+   ctx->NewState |= _NEW_ARRAY;
+   ctx->Array.NewState |= _NEW_ARRAY_ALL;
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
+   _mesa_reference_array_object(ctx, &arraySave, NULL);
+
+   /* restore original buffer objects */
+   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, arrayBuffer);
+   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, elementBuffer);
+}
+
+
+
+/**
+ * Perform glClear where mask contains only color, depth, and/or stencil.
+ *
+ * The implementation is based on calling into Mesa to set GL state and
+ * performing normal triangle rendering.  The intent of this path is to
+ * have as generic a path as possible, so that any driver could make use of
+ * it.
+ */
+void
+meta_clear_tris(struct dri_metaops *meta, GLbitfield mask)
+{
+   GLcontext *ctx = meta->ctx;
+   GLfloat dst_z;
+   struct gl_framebuffer *fb = ctx->DrawBuffer;
+   int i;
+   GLboolean saved_fp_enable = GL_FALSE, saved_vp_enable = GL_FALSE;
+   GLuint saved_shader_program = 0;
+   unsigned int saved_active_texture;
+   struct gl_array_object *arraySave = NULL;
+
+   if (!meta->clear.arrayObj)
+      meta_init_clear(meta);
+
+   assert((mask & ~(TRI_CLEAR_COLOR_BITS | BUFFER_BIT_DEPTH |
+		    BUFFER_BIT_STENCIL)) == 0);
+
+   _mesa_PushAttrib(GL_COLOR_BUFFER_BIT |
+		    GL_DEPTH_BUFFER_BIT |
+		    GL_ENABLE_BIT |
+		    GL_POLYGON_BIT |
+		    GL_STENCIL_BUFFER_BIT |
+		    GL_TRANSFORM_BIT |
+		    GL_CURRENT_BIT |
+		    GL_VIEWPORT_BIT);
+   saved_active_texture = ctx->Texture.CurrentUnit;
+
+   /* Disable existing GL state we don't want to apply to a clear. */
+   _mesa_Disable(GL_ALPHA_TEST);
+   _mesa_Disable(GL_BLEND);
+   _mesa_Disable(GL_CULL_FACE);
+   _mesa_Disable(GL_FOG);
+   _mesa_Disable(GL_POLYGON_SMOOTH);
+   _mesa_Disable(GL_POLYGON_STIPPLE);
+   _mesa_Disable(GL_POLYGON_OFFSET_FILL);
+   _mesa_Disable(GL_LIGHTING);
+   _mesa_Disable(GL_CLIP_PLANE0);
+   _mesa_Disable(GL_CLIP_PLANE1);
+   _mesa_Disable(GL_CLIP_PLANE2);
+   _mesa_Disable(GL_CLIP_PLANE3);
+   _mesa_Disable(GL_CLIP_PLANE4);
+   _mesa_Disable(GL_CLIP_PLANE5);
+   _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
+   if (ctx->Extensions.ARB_fragment_program && ctx->FragmentProgram.Enabled) {
+      saved_fp_enable = GL_TRUE;
+      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
+   }
+   if (ctx->Extensions.ARB_vertex_program && ctx->VertexProgram.Enabled) {
+      saved_vp_enable = GL_TRUE;
+      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
+   }
+   if (ctx->Extensions.ARB_shader_objects && ctx->Shader.CurrentProgram) {
+      saved_shader_program = ctx->Shader.CurrentProgram->Name;
+      _mesa_UseProgramObjectARB(0);
+   }
+
+   if (ctx->Texture._EnabledUnits != 0) {
+      int i;
+
+      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+	 _mesa_ActiveTextureARB(GL_TEXTURE0 + i);
+	 _mesa_Disable(GL_TEXTURE_1D);
+	 _mesa_Disable(GL_TEXTURE_2D);
+	 _mesa_Disable(GL_TEXTURE_3D);
+	 if (ctx->Extensions.ARB_texture_cube_map)
+	    _mesa_Disable(GL_TEXTURE_CUBE_MAP_ARB);
+	 if (ctx->Extensions.NV_texture_rectangle)
+	    _mesa_Disable(GL_TEXTURE_RECTANGLE_NV);
+	 if (ctx->Extensions.MESA_texture_array) {
+	    _mesa_Disable(GL_TEXTURE_1D_ARRAY_EXT);
+	    _mesa_Disable(GL_TEXTURE_2D_ARRAY_EXT);
+	 }
+      }
+   }
+
+   /* save current array object, bind our private one */
+   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
+   ctx->NewState |= _NEW_ARRAY;
+   ctx->Array.NewState |= _NEW_ARRAY_ALL;
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, meta->clear.arrayObj);
+
+   meta_set_passthrough_transform(meta);
+
+   for (i = 0; i < 4; i++) {
+      COPY_4FV(meta->clear.color[i], ctx->Color.ClearColor);
+   }
+
+   /* convert clear Z from [0,1] to NDC coord in [-1,1] */
+   dst_z = -1.0 + 2.0 * ctx->Depth.Clear;
+
+   /* The ClearDepth value is unaffected by DepthRange, so do a default
+    * mapping.
+    */
+   _mesa_DepthRange(0.0, 1.0);
+
+   /* Prepare the vertices, which are the same regardless of which buffer we're
+    * drawing to.
+    */
+   meta->clear.vertices[0][0] = fb->_Xmin;
+   meta->clear.vertices[0][1] = fb->_Ymin;
+   meta->clear.vertices[0][2] = dst_z;
+   meta->clear.vertices[1][0] = fb->_Xmax;
+   meta->clear.vertices[1][1] = fb->_Ymin;
+   meta->clear.vertices[1][2] = dst_z;
+   meta->clear.vertices[2][0] = fb->_Xmax;
+   meta->clear.vertices[2][1] = fb->_Ymax;
+   meta->clear.vertices[2][2] = dst_z;
+   meta->clear.vertices[3][0] = fb->_Xmin;
+   meta->clear.vertices[3][1] = fb->_Ymax;
+   meta->clear.vertices[3][2] = dst_z;
+
+   while (mask != 0) {
+      GLuint this_mask = 0;
+      GLuint color_bit;
+
+      color_bit = _mesa_ffs(mask & TRI_CLEAR_COLOR_BITS);
+      if (color_bit != 0)
+	 this_mask |= (1 << (color_bit - 1));
+
+      /* Clear depth/stencil in the same pass as color. */
+      this_mask |= (mask & (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL));
+
+      /* Select the current color buffer and use the color write mask if
+       * we have one, otherwise don't write any color channels.
+       */
+      if (this_mask & BUFFER_BIT_FRONT_LEFT)
+	 _mesa_DrawBuffer(GL_FRONT_LEFT);
+      else if (this_mask & BUFFER_BIT_BACK_LEFT)
+	 _mesa_DrawBuffer(GL_BACK_LEFT);
+      else if (color_bit != 0)
+	 _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0 +
+			  (color_bit - BUFFER_COLOR0 - 1));
+      else
+	 _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
+
+      /* Control writing of the depth clear value to depth. */
+      if (this_mask & BUFFER_BIT_DEPTH) {
+	 _mesa_DepthFunc(GL_ALWAYS);
+	 _mesa_Enable(GL_DEPTH_TEST);
+      } else {
+	 _mesa_Disable(GL_DEPTH_TEST);
+	 _mesa_DepthMask(GL_FALSE);
+      }
+
+      /* Control writing of the stencil clear value to stencil. */
+      if (this_mask & BUFFER_BIT_STENCIL) {
+	 _mesa_Enable(GL_STENCIL_TEST);
+	 _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
+				 GL_REPLACE, GL_REPLACE, GL_REPLACE);
+	 _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
+				   ctx->Stencil.Clear & 0x7fffffff,
+				   ctx->Stencil.WriteMask[0]);
+      } else {
+	 _mesa_Disable(GL_STENCIL_TEST);
+      }
+
+      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+      mask &= ~this_mask;
+   }
+
+   meta_restore_transform(meta);
+
+   _mesa_ActiveTextureARB(GL_TEXTURE0 + saved_active_texture);
+   if (saved_fp_enable)
+      _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
+   if (saved_vp_enable)
+      _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
+
+   if (saved_shader_program)
+      _mesa_UseProgramObjectARB(saved_shader_program);
+
+   _mesa_PopAttrib();
+
+   /* restore current array object */
+   ctx->NewState |= _NEW_ARRAY;
+   ctx->Array.NewState |= _NEW_ARRAY_ALL;
+   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
+   _mesa_reference_array_object(ctx, &arraySave, NULL);
+}
+
+void meta_init_metaops(GLcontext *ctx, struct dri_metaops *meta)
+{
+   meta->ctx = ctx;
+}
+
+void meta_destroy_metaops(struct dri_metaops *meta)
+{
+  if (meta->clear.arrayObj)
+    _mesa_delete_array_object(meta->ctx, meta->clear.arrayObj);
+
+}
diff --git a/src/mesa/drivers/dri/common/dri_metaops.h b/src/mesa/drivers/dri/common/dri_metaops.h
new file mode 100644
index 00000000000..bb4079d5353
--- /dev/null
+++ b/src/mesa/drivers/dri/common/dri_metaops.h
@@ -0,0 +1,99 @@
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Intel Corporation.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef DRI_METAOPS_H
+#define DRI_METAOPS_H
+
+#define TRI_CLEAR_COLOR_BITS (BUFFER_BIT_BACK_LEFT |			\
+			      BUFFER_BIT_FRONT_LEFT |			\
+			      BUFFER_BIT_COLOR0 |			\
+			      BUFFER_BIT_COLOR1 |			\
+			      BUFFER_BIT_COLOR2 |			\
+			      BUFFER_BIT_COLOR3 |			\
+			      BUFFER_BIT_COLOR4 |			\
+			      BUFFER_BIT_COLOR5 |			\
+			      BUFFER_BIT_COLOR6 |			\
+			      BUFFER_BIT_COLOR7)
+
+struct dri_meta_clear {
+    struct gl_array_object *arrayObj;
+    GLfloat vertices[4][3];
+    GLfloat color[4][4];
+};
+
+struct dri_metaops {
+    GLcontext *ctx;
+    GLboolean internal_viewport_call;
+    struct gl_fragment_program *bitmap_fp;
+    struct gl_vertex_program *passthrough_vp;
+    struct gl_buffer_object *texcoord_vbo;
+    
+    struct gl_fragment_program *saved_fp;
+    GLboolean saved_fp_enable;
+    struct gl_vertex_program *saved_vp;
+    GLboolean saved_vp_enable;
+
+    struct gl_fragment_program *tex2d_fp;
+    
+    GLboolean saved_texcoord_enable;
+    struct gl_buffer_object *saved_array_vbo, *saved_texcoord_vbo;
+    GLenum saved_texcoord_type;
+    GLsizei saved_texcoord_size, saved_texcoord_stride;
+    const void *saved_texcoord_ptr;
+    int saved_active_texture;
+
+    GLint saved_vp_x, saved_vp_y;
+    GLsizei saved_vp_width, saved_vp_height;
+    GLenum saved_matrix_mode;
+    
+    struct dri_meta_clear clear;
+};
+
+
+void meta_set_passthrough_transform(struct dri_metaops *meta);
+
+void meta_restore_transform(struct dri_metaops *meta);
+
+void meta_set_passthrough_vertex_program(struct dri_metaops *meta);
+
+void meta_restore_vertex_program(struct dri_metaops *meta);
+
+void meta_set_fragment_program(struct dri_metaops *meta,
+			  struct gl_fragment_program **prog,
+			  const char *prog_string);
+
+void meta_restore_fragment_program(struct dri_metaops *meta);
+
+void meta_set_default_texrect(struct dri_metaops *meta);
+
+void meta_restore_texcoords(struct dri_metaops *meta);
+void meta_clear_tris(struct dri_metaops *meta, GLbitfield mask);
+
+void meta_init_metaops(GLcontext *ctx, struct dri_metaops *meta);
+void meta_destroy_metaops(struct dri_metaops *meta);
+#endif
+
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index 576494940f3..1d940603fa8 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -1,4 +1,3 @@
-/* $XFree86: xc/lib/GL/dri/dri_util.c,v 1.7 2003/04/28 17:01:25 dawes Exp $ */
 /**
  * \file dri_util.c
  * DRI utility functions.
@@ -37,6 +36,9 @@
 typedef GLboolean ( * PFNGLXGETMSCRATEOMLPROC) (__DRIdrawable *drawable, int32_t *numerator, int32_t *denominator);
 #endif
 
+static void dri_get_drawable(__DRIdrawable *pdp);
+static void dri_put_drawable(__DRIdrawable *pdp);
+
 /**
  * This is just a token extension used to signal that the driver
  * supports setting a read drawable.
@@ -130,7 +132,7 @@ static int driUnbindContext(__DRIcontext *pcp)
 	return GL_FALSE;
     }
 
-    pdp->refcount--;
+    dri_put_drawable(pdp);
 
     if (prp != pdp) {
         if (prp->refcount == 0) {
@@ -138,7 +140,7 @@ static int driUnbindContext(__DRIcontext *pcp)
 	    return GL_FALSE;
 	}
 
-	prp->refcount--;
+    	dri_put_drawable(prp);
     }
 
 
@@ -174,10 +176,10 @@ static int driBindContext(__DRIcontext *pcp,
 	pcp->driReadablePriv = prp;
 	if (pdp) {
 	    pdp->driContextPriv = pcp;
-	    pdp->refcount++;
+    	    dri_get_drawable(pdp);
 	}
 	if ( prp && pdp != prp ) {
-	    prp->refcount++;
+    	    dri_get_drawable(prp);
 	}
     }
 
@@ -434,7 +436,7 @@ driCreateNewDrawable(__DRIscreen *psp, const __DRIconfig *config,
 
     pdp->loaderPrivate = data;
     pdp->hHWDrawable = hwDrawable;
-    pdp->refcount = 0;
+    pdp->refcount = 1;
     pdp->pStamp = NULL;
     pdp->lastStamp = 0;
     pdp->index = 0;
@@ -487,12 +489,19 @@ dri2CreateNewDrawable(__DRIscreen *screen,
     return pdraw;
 }
 
-
-static void
-driDestroyDrawable(__DRIdrawable *pdp)
+static void dri_get_drawable(__DRIdrawable *pdp)
+{
+    pdp->refcount++;
+}
+	
+static void dri_put_drawable(__DRIdrawable *pdp)
 {
     __DRIscreenPrivate *psp;
 
+    pdp->refcount--;
+    if (pdp->refcount)
+	return;
+
     if (pdp) {
 	psp = pdp->driScreenPriv;
         (*psp->DriverAPI.DestroyBuffer)(pdp);
@@ -508,6 +517,12 @@ driDestroyDrawable(__DRIdrawable *pdp)
     }
 }
 
+static void
+driDestroyDrawable(__DRIdrawable *pdp)
+{
+    dri_put_drawable(pdp);
+}
+
 /*@}*/
 
 
diff --git a/src/mesa/drivers/dri/common/extension_helper.h b/src/mesa/drivers/dri/common/extension_helper.h
index 8dcaaee3079..e308fd28311 100644
--- a/src/mesa/drivers/dri/common/extension_helper.h
+++ b/src/mesa/drivers/dri/common/extension_helper.h
@@ -406,9 +406,10 @@ static const char UniformMatrix4fvARB_names[] =
     "";
 #endif
 
-#if defined(need_GL_APPLE_vertex_array_object)
+#if defined(need_GL_ARB_vertex_array_object) || defined(need_GL_APPLE_vertex_array_object)
 static const char DeleteVertexArraysAPPLE_names[] = 
     "ip\0" /* Parameter signature */
+    "glDeleteVertexArrays\0"
     "glDeleteVertexArraysAPPLE\0"
     "";
 #endif
@@ -941,6 +942,13 @@ static const char WeightivARB_names[] =
     "";
 #endif
 
+#if defined(need_GL_SGIX_instruments)
+static const char PollInstrumentsSGIX_names[] = 
+    "p\0" /* Parameter signature */
+    "glPollInstrumentsSGIX\0"
+    "";
+#endif
+
 #if defined(need_GL_SUN_global_alpha)
 static const char GlobalAlphaFactordSUN_names[] = 
     "d\0" /* Parameter signature */
@@ -1763,6 +1771,13 @@ static const char DeleteFencesNV_names[] =
     "";
 #endif
 
+#if defined(need_GL_SGIX_polynomial_ffd)
+static const char DeformationMap3dSGIX_names[] = 
+    "iddiiddiiddiip\0" /* Parameter signature */
+    "glDeformationMap3dSGIX\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0)
 static const char IsShader_names[] = 
     "i\0" /* Parameter signature */
@@ -2107,6 +2122,13 @@ static const char Tangent3fvEXT_names[] =
     "";
 #endif
 
+#if defined(need_GL_ARB_vertex_array_object)
+static const char GenVertexArrays_names[] = 
+    "ip\0" /* Parameter signature */
+    "glGenVertexArrays\0"
+    "";
+#endif
+
 #if defined(need_GL_ARB_framebuffer_object) || defined(need_GL_EXT_framebuffer_object)
 static const char BindFramebufferEXT_names[] = 
     "ii\0" /* Parameter signature */
@@ -2333,10 +2355,10 @@ static const char GetCombinerStageParameterfvNV_names[] =
     "";
 #endif
 
-#if defined(need_GL_EXT_coordinate_frame)
-static const char Binormal3fEXT_names[] = 
-    "fff\0" /* Parameter signature */
-    "glBinormal3fEXT\0"
+#if defined(need_GL_ARB_vertex_array_object)
+static const char BindVertexArray_names[] = 
+    "i\0" /* Parameter signature */
+    "glBindVertexArray\0"
     "";
 #endif
 
@@ -2766,10 +2788,10 @@ static const char Uniform4fARB_names[] =
     "";
 #endif
 
-#if defined(need_GL_IBM_multimode_draw_arrays)
-static const char MultiModeDrawArraysIBM_names[] = 
-    "pppii\0" /* Parameter signature */
-    "glMultiModeDrawArraysIBM\0"
+#if defined(need_GL_ARB_map_buffer_range)
+static const char FlushMappedBufferRange_names[] = 
+    "iii\0" /* Parameter signature */
+    "glFlushMappedBufferRange\0"
     "";
 #endif
 
@@ -3396,10 +3418,11 @@ static const char GetProgramParameterdvNV_names[] =
     "";
 #endif
 
-#if defined(need_GL_SGIX_instruments)
-static const char PollInstrumentsSGIX_names[] = 
-    "p\0" /* Parameter signature */
-    "glPollInstrumentsSGIX\0"
+#if defined(need_GL_ARB_vertex_array_object) || defined(need_GL_APPLE_vertex_array_object)
+static const char IsVertexArrayAPPLE_names[] = 
+    "i\0" /* Parameter signature */
+    "glIsVertexArray\0"
+    "glIsVertexArrayAPPLE\0"
     "";
 #endif
 
@@ -3897,6 +3920,13 @@ static const char VertexAttribs4dvNV_names[] =
     "";
 #endif
 
+#if defined(need_GL_IBM_multimode_draw_arrays)
+static const char MultiModeDrawArraysIBM_names[] = 
+    "pppii\0" /* Parameter signature */
+    "glMultiModeDrawArraysIBM\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_2_0) || defined(need_GL_ARB_vertex_program)
 static const char VertexAttrib4dARB_names[] = 
     "idddd\0" /* Parameter signature */
@@ -3926,6 +3956,13 @@ static const char VertexWeightfEXT_names[] =
     "";
 #endif
 
+#if defined(need_GL_EXT_coordinate_frame)
+static const char Binormal3fEXT_names[] = 
+    "fff\0" /* Parameter signature */
+    "glBinormal3fEXT\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
 static const char FogCoordfvEXT_names[] = 
     "p\0" /* Parameter signature */
@@ -4041,10 +4078,10 @@ static const char BlendFuncSeparateEXT_names[] =
     "";
 #endif
 
-#if defined(need_GL_APPLE_vertex_array_object)
-static const char IsVertexArrayAPPLE_names[] = 
-    "i\0" /* Parameter signature */
-    "glIsVertexArrayAPPLE\0"
+#if defined(need_GL_ARB_map_buffer_range)
+static const char MapBufferRange_names[] = 
+    "iiii\0" /* Parameter signature */
+    "glMapBufferRange\0"
     "";
 #endif
 
@@ -4289,6 +4326,13 @@ static const char SpriteParameterivSGIX_names[] =
     "";
 #endif
 
+#if defined(need_GL_EXT_provoking_vertex)
+static const char ProvokingVertexEXT_names[] = 
+    "i\0" /* Parameter signature */
+    "glProvokingVertexEXT\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_3)
 static const char MultiTexCoord1fARB_names[] = 
     "if\0" /* Parameter signature */
@@ -4396,6 +4440,13 @@ static const char WindowPos3ivMESA_names[] =
     "";
 #endif
 
+#if defined(need_GL_ARB_copy_buffer)
+static const char CopyBufferSubData_names[] = 
+    "iiiii\0" /* Parameter signature */
+    "glCopyBufferSubData\0"
+    "";
+#endif
+
 #if defined(need_GL_VERSION_1_5) || defined(need_GL_ARB_vertex_buffer_object)
 static const char IsBufferARB_names[] = 
     "i\0" /* Parameter signature */
@@ -4560,13 +4611,6 @@ static const char Minmax_names[] =
     "";
 #endif
 
-#if defined(need_GL_SGIX_polynomial_ffd)
-static const char DeformationMap3dSGIX_names[] = 
-    "iddiiddiiddiip\0" /* Parameter signature */
-    "glDeformationMap3dSGIX\0"
-    "";
-#endif
-
 #if defined(need_GL_VERSION_1_4) || defined(need_GL_EXT_fog_coord)
 static const char FogCoorddvEXT_names[] = 
     "p\0" /* Parameter signature */
@@ -4933,8 +4977,15 @@ static const struct dri_extension_function GL_3DFX_tbuffer_functions[] = {
 static const struct dri_extension_function GL_APPLE_vertex_array_object_functions[] = {
     { DeleteVertexArraysAPPLE_names, DeleteVertexArraysAPPLE_remap_index, -1 },
     { GenVertexArraysAPPLE_names, GenVertexArraysAPPLE_remap_index, -1 },
-    { BindVertexArrayAPPLE_names, BindVertexArrayAPPLE_remap_index, -1 },
     { IsVertexArrayAPPLE_names, IsVertexArrayAPPLE_remap_index, -1 },
+    { BindVertexArrayAPPLE_names, BindVertexArrayAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
+#if defined(need_GL_ARB_copy_buffer)
+static const struct dri_extension_function GL_ARB_copy_buffer_functions[] = {
+    { CopyBufferSubData_names, CopyBufferSubData_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -4972,6 +5023,14 @@ static const struct dri_extension_function GL_ARB_framebuffer_object_functions[]
 };
 #endif
 
+#if defined(need_GL_ARB_map_buffer_range)
+static const struct dri_extension_function GL_ARB_map_buffer_range_functions[] = {
+    { FlushMappedBufferRange_names, FlushMappedBufferRange_remap_index, -1 },
+    { MapBufferRange_names, MapBufferRange_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_ARB_matrix_palette)
 static const struct dri_extension_function GL_ARB_matrix_palette_functions[] = {
     { MatrixIndexusvARB_names, MatrixIndexusvARB_remap_index, -1 },
@@ -5080,6 +5139,16 @@ static const struct dri_extension_function GL_ARB_transpose_matrix_functions[] =
 };
 #endif
 
+#if defined(need_GL_ARB_vertex_array_object)
+static const struct dri_extension_function GL_ARB_vertex_array_object_functions[] = {
+    { DeleteVertexArraysAPPLE_names, DeleteVertexArraysAPPLE_remap_index, -1 },
+    { GenVertexArrays_names, GenVertexArrays_remap_index, -1 },
+    { BindVertexArray_names, BindVertexArray_remap_index, -1 },
+    { IsVertexArrayAPPLE_names, IsVertexArrayAPPLE_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_ARB_vertex_blend)
 static const struct dri_extension_function GL_ARB_vertex_blend_functions[] = {
     { WeightubvARB_names, WeightubvARB_remap_index, -1 },
@@ -5333,7 +5402,6 @@ static const struct dri_extension_function GL_EXT_coordinate_frame_functions[] =
     { Binormal3ivEXT_names, Binormal3ivEXT_remap_index, -1 },
     { Tangent3sEXT_names, Tangent3sEXT_remap_index, -1 },
     { Tangent3fvEXT_names, Tangent3fvEXT_remap_index, -1 },
-    { Binormal3fEXT_names, Binormal3fEXT_remap_index, -1 },
     { Tangent3dvEXT_names, Tangent3dvEXT_remap_index, -1 },
     { Binormal3bvEXT_names, Binormal3bvEXT_remap_index, -1 },
     { Binormal3dEXT_names, Binormal3dEXT_remap_index, -1 },
@@ -5342,6 +5410,7 @@ static const struct dri_extension_function GL_EXT_coordinate_frame_functions[] =
     { Tangent3ivEXT_names, Tangent3ivEXT_remap_index, -1 },
     { Tangent3dEXT_names, Tangent3dEXT_remap_index, -1 },
     { Binormal3svEXT_names, Binormal3svEXT_remap_index, -1 },
+    { Binormal3fEXT_names, Binormal3fEXT_remap_index, -1 },
     { Binormal3dvEXT_names, Binormal3dvEXT_remap_index, -1 },
     { Tangent3iEXT_names, Tangent3iEXT_remap_index, -1 },
     { Tangent3bvEXT_names, Tangent3bvEXT_remap_index, -1 },
@@ -5527,6 +5596,13 @@ static const struct dri_extension_function GL_EXT_polygon_offset_functions[] = {
 };
 #endif
 
+#if defined(need_GL_EXT_provoking_vertex)
+static const struct dri_extension_function GL_EXT_provoking_vertex_functions[] = {
+    { ProvokingVertexEXT_names, ProvokingVertexEXT_remap_index, -1 },
+    { NULL, 0, 0 }
+};
+#endif
+
 #if defined(need_GL_EXT_secondary_color)
 static const struct dri_extension_function GL_EXT_secondary_color_functions[] = {
     { SecondaryColor3iEXT_names, SecondaryColor3iEXT_remap_index, -1 },
@@ -6024,11 +6100,11 @@ static const struct dri_extension_function GL_SGIX_igloo_interface_functions[] =
 #if defined(need_GL_SGIX_instruments)
 static const struct dri_extension_function GL_SGIX_instruments_functions[] = {
     { ReadInstrumentsSGIX_names, ReadInstrumentsSGIX_remap_index, -1 },
+    { PollInstrumentsSGIX_names, PollInstrumentsSGIX_remap_index, -1 },
     { GetInstrumentsSGIX_names, GetInstrumentsSGIX_remap_index, -1 },
     { StartInstrumentsSGIX_names, StartInstrumentsSGIX_remap_index, -1 },
     { StopInstrumentsSGIX_names, StopInstrumentsSGIX_remap_index, -1 },
     { InstrumentsBufferSGIX_names, InstrumentsBufferSGIX_remap_index, -1 },
-    { PollInstrumentsSGIX_names, PollInstrumentsSGIX_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
@@ -6055,9 +6131,9 @@ static const struct dri_extension_function GL_SGIX_pixel_texture_functions[] = {
 #if defined(need_GL_SGIX_polynomial_ffd)
 static const struct dri_extension_function GL_SGIX_polynomial_ffd_functions[] = {
     { LoadIdentityDeformationMapSGIX_names, LoadIdentityDeformationMapSGIX_remap_index, -1 },
+    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { DeformSGIX_names, DeformSGIX_remap_index, -1 },
     { DeformationMap3fSGIX_names, DeformationMap3fSGIX_remap_index, -1 },
-    { DeformationMap3dSGIX_names, DeformationMap3dSGIX_remap_index, -1 },
     { NULL, 0, 0 }
 };
 #endif
diff --git a/src/mesa/drivers/dri/common/xmlpool.h b/src/mesa/drivers/dri/common/xmlpool.h
index 7fbc6e800d0..587517ea10a 100644
--- a/src/mesa/drivers/dri/common/xmlpool.h
+++ b/src/mesa/drivers/dri/common/xmlpool.h
@@ -60,6 +60,10 @@
 #define DRI_CONF_OPT_BEGIN(name,type,def) \
 "<option name=\""#name"\" type=\""#type"\" default=\""#def"\">\n"
 
+/** \brief Begin an option definition with qouted default value */
+#define DRI_CONF_OPT_BEGIN_Q(name,type,def) \
+"<option name=\""#name"\" type=\""#type"\" default="#def">\n"
+
 /** \brief Begin an option definition with restrictions on valid values */
 #define DRI_CONF_OPT_BEGIN_V(name,type,def,valid) \
 "<option name=\""#name"\" type=\""#type"\" default=\""#def"\" valid=\""valid"\">\n"
diff --git a/src/mesa/drivers/dri/fb/fb_egl.c b/src/mesa/drivers/dri/fb/fb_egl.c
index 35c268441c3..dee67feb5ac 100644
--- a/src/mesa/drivers/dri/fb/fb_egl.c
+++ b/src/mesa/drivers/dri/fb/fb_egl.c
@@ -472,8 +472,8 @@ fbCreateContext(_EGLDriver *drv, EGLDisplay dpy, EGLConfig config, EGLContext sh
    c->Base.DrawSurface = EGL_NO_SURFACE;
    c->Base.ReadSurface = EGL_NO_SURFACE;
 
-   /* generate handle and insert into hash table */
-   _eglSaveContext(&c->Base);
+   /* link to display */
+   _eglLinkContext(&c->Base, disp);
    assert(c->Base.Handle);
 
    /* Init default driver functions then plug in our FBdev-specific functions
@@ -604,13 +604,9 @@ static EGLBoolean
 fbDestroySurface(_EGLDriver *drv, EGLDisplay dpy, EGLSurface surface)
 {
    fbSurface *fs = Lookup_fbSurface(surface);
-   _eglRemoveSurface(&fs->Base);
-   if (fs->Base.IsBound) {
-      fs->Base.DeletePending = EGL_TRUE;
-   }
-   else {
+   _eglUnlinkSurface(&fs->Base);
+   if (!fs->Base.IsBound)
       free(fs);
-   }
    return EGL_TRUE;
 }
 
@@ -619,13 +615,9 @@ static EGLBoolean
 fbDestroyContext(_EGLDriver *drv, EGLDisplay dpy, EGLContext context)
 {
    fbContext *fc = Lookup_fbContext(context);
-   _eglRemoveContext(&fc->Base);
-   if (fc->Base.IsBound) {
-      fc->Base.DeletePending = EGL_TRUE;
-   }
-   else {
+   _eglUnlinkContext(&fc->Base);
+   if (!fc->Base.IsBound)
       free(fc);
-   }
    return EGL_TRUE;
 }
 
@@ -688,7 +680,7 @@ fbCreateScreenSurfaceMESA(_EGLDriver *drv, EGLDisplay dpy, EGLConfig cfg,
    surface->mesa_framebuffer = _mesa_create_framebuffer(&vis);
    if (!surface->mesa_framebuffer) {
       free(surface);
-      _eglRemoveSurface(&surface->Base);
+      _eglUnlinkSurface(&surface->Base);
       return EGL_NO_SURFACE;
    }
 
diff --git a/src/mesa/drivers/dri/i915/Makefile b/src/mesa/drivers/dri/i915/Makefile
index 9f4bd1699f9..beaf9a4b129 100644
--- a/src/mesa/drivers/dri/i915/Makefile
+++ b/src/mesa/drivers/dri/i915/Makefile
@@ -19,6 +19,7 @@ DRIVER_SOURCES = \
 	intel_batchbuffer.c \
 	intel_clear.c \
 	intel_extensions.c \
+	intel_generatemipmap.c \
 	intel_mipmap_tree.c \
 	intel_tex_layout.c \
 	intel_tex_image.c \
diff --git a/src/mesa/drivers/dri/i915/i830_reg.h b/src/mesa/drivers/dri/i915/i830_reg.h
index d210c2d08e4..db16871001d 100644
--- a/src/mesa/drivers/dri/i915/i830_reg.h
+++ b/src/mesa/drivers/dri/i915/i830_reg.h
@@ -48,19 +48,6 @@
 #define AA_LINE_ENABLE			((1<<1) | 1)
 #define AA_LINE_DISABLE			(1<<1)
 
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 #define _3DSTATE_COLOR_FACTOR_CMD	(CMD_3D | (0x1d<<24) | (0x1<<16))
 
 #define _3DSTATE_COLOR_FACTOR_N_CMD(stage)	(CMD_3D | (0x1d<<24) | \
diff --git a/src/mesa/drivers/dri/i915/i830_texstate.c b/src/mesa/drivers/dri/i915/i830_texstate.c
index 753c25b57ed..6f998fa6f77 100644
--- a/src/mesa/drivers/dri/i915/i830_texstate.c
+++ b/src/mesa/drivers/dri/i915/i830_texstate.c
@@ -174,14 +174,16 @@ i830_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    state[I830_TEXREG_TM0LI] = (_3DSTATE_LOAD_STATE_IMMEDIATE_2 |
                                (LOAD_TEXTURE_MAP0 << unit) | 4);
 
-/*    state[I830_TEXREG_TM0S0] = (TM0S0_USE_FENCE | */
-/* 			       t->intel.TextureOffset); */
-
-
    state[I830_TEXREG_TM0S1] =
       (((firstImage->Height - 1) << TM0S1_HEIGHT_SHIFT) |
        ((firstImage->Width - 1) << TM0S1_WIDTH_SHIFT) | format);
 
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I830_TEXREG_TM0S1] |= TM0S1_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I830_TEXREG_TM0S1] |= TM0S1_TILE_WALK;
+   }
+
    state[I830_TEXREG_TM0S2] =
       ((((pitch / 4) - 1) << TM0S2_PITCH_SHIFT) | TM0S2_CUBE_FACE_ENA_MASK);
 
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 3bf02de61f8..9c6f891dd32 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -552,7 +552,7 @@ i830_emit_state(struct intel_context *intel)
          if (state->tex_buffer[i]) {
             OUT_RELOC(state->tex_buffer[i],
 		      I915_GEM_DOMAIN_SAMPLER, 0,
-                      state->tex_offset[i] | TM0S0_USE_FENCE);
+                      state->tex_offset[i]);
          }
 	 else if (state == &i830->meta) {
 	    assert(i == 0);
@@ -634,21 +634,11 @@ i830_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I830_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I830_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I830_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I830_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I830_DESTREG_DV1 value
@@ -718,26 +708,6 @@ i830_set_draw_region(struct intel_context *intel,
    i830_state_draw_region(intel, &i830->state, color_regions[0], depth_region);
 }
 
-#if 0
-static void
-i830_update_color_z_regions(intelContextPtr intel,
-                            const intelRegion * colorRegion,
-                            const intelRegion * depthRegion)
-{
-   i830ContextPtr i830 = I830_CONTEXT(intel);
-
-   i830->state.Buffer[I830_DESTREG_CBUFADDR1] =
-      (BUF_3D_ID_COLOR_BACK | BUF_3D_PITCH(colorRegion->pitch) |
-       BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_CBUFADDR2] = colorRegion->offset;
-
-   i830->state.Buffer[I830_DESTREG_DBUFADDR1] =
-      (BUF_3D_ID_DEPTH | BUF_3D_PITCH(depthRegion->pitch) | BUF_3D_USE_FENCE);
-   i830->state.Buffer[I830_DESTREG_DBUFADDR2] = depthRegion->offset;
-}
-#endif
-
-
 /* This isn't really handled at the moment.
  */
 static void
@@ -767,12 +737,6 @@ i830_assert_not_dirty( struct intel_context *intel )
    assert(!get_dirty(state));
 }
 
-static void
-i830_note_unlock( struct intel_context *intel )
-{
-    /* nothing */
-}
-
 void
 i830InitVtbl(struct i830_context *i830)
 {
@@ -787,6 +751,5 @@ i830InitVtbl(struct i830_context *i830)
    i830->intel.vtbl.render_start = i830_render_start;
    i830->intel.vtbl.render_prevalidate = i830_render_prevalidate;
    i830->intel.vtbl.assert_not_dirty = i830_assert_not_dirty;
-   i830->intel.vtbl.note_unlock = i830_note_unlock; 
    i830->intel.vtbl.finish_batch = intel_finish_vb;
 }
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index 1f9f363df92..367d2a3b648 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -73,7 +73,7 @@ i915InvalidateState(GLcontext * ctx, GLuint new_state)
          p->params_uptodate = 0;
    }
 
-   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM))
+   if (new_state & (_NEW_FOG | _NEW_HINT | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS))
       i915_update_fog(ctx);
 }
 
diff --git a/src/mesa/drivers/dri/i915/i915_reg.h b/src/mesa/drivers/dri/i915/i915_reg.h
index 8891e11c6fd..84db58ea950 100644
--- a/src/mesa/drivers/dri/i915/i915_reg.h
+++ b/src/mesa/drivers/dri/i915/i915_reg.h
@@ -93,20 +93,6 @@
 
 /* 3DSTATE_BIN_CONTROL p141 */
 
-/* p143 */
-#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
-/* Dword 1 */
-#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
-#define BUF_3D_ID_DEPTH 	(0x7<<24)
-#define BUF_3D_USE_FENCE	(1<<23)
-#define BUF_3D_TILED_SURFACE	(1<<22)
-#define BUF_3D_TILE_WALK_X	0
-#define BUF_3D_TILE_WALK_Y	(1<<21)
-#define BUF_3D_PITCH(x)         (((x)/4)<<2)
-/* Dword 2 */
-#define BUF_3D_ADDR(x)		((x) & ~0x3)
-
-
 /* 3DSTATE_CHROMA_KEY */
 
 /* 3DSTATE_CLEAR_PARAMETERS, p150 */
@@ -155,6 +141,7 @@
 /* p161 */
 #define _3DSTATE_DST_BUF_VARS_CMD	(CMD_3D | (0x1d<<24) | (0x85<<16))
 /* Dword 1 */
+#define CLASSIC_EARLY_DEPTH		(1<<31)
 #define TEX_DEFAULT_COLOR_OGL           (0<<30)
 #define TEX_DEFAULT_COLOR_D3D           (1<<30)
 #define ZR_EARLY_DEPTH                  (1<<29)
diff --git a/src/mesa/drivers/dri/i915/i915_tex_layout.c b/src/mesa/drivers/dri/i915/i915_tex_layout.c
index 7cc1c096e4b..d9588e5b56d 100644
--- a/src/mesa/drivers/dri/i915/i915_tex_layout.c
+++ b/src/mesa/drivers/dri/i915/i915_tex_layout.c
@@ -55,6 +55,17 @@ static GLint step_offsets[6][2] = {
    [FACE_NEG_Z] = {-1, 1},
 };
 
+
+static GLint bottom_offsets[6] = {
+   [FACE_POS_X] = 16 + 0 * 8,
+   [FACE_POS_Y] = 16 + 1 * 8,
+   [FACE_POS_Z] = 16 + 2 * 8,
+   [FACE_NEG_X] = 16 + 3 * 8,
+   [FACE_NEG_Y] = 16 + 4 * 8,
+   [FACE_NEG_Z] = 16 + 5 * 8,
+};
+
+
 /**
  * Cube texture map layout for i830M-GM915.
  *
@@ -101,7 +112,8 @@ static GLint step_offsets[6][2] = {
  */
 static void
 i915_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -111,7 +123,7 @@ i915_miptree_layout_cube(struct intel_context *intel,
    assert(lvlWidth == lvlHeight); /* cubemap images are square */
 
    /* double pitch for cube layouts */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    mt->total_height = dim * 4;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -145,7 +157,8 @@ i915_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i915_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -154,7 +167,7 @@ i915_miptree_layout_3d(struct intel_context *intel,
    GLint level;
 
    /* Calculate the size of a single slice. */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
 
    /* XXX: hardware expects/requires 9 levels at minimum. */
    for (level = mt->first_level; level <= MAX2(8, mt->last_level); level++) {
@@ -189,14 +202,15 @@ i915_miptree_layout_3d(struct intel_context *intel,
 
 static void
 i915_miptree_layout_2d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
    GLuint img_height;
    GLint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    for (level = mt->first_level; level <= mt->last_level; level++) {
@@ -217,19 +231,20 @@ i915_miptree_layout_2d(struct intel_context *intel,
 }
 
 GLboolean
-i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
-      i915_miptree_layout_cube(intel, mt);
+      i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i915_miptree_layout_3d(intel, mt);
+      i915_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i915_miptree_layout_2d(intel, mt);
+      i915_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i915_miptree_layout()");
@@ -297,7 +312,7 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
  * +---+   +---+   +---+   +---+   +---+   +---+
  *
  * The bottom row continues with the remaining 2x2 then the 1x1 mip contents
- * in order, with each of them aligned to a 4x4 block boundary.  Thus, for
+ * in order, with each of them aligned to a 8x8 block boundary.  Thus, for
  * 32x32 cube maps and smaller, the bottom row layout is going to dictate the
  * pitch of the tree.  For a tree with 4x4 images, the pitch is at least
  * 14 * 8 = 112 texels, for 2x2 it is at least 12 * 8 texels, and for 1x1
@@ -306,7 +321,8 @@ i915_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
 
 static void
 i945_miptree_layout_cube(struct intel_context *intel,
-			 struct intel_mipmap_tree * mt)
+			 struct intel_mipmap_tree * mt,
+			 uint32_t tiling)
 {
    const GLuint dim = mt->width0;
    GLuint face;
@@ -320,9 +336,9 @@ i945_miptree_layout_cube(struct intel_context *intel,
     * or the final row of 4x4, 2x2 and 1x1 faces below this.
     */
    if (dim > 32)
-      mt->pitch = intel_miptree_pitch_align (intel, mt, dim * 2);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, dim * 2);
    else
-      mt->pitch = intel_miptree_pitch_align (intel, mt, 14 * 8);
+      mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, 14 * 8);
 
    if (dim >= 4)
       mt->total_height = dim * 4 + 4;
@@ -375,10 +391,11 @@ i945_miptree_layout_cube(struct intel_context *intel,
 	       x = (face - 4) * 8;
 	       break;
 	    }
+	    break;
 
 	 case 2:
 	    y = mt->total_height - 4;
-	    x = 16 + face * 8;
+	    x = bottom_offsets[face];
 	    break;
 
 	 case 1:
@@ -396,7 +413,8 @@ i945_miptree_layout_cube(struct intel_context *intel,
 
 static void
 i945_miptree_layout_3d(struct intel_context *intel,
-		       struct intel_mipmap_tree * mt)
+		       struct intel_mipmap_tree * mt,
+		       uint32_t tiling)
 {
    GLuint width = mt->width0;
    GLuint height = mt->height0;
@@ -405,7 +423,7 @@ i945_miptree_layout_3d(struct intel_context *intel,
    GLuint pack_y_pitch;
    GLuint level;
 
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
    mt->total_height = 0;
 
    pack_y_pitch = MAX2(mt->height0, 2);
@@ -450,22 +468,23 @@ i945_miptree_layout_3d(struct intel_context *intel,
 }
 
 GLboolean
-i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt)
+i945_miptree_layout(struct intel_context *intel, struct intel_mipmap_tree * mt,
+		    uint32_t tiling)
 {
    switch (mt->target) {
    case GL_TEXTURE_CUBE_MAP:
       if (mt->compressed)
-	 i945_miptree_layout_cube(intel, mt);
+	 i945_miptree_layout_cube(intel, mt, tiling);
       else
-	 i915_miptree_layout_cube(intel, mt);
+	 i915_miptree_layout_cube(intel, mt, tiling);
       break;
    case GL_TEXTURE_3D:
-      i945_miptree_layout_3d(intel, mt);
+      i945_miptree_layout_3d(intel, mt, tiling);
       break;
    case GL_TEXTURE_1D:
    case GL_TEXTURE_2D:
    case GL_TEXTURE_RECTANGLE_ARB:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    default:
       _mesa_problem(NULL, "Unexpected tex target in i945_miptree_layout()");
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index a37dd7f4fb5..32d4b30cf9a 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -185,8 +185,13 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
 
    state[I915_TEXREG_MS3] =
       (((firstImage->Height - 1) << MS3_HEIGHT_SHIFT) |
-       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format |
-       MS3_USE_FENCE_REGS);
+       ((firstImage->Width - 1) << MS3_WIDTH_SHIFT) | format);
+
+   if (intelObj->mt->region->tiling != I915_TILING_NONE) {
+      state[I915_TEXREG_MS3] |= MS3_TILED_SURFACE;
+      if (intelObj->mt->region->tiling == I915_TILING_Y)
+	 state[I915_TEXREG_MS3] |= MS3_TILE_WALK;
+   }
 
    state[I915_TEXREG_MS4] =
      ((((pitch / 4) - 1) << MS4_PITCH_SHIFT) | MS4_CUBE_FACE_ENA_MASK |
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index 115004616ff..fe1be93a6d0 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -42,6 +42,7 @@
 #include "intel_regions.h"
 #include "intel_tris.h"
 #include "intel_fbo.h"
+#include "intel_chipset.h"
 
 #include "i915_reg.h"
 #include "i915_context.h"
@@ -529,6 +530,23 @@ i915_destroy_context(struct intel_context *intel)
    _tnl_free_vertices(&intel->ctx);
 }
 
+void
+i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+			     uint32_t buffer_id)
+{
+   state[0] = _3DSTATE_BUF_INFO_CMD;
+   state[1] = buffer_id;
+
+   if (region != NULL) {
+      state[1] |= BUF_3D_PITCH(region->pitch * region->cpp);
+
+      if (region->tiling != I915_TILING_NONE) {
+	 state[1] |= BUF_3D_TILED_SURFACE;
+	 if (region->tiling == I915_TILING_Y)
+	    state[1] |= BUF_3D_TILE_WALK_Y;
+      }
+   }
+}
 
 /**
  * Set the drawing regions for the color and depth/stencil buffers.
@@ -562,21 +580,11 @@ i915_state_draw_region(struct intel_context *intel,
    /*
     * Set stride/cpp values
     */
-   if (color_region) {
-      state->Buffer[I915_DESTREG_CBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_CBUFADDR1] =
-         (BUF_3D_ID_COLOR_BACK |
-          BUF_3D_PITCH(color_region->pitch * color_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_CBUFADDR0],
+				color_region, BUF_3D_ID_COLOR_BACK);
 
-   if (depth_region) {
-      state->Buffer[I915_DESTREG_DBUFADDR0] = _3DSTATE_BUF_INFO_CMD;
-      state->Buffer[I915_DESTREG_DBUFADDR1] =
-         (BUF_3D_ID_DEPTH |
-          BUF_3D_PITCH(depth_region->pitch * depth_region->cpp) |
-          BUF_3D_USE_FENCE);
-   }
+   i915_set_buf_info_for_region(&state->Buffer[I915_DESTREG_DBUFADDR0],
+				depth_region, BUF_3D_ID_DEPTH);
 
    /*
     * Compute/set I915_DESTREG_DV1 value
@@ -604,6 +612,14 @@ i915_state_draw_region(struct intel_context *intel,
       }
    }
 
+   /* This isn't quite safe, thus being hidden behind an option.  When changing
+    * the value of this bit, the pipeline needs to be MI_FLUSHed.  And it
+    * can only be set when a depth buffer is already defined.
+    */
+   if (IS_945(intel->intelScreen->deviceID) && intel->use_early_z &&
+       depth_region->tiling != I915_TILING_NONE)
+      value |= CLASSIC_EARLY_DEPTH;
+
    if (depth_region && depth_region->cpp == 4) {
       value |= DEPTH_FRMT_24_FIXED_8_OTHER;
    }
@@ -676,13 +692,6 @@ i915_assert_not_dirty( struct intel_context *intel )
    assert(!dirty);
 }
 
-static void
-i915_note_unlock( struct intel_context *intel )
-{
-    /* nothing */
-}
-
-
 void
 i915InitVtbl(struct i915_context *i915)
 {
@@ -697,6 +706,5 @@ i915InitVtbl(struct i915_context *i915)
    i915->intel.vtbl.update_texture_state = i915UpdateTextureState;
    i915->intel.vtbl.flush_cmd = i915_flush_cmd;
    i915->intel.vtbl.assert_not_dirty = i915_assert_not_dirty;
-   i915->intel.vtbl.note_unlock = i915_note_unlock; 
    i915->intel.vtbl.finish_batch = intel_finish_vb;
 }
diff --git a/src/mesa/drivers/dri/i915/intel_generatemipmap.c b/src/mesa/drivers/dri/i915/intel_generatemipmap.c
new file mode 120000
index 00000000000..4c6b37ada01
--- /dev/null
+++ b/src/mesa/drivers/dri/i915/intel_generatemipmap.c
@@ -0,0 +1 @@
+../intel/intel_generatemipmap.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_pixel_read.c b/src/mesa/drivers/dri/i915/intel_pixel_read.c
index 56087aacd4e..cc4589f4d42 100644..120000
--- a/src/mesa/drivers/dri/i915/intel_pixel_read.c
+++ b/src/mesa/drivers/dri/i915/intel_pixel_read.c
@@ -1,306 +1 @@
-/**************************************************************************
- * 
- * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "main/glheader.h"
-#include "main/enums.h"
-#include "main/mtypes.h"
-#include "main/macros.h"
-#include "main/image.h"
-#include "main/bufferobj.h"
-#include "swrast/swrast.h"
-
-#include "intel_screen.h"
-#include "intel_context.h"
-#include "intel_batchbuffer.h"
-#include "intel_blit.h"
-#include "intel_buffers.h"
-#include "intel_regions.h"
-#include "intel_pixel.h"
-#include "intel_buffer_objects.h"
-
-/* For many applications, the new ability to pull the source buffers
- * back out of the GTT and then do the packing/conversion operations
- * in software will be as much of an improvement as trying to get the
- * blitter and/or texture engine to do the work. 
- *
- * This step is gated on private backbuffers.
- * 
- * Obviously the frontbuffer can't be pulled back, so that is either
- * an argument for blit/texture readpixels, or for blitting to a
- * temporary and then pulling that back.
- *
- * When the destination is a pbo, however, it's not clear if it is
- * ever going to be pulled to main memory (though the access param
- * will be a good hint).  So it sounds like we do want to be able to
- * choose between blit/texture implementation on the gpu and pullback
- * and cpu-based copying.
- *
- * Unless you can magically turn client memory into a PBO for the
- * duration of this call, there will be a cpu-based copying step in
- * any case.
- */
-
-
-static GLboolean
-do_texture_readpixels(GLcontext * ctx,
-                      GLint x, GLint y, GLsizei width, GLsizei height,
-                      GLenum format, GLenum type,
-                      const struct gl_pixelstore_attrib *pack,
-                      struct intel_region *dest_region)
-{
-#if 0
-   struct intel_context *intel = intel_context(ctx);
-   intelScreenPrivate *screen = intel->intelScreen;
-   GLint pitch = pack->RowLength ? pack->RowLength : width;
-   __DRIdrawablePrivate *dPriv = intel->driDrawable;
-   int textureFormat;
-   GLenum glTextureFormat;
-   int destFormat, depthFormat, destPitch;
-   drm_clip_rect_t tmp;
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-
-   if (ctx->_ImageTransferState ||
-       pack->SwapBytes || pack->LsbFirst || !pack->Invert) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         fprintf(stderr, "%s: check_color failed\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   intel->vtbl.meta_texrect_source(intel, intel_readbuf_region(intel));
-
-   if (!intel->vtbl.meta_render_dest(intel, dest_region, type, format)) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         fprintf(stderr, "%s: couldn't set dest %s/%s\n",
-                 __FUNCTION__,
-                 _mesa_lookup_enum_by_nr(type),
-                 _mesa_lookup_enum_by_nr(format));
-      return GL_FALSE;
-   }
-
-   LOCK_HARDWARE(intel);
-
-   if (intel->driDrawable->numClipRects) {
-      intel->vtbl.install_meta_state(intel);
-      intel->vtbl.meta_no_depth_write(intel);
-      intel->vtbl.meta_no_stencil_write(intel);
-
-      if (!driClipRectToFramebuffer(ctx->ReadBuffer, &x, &y, &width, &height)) {
-         UNLOCK_HARDWARE(intel);
-         SET_STATE(i830, state);
-         if (INTEL_DEBUG & DEBUG_PIXEL)
-            fprintf(stderr, "%s: cliprect failed\n", __FUNCTION__);
-         return GL_TRUE;
-      }
-
-      y = dPriv->h - y - height;
-      x += dPriv->x;
-      y += dPriv->y;
-
-
-      /* Set the frontbuffer up as a large rectangular texture.
-       */
-      intel->vtbl.meta_tex_rect_source(intel, src_region, textureFormat);
-
-
-      intel->vtbl.meta_texture_blend_replace(i830, glTextureFormat);
-
-
-      /* Set the 3d engine to draw into the destination region:
-       */
-
-      intel->vtbl.meta_draw_region(intel, dest_region);
-      intel->vtbl.meta_draw_format(intel, destFormat, depthFormat);     /* ?? */
-
-
-      /* Draw a single quad, no cliprects:
-       */
-      intel->vtbl.meta_disable_cliprects(intel);
-
-      intel->vtbl.draw_quad(intel,
-                            0, width, 0, height,
-                            0x00ff00ff, x, x + width, y, y + height);
-
-      intel->vtbl.leave_meta_state(intel);
-   }
-   UNLOCK_HARDWARE(intel);
-
-   intel_region_wait_fence(ctx, dest_region);   /* required by GL */
-   return GL_TRUE;
-#endif
-
-   return GL_FALSE;
-}
-
-
-
-
-static GLboolean
-do_blit_readpixels(GLcontext * ctx,
-                   GLint x, GLint y, GLsizei width, GLsizei height,
-                   GLenum format, GLenum type,
-                   const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_region *src = intel_readbuf_region(intel);
-   struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj);
-   GLuint dst_offset;
-   GLuint rowLength;
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s\n", __FUNCTION__);
-
-   if (!src)
-      return GL_FALSE;
-
-   if (dst) {
-      /* XXX This validation should be done by core mesa:
-       */
-      if (!_mesa_validate_pbo_access(2, pack, width, height, 1,
-                                     format, type, pixels)) {
-         _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawPixels");
-         return GL_TRUE;
-      }
-   }
-   else {
-      /* PBO only for now:
-       */
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s - not PBO\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-
-   if (ctx->_ImageTransferState ||
-       !intel_check_blit_format(src, format, type)) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s - bad format for blit\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   if (pack->Alignment != 1 || pack->SwapBytes || pack->LsbFirst) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s: bad packing params\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-
-   if (pack->RowLength > 0)
-      rowLength = pack->RowLength;
-   else
-      rowLength = width;
-
-   if (pack->Invert) {
-      if (INTEL_DEBUG & DEBUG_PIXEL)
-         _mesa_printf("%s: MESA_PACK_INVERT not done yet\n", __FUNCTION__);
-      return GL_FALSE;
-   }
-   else {
-      rowLength = -rowLength;
-   }
-
-   /* XXX 64-bit cast? */
-   dst_offset = (GLuint) _mesa_image_address(2, pack, pixels, width, height,
-                                             format, type, 0, 0, 0);
-
-
-   /* Although the blits go on the command buffer, need to do this and
-    * fire with lock held to guarentee cliprects are correct.
-    */
-   intelFlush(&intel->ctx);
-   LOCK_HARDWARE(intel);
-
-   if (intel->driDrawable->numClipRects) {
-      GLboolean all = (width * height * src->cpp == dst->Base.Size &&
-                       x == 0 && dst_offset == 0);
-
-      dri_bo *dst_buffer = intel_bufferobj_buffer(intel, dst,
-						  all ? INTEL_WRITE_FULL :
-						  INTEL_WRITE_PART);
-      __DRIdrawablePrivate *dPriv = intel->driDrawable;
-      int nbox = dPriv->numClipRects;
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t rect;
-      drm_clip_rect_t src_rect;
-      int i;
-
-      src_rect.x1 = dPriv->x + x;
-      src_rect.y1 = dPriv->y + dPriv->h - (y + height);
-      src_rect.x2 = src_rect.x1 + width;
-      src_rect.y2 = src_rect.y1 + height;
-
-
-
-      for (i = 0; i < nbox; i++) {
-         if (!intel_intersect_cliprects(&rect, &src_rect, &box[i]))
-            continue;
-
-         intelEmitCopyBlit(intel,
-                           src->cpp,
-                           src->pitch, src->buffer, 0, src->tiling,
-                           rowLength, dst_buffer, dst_offset, GL_FALSE,
-                           rect.x1,
-                           rect.y1,
-                           rect.x1 - src_rect.x1,
-                           rect.y2 - src_rect.y2,
-                           rect.x2 - rect.x1, rect.y2 - rect.y1,
-			   GL_COPY);
-      }
-   }
-   UNLOCK_HARDWARE(intel);
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s - DONE\n", __FUNCTION__);
-
-   return GL_TRUE;
-}
-
-void
-intelReadPixels(GLcontext * ctx,
-                GLint x, GLint y, GLsizei width, GLsizei height,
-                GLenum format, GLenum type,
-                const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
-{
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   intelFlush(ctx);
-
-   if (do_blit_readpixels
-       (ctx, x, y, width, height, format, type, pack, pixels))
-      return;
-
-   if (do_texture_readpixels
-       (ctx, x, y, width, height, format, type, pack, pixels))
-      return;
-
-   if (INTEL_DEBUG & DEBUG_PIXEL)
-      _mesa_printf("%s: fallback to swrast\n", __FUNCTION__);
-
-   _swrast_ReadPixels(ctx, x, y, width, height, format, type, pack, pixels);
-}
+../intel/intel_pixel_read.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i915/intel_tris.c b/src/mesa/drivers/dri/i915/intel_tris.c
index 1d39278cbf0..a905455342d 100644
--- a/src/mesa/drivers/dri/i915/intel_tris.c
+++ b/src/mesa/drivers/dri/i915/intel_tris.c
@@ -1255,11 +1255,9 @@ intel_meta_draw_poly(struct intel_context *intel,
 {
    union fi *vb;
    GLint i;
-   GLboolean was_locked = intel->locked;
    unsigned int saved_vertex_size = intel->vertex_size;
 
-   if (!was_locked)
-       LOCK_HARDWARE(intel);
+   LOCK_HARDWARE(intel);
 
    intel->vertex_size = 6;
 
@@ -1283,8 +1281,7 @@ intel_meta_draw_poly(struct intel_context *intel,
 
    intel->vertex_size = saved_vertex_size;
 
-   if (!was_locked)
-       UNLOCK_HARDWARE(intel);
+   UNLOCK_HARDWARE(intel);
 }
 
 static void
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index 2934414d99a..00a42111da0 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -14,6 +14,7 @@ DRIVER_SOURCES = \
 	intel_decode.c \
 	intel_extensions.c \
 	intel_fbo.c \
+	intel_generatemipmap.c \
 	intel_mipmap_tree.c \
 	intel_regions.c \
 	intel_screen.c \
@@ -22,6 +23,7 @@ DRIVER_SOURCES = \
 	intel_pixel_bitmap.c \
 	intel_pixel_copy.c \
 	intel_pixel_draw.c \
+	intel_pixel_read.c \
 	intel_state.c \
 	intel_swapbuffers.c \
 	intel_tex.c \
@@ -69,6 +71,7 @@ DRIVER_SOURCES = \
 	brw_vs_constval.c \
 	brw_vs_emit.c \
 	brw_vs_state.c \
+	brw_vs_surface_state.c \
 	brw_vtbl.c \
 	brw_wm.c \
 	brw_wm_debug.c \
diff --git a/src/mesa/drivers/dri/i965/brw_clip.c b/src/mesa/drivers/dri/i965/brw_clip.c
index 5cffcebde43..54d30a3f422 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.c
+++ b/src/mesa/drivers/dri/i965/brw_clip.c
@@ -65,21 +65,31 @@ static void compile_clip_prog( struct brw_context *brw,
    c.func.single_program_flow = 1;
 
    c.key = *key;
-
+   c.need_ff_sync = BRW_IS_IGDNG(brw);
 
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.header_position_offset = ATTR_SIZE;
 
-   for (i = 0, delta = REG_SIZE; i < VERT_RESULT_MAX; i++)
+   if (BRW_IS_IGDNG(brw))
+       delta = 3 * REG_SIZE;
+   else
+       delta = REG_SIZE;
+
+   for (i = 0; i < VERT_RESULT_MAX; i++)
       if (c.key.attrs & (1<<i)) {
 	 c.offset[i] = delta;
 	 delta += ATTR_SIZE;
       }
 
    c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+   
+   if (BRW_IS_IGDNG(brw))
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
    c.prog_data.clip_mode = c.key.clip_mode; /* XXX */
@@ -148,7 +158,11 @@ static void upload_clip_prog(struct brw_context *brw)
    key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
    /* _NEW_TRANSFORM */
    key.nr_userclip = brw_count_bits(ctx->Transform.ClipPlanesEnabled);
-   key.clip_mode = BRW_CLIPMODE_NORMAL;
+
+   if (BRW_IS_IGDNG(brw))
+       key.clip_mode = BRW_CLIPMODE_KERNEL_CLIP;
+   else
+       key.clip_mode = BRW_CLIPMODE_NORMAL;
 
    /* _NEW_POLYGON */
    if (key.primitive == GL_TRIANGLES) {
diff --git a/src/mesa/drivers/dri/i965/brw_clip.h b/src/mesa/drivers/dri/i965/brw_clip.h
index e06747864b5..12e8548df1f 100644
--- a/src/mesa/drivers/dri/i965/brw_clip.h
+++ b/src/mesa/drivers/dri/i965/brw_clip.h
@@ -117,6 +117,7 @@ struct brw_clip_compile {
 
    GLuint header_position_offset;
    GLuint offset[VERT_ATTRIB_MAX];
+   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
@@ -171,5 +172,5 @@ struct brw_reg get_tmp( struct brw_clip_compile *c );
 
 void brw_clip_project_position(struct brw_clip_compile *c,
              struct brw_reg pos );
-
+void brw_clip_ff_sync(struct brw_clip_compile *c);
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_clip_line.c b/src/mesa/drivers/dri/i965/brw_clip_line.c
index d830e49e50a..9abd0642aa2 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_line.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_line.c
@@ -130,7 +130,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    struct brw_instruction *plane_loop;
    struct brw_instruction *plane_active;
    struct brw_instruction *is_negative;
-   struct brw_instruction *is_neg2;
+   struct brw_instruction *is_neg2 = NULL;
    struct brw_instruction *not_culled;
    struct brw_reg v1_null_ud = retype(vec1(brw_null_reg()), BRW_REGISTER_TYPE_UD);
 
@@ -148,7 +148,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
    brw_clip_init_clipmask(c);
 
    /* -ve rhw workaround */
-   if (!BRW_IS_G4X(p->brw)) {
+   if (BRW_IS_965(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2),
               brw_imm_ud(1<<20));
@@ -185,7 +185,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
               * Both can be negative on GM965/G965 due to RHW workaround
               * if so, this object should be rejected.
               */
-             if (!BRW_IS_G4X(p->brw)) {
+             if (BRW_IS_965(p->brw)) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_LE, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
                  {
@@ -210,7 +210,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 
              /* If both are positive, do nothing */
              /* Only on GM965/G965 */
-             if (!BRW_IS_G4X(p->brw)) {
+             if (BRW_IS_965(p->brw)) {
                  brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.dp0, brw_imm_f(0.0));
                  is_neg2 = brw_IF(p, BRW_EXECUTE_1);
              }
@@ -225,7 +225,7 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
                  brw_set_predicate_control(p, BRW_PREDICATE_NONE);
              }
 
-             if (!BRW_IS_G4X(p->brw)) {
+             if (BRW_IS_965(p->brw)) {
                  brw_ENDIF(p, is_neg2);
              }
          }
@@ -246,6 +246,8 @@ static void clip_and_emit_line( struct brw_clip_compile *c )
 
    brw_ADD(p, c->reg.t, c->reg.t0, c->reg.t1);
    brw_CMP(p, vec1(brw_null_reg()), BRW_CONDITIONAL_L, c->reg.t, brw_imm_f(1.0));
+   if (c->need_ff_sync)
+	   brw_clip_ff_sync(c);      
    not_culled = brw_IF(p, BRW_EXECUTE_1);
    {
       brw_clip_interp_vertex(c, newvtx0, vtx0, vtx1, c->reg.t0, GL_FALSE);
diff --git a/src/mesa/drivers/dri/i965/brw_clip_point.c b/src/mesa/drivers/dri/i965/brw_clip_point.c
index d17b199b898..97382991686 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_point.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_point.c
@@ -50,5 +50,7 @@ void brw_emit_point_clip( struct brw_clip_compile *c )
    /* Send an empty message to kill the thread:
     */
    brw_clip_tri_alloc_regs(c, 0);
+   if (c->need_ff_sync)
+	   brw_clip_ff_sync(c);      
    brw_clip_kill_thread(c);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_state.c b/src/mesa/drivers/dri/i965/brw_clip_state.c
index 9b0d7eab7bf..5762c9577c6 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_state.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_state.c
@@ -95,7 +95,14 @@ clip_unit_create_from_key(struct brw_context *brw,
        * even number.
        */
       assert(key->nr_urb_entries % 2 == 0);
-      clip.thread4.max_threads = 2 - 1;
+      
+      /* Although up to 16 concurrent Clip threads are allowed on IGDNG, 
+       * only 2 threads can output VUEs at a time.
+       */
+      if (BRW_IS_IGDNG(brw))
+         clip.thread4.max_threads = 16 - 1;        
+      else
+         clip.thread4.max_threads = 2 - 1;
    } else {
       assert(key->nr_urb_entries >= 5);
       clip.thread4.max_threads = 1 - 1;
diff --git a/src/mesa/drivers/dri/i965/brw_clip_tri.c b/src/mesa/drivers/dri/i965/brw_clip_tri.c
index 7fd37bd05ff..4c2d655fb10 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_tri.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_tri.c
@@ -77,6 +77,10 @@ void brw_clip_tri_alloc_regs( struct brw_clip_compile *c,
    if (c->nr_attrs & 1) {
       for (j = 0; j < 3; j++) {
 	 GLuint delta = c->nr_attrs*16 + 32;
+
+         if (BRW_IS_IGDNG(c->func.brw))
+             delta = c->nr_attrs * 16 + 32 * 3;
+
 	 brw_MOV(&c->func, byte_offset(c->reg.vertex[j], delta), brw_imm_f(0));
       }
    }
@@ -562,7 +566,7 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
 
    /* if -ve rhw workaround bit is set, 
       do cliptest */
-   if (!BRW_IS_G4X(p->brw)) {
+   if (BRW_IS_965(p->brw)) {
       brw_set_conditionalmod(p, BRW_CONDITIONAL_NZ);
       brw_AND(p, brw_null_reg(), get_element_ud(c->reg.R0, 2), 
               brw_imm_ud(1<<20));
@@ -579,11 +583,14 @@ void brw_emit_tri_clip( struct brw_clip_compile *c )
    if (c->key.do_flat_shading) 
       brw_clip_tri_flat_shade(c); 
       
-   if (c->key.clip_mode == BRW_CLIPMODE_NORMAL)
+   if ((c->key.clip_mode == BRW_CLIPMODE_NORMAL) ||
+       (c->key.clip_mode == BRW_CLIPMODE_KERNEL_CLIP))
       do_clip_tri(c);
    else 
       maybe_do_clip_tri(c);
-      
+
+   if (c->need_ff_sync)
+	   brw_clip_ff_sync(c);      
    brw_clip_tri_emit_polygon(c);
 
    /* Send an empty message to kill the thread:
diff --git a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
index d7ca517927b..26950383c1b 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_unfilled.c
@@ -496,6 +496,8 @@ void brw_emit_unfilled_clip( struct brw_clip_compile *c )
    }
    brw_ENDIF(p, do_clip);
    
+   if (c->need_ff_sync)
+	   brw_clip_ff_sync(c);      
    emit_unfilled_primitives(c);
    brw_clip_kill_thread(c);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_clip_util.c b/src/mesa/drivers/dri/i965/brw_clip_util.c
index 9d3b0be694a..e09efc07ed4 100644
--- a/src/mesa/drivers/dri/i965/brw_clip_util.c
+++ b/src/mesa/drivers/dri/i965/brw_clip_util.c
@@ -140,6 +140,10 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 
    /* Just copy the vertex header:
     */
+   /*
+    * After CLIP stage, only first 256 bits of the VUE are read
+    * back on IGDNG, so needn't change it
+    */
    brw_copy_indirect_to_indirect(p, dest_ptr, v0_ptr, 1);
       
    /* Iterate over each attribute (could be done in pairs?)
@@ -147,6 +151,9 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
    for (i = 0; i < c->nr_attrs; i++) {
       GLuint delta = i*16 + 32;
 
+      if (BRW_IS_IGDNG(p->brw))
+          delta = i * 16 + 32 * 3;
+
       if (delta == c->offset[VERT_RESULT_EDGE]) {
 	 if (force_edgeflag) 
 	    brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(1));
@@ -177,6 +184,10 @@ void brw_clip_interp_vertex( struct brw_clip_compile *c,
 
    if (i & 1) {
       GLuint delta = i*16 + 32;
+
+      if (BRW_IS_IGDNG(p->brw))
+          delta = i * 16 + 32 * 3;
+
       brw_MOV(p, deref_4f(dest_ptr, delta), brw_imm_f(0));
    }
 
@@ -343,3 +354,19 @@ void brw_clip_init_clipmask( struct brw_clip_compile *c )
    }
 }
 
+void brw_clip_ff_sync(struct brw_clip_compile *c)
+{
+	struct brw_compile *p = &c->func;
+	brw_ff_sync(p, 
+				c->reg.R0,
+				0,
+				c->reg.R0,
+				1,	
+				1,		/* used */
+				1,  	/* msg length */
+				1,		/* response length */
+				0,		/* eot */
+				1,		/* write compelete */
+				0,		/* urb offset */
+				BRW_URB_SWIZZLE_NONE);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 577497bf6b0..57ddf75413a 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -129,8 +129,8 @@ struct brw_context;
 #define BRW_NEW_PRIMITIVE               0x40
 #define BRW_NEW_CONTEXT                 0x80
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
-#define BRW_NEW_INPUT_VARYING           0x200
 #define BRW_NEW_PSP                     0x800
+#define BRW_NEW_WM_SURFACES		0x1000
 #define BRW_NEW_FENCE                   0x2000
 #define BRW_NEW_INDICES			0x4000
 #define BRW_NEW_VERTICES		0x8000
@@ -401,7 +401,6 @@ struct brw_vertex_element {
 
 
 struct brw_vertex_info {
-   GLuint varying;  /* varying:1[VERT_ATTRIB_MAX] */
    GLuint sizes[ATTRIB_BIT_DWORDS * 2]; /* sizes:2[VERT_ATTRIB_MAX] */
 };
 
@@ -450,8 +449,6 @@ struct brw_context
 
    struct {
       struct brw_state_flags dirty;
-      struct brw_tracked_state **atoms;
-      GLuint nr_atoms;
 
       GLuint nr_color_regions;
       struct intel_region *color_regions[MAX_DRAW_BUFFERS];
@@ -471,7 +468,8 @@ struct brw_context
       int validated_bo_count;
    } state;
 
-   struct brw_cache cache;
+   struct brw_cache cache;  /** non-surface items */
+   struct brw_cache surface_cache;  /* surface items */
    struct brw_cached_batch_item *cached_batch_items;
 
    struct {
@@ -555,11 +553,6 @@ struct brw_context
       GLuint vs_size;
       GLuint total_size;
 
-      /* Dynamic tracker which changes to reflect the state referenced
-       * by active fp and vp program parameters:
-       */
-      struct brw_tracked_state tracked_state;
-
       dri_bo *curbe_bo;
       /** Offset within curbe_bo of space for current curbe entry */
       GLuint curbe_offset;
diff --git a/src/mesa/drivers/dri/i965/brw_curbe.c b/src/mesa/drivers/dri/i965/brw_curbe.c
index 9197fede2d8..a1a6c53d0e0 100644
--- a/src/mesa/drivers/dri/i965/brw_curbe.c
+++ b/src/mesa/drivers/dri/i965/brw_curbe.c
@@ -36,6 +36,7 @@
 #include "main/macros.h"
 #include "main/enums.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
@@ -188,13 +189,6 @@ static void prepare_constant_buffer(struct brw_context *brw)
    GLfloat *buf;
    GLuint i;
 
-   /* Update our own dependency flags.  This works because this
-    * function will also be called whenever fp or vp changes.
-    */
-   brw->curbe.tracked_state.dirty.mesa = (_NEW_TRANSFORM|_NEW_PROJECTION);
-   brw->curbe.tracked_state.dirty.mesa |= vp->program.Base.Parameters->StateFlags;
-   brw->curbe.tracked_state.dirty.mesa |= fp->program.Base.Parameters->StateFlags;
-
    if (sz == 0) {
       if (brw->curbe.last_buf) {
 	 free(brw->curbe.last_buf);
@@ -335,78 +329,11 @@ static void prepare_constant_buffer(struct brw_context *brw)
     */
 }
 
-
-/**
- * Copy Mesa program parameters into given constant buffer.
- */
-static void
-update_constant_buffer(struct brw_context *brw,
-                       const struct gl_program_parameter_list *params,
-                       dri_bo *const_buffer)
-{
-   struct intel_context *intel = &brw->intel;
-   const int size = params->NumParameters * 4 * sizeof(GLfloat);
-
-   /* copy Mesa program constants into the buffer */
-   if (const_buffer && size > 0) {
-
-      assert(const_buffer);
-      assert(const_buffer->size >= size);
-
-      if (intel->intelScreen->kernel_exec_fencing) {
-         drm_intel_gem_bo_map_gtt(const_buffer);
-         memcpy(const_buffer->virtual, params->ParameterValues, size);
-         drm_intel_gem_bo_unmap_gtt(const_buffer);
-      }
-      else {
-         dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
-      }
-
-      if (0) {
-         int i;
-         for (i = 0; i < params->NumParameters; i++) {
-            float *p = params->ParameterValues[i];
-            printf("%d: %f %f %f %f\n", i, p[0], p[1], p[2], p[3]);
-         }
-      }
-   }
-}
-
-
-/** Copy current vertex program's parameters into the constant buffer */
-static void
-update_vertex_constant_buffer(struct brw_context *brw)
-{
-   struct brw_vertex_program *vp =
-      (struct brw_vertex_program *) brw->vertex_program;
-   if (0) {
-      printf("update VS constants in buffer %p\n", vp->const_buffer);
-      printf("program %u\n", vp->program.Base.Id);
-   }
-   if (vp->use_const_buffer)
-      update_constant_buffer(brw, vp->program.Base.Parameters, vp->const_buffer);
-}
-
-
-/** Copy current fragment program's parameters into the constant buffer */
-static void
-update_fragment_constant_buffer(struct brw_context *brw)
-{
-   struct brw_fragment_program *fp =
-      (struct brw_fragment_program *) brw->fragment_program;
-   if (fp->use_const_buffer)
-      update_constant_buffer(brw, fp->program.Base.Parameters, fp->const_buffer);
-}
-
-
 static void emit_constant_buffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    GLuint sz = brw->curbe.total_size;
 
-   update_vertex_constant_buffer(brw);
-   update_fragment_constant_buffer(brw);
-
    BEGIN_BATCH(2, IGNORE_CLIPRECTS);
    if (sz == 0) {
       OUT_BATCH((CMD_CONST_BUFFER << 16) | (2 - 2));
@@ -428,7 +355,7 @@ static void emit_constant_buffer(struct brw_context *brw)
  */
 const struct brw_tracked_state brw_constant_buffer = {
    .dirty = {
-      .mesa = (_NEW_TRANSFORM|_NEW_PROJECTION),      /* plus fp and vp flags */
+      .mesa = _NEW_PROGRAM_CONSTANTS,
       .brw  = (BRW_NEW_FRAGMENT_PROGRAM |
 	       BRW_NEW_VERTEX_PROGRAM |
 	       BRW_NEW_URB_FENCE | /* Implicit - hardware requires this, not used above */
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index 98fc909c2ad..d166250b4fe 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -139,6 +139,7 @@
 #define BRW_CLIPMODE_CLIP_NON_REJECTED   2
 #define BRW_CLIPMODE_REJECT_ALL          3
 #define BRW_CLIPMODE_ACCEPT_ALL          4
+#define BRW_CLIPMODE_KERNEL_CLIP         5
 
 #define BRW_CLIP_NDCSPACE     0
 #define BRW_CLIP_SCREENSPACE  1
@@ -670,6 +671,25 @@
 #define BRW_SAMPLER_MESSAGE_SIMD8_LD                  3
 #define BRW_SAMPLER_MESSAGE_SIMD16_LD                 3
 
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG            0
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_IGDNG          0
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG           0
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG       1
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_BIAS_IGDNG     1
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG      1
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_IGDNG        2
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_LOD_IGDNG      2
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD_IGDNG       2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG    3
+#define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE_IGDNG  3
+#define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG   3
+
+/* for IGDNG only */
+#define BRW_SAMPLER_SIMD_MODE_SIMD4X2                   0
+#define BRW_SAMPLER_SIMD_MODE_SIMD8                     1
+#define BRW_SAMPLER_SIMD_MODE_SIMD16                    2
+#define BRW_SAMPLER_SIMD_MODE_SIMD32_64                 3
+
 #define BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW   0
 #define BRW_DATAPORT_OWORD_BLOCK_1_OWORDHIGH  1
 #define BRW_DATAPORT_OWORD_BLOCK_2_OWORDS     2
@@ -819,8 +839,11 @@
 #include "intel_chipset.h"
 
 #define BRW_IS_G4X(brw)         (IS_G4X((brw)->intel.intelScreen->deviceID))
-#define CMD_PIPELINE_SELECT(brw)        (BRW_IS_G4X(brw) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
-#define CMD_VF_STATISTICS(brw)          (BRW_IS_G4X(brw) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
-#define URB_SIZES(brw)                  (BRW_IS_G4X(brw) ? 384 : 256)  /* 512 bit units */
+#define BRW_IS_IGDNG(brw)         (IS_IGDNG((brw)->intel.intelScreen->deviceID))
+#define BRW_IS_965(brw)         (!(BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)))
+#define CMD_PIPELINE_SELECT(brw)        ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_PIPELINE_SELECT_GM45 : CMD_PIPELINE_SELECT_965)
+#define CMD_VF_STATISTICS(brw)          ((BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? CMD_VF_STATISTICS_GM45 : CMD_VF_STATISTICS_965)
+#define URB_SIZES(brw)                  (BRW_IS_IGDNG(brw) ? 1024 : \
+                                         (BRW_IS_G4X(brw) ? 384 : 256))  /* 512 bit units */
 
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 5342622a737..5152c3f3a5f 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -187,19 +187,13 @@ static void brw_merge_inputs( struct brw_context *brw,
       brw->vb.inputs[i].glarray = arrays[i];
 
       if (arrays[i]->StrideB != 0)
-	 brw->vb.info.varying |= 1 << i;
-
 	 brw->vb.info.sizes[i/16] |= (brw->vb.inputs[i].glarray->Size - 1) <<
 	    ((i%16) * 2);
    }
 
-   /* Raise statechanges if input sizes and varying have changed: 
-    */
+   /* Raise statechanges if input sizes have changed. */
    if (memcmp(brw->vb.info.sizes, old.sizes, sizeof(old.sizes)) != 0)
       brw->state.dirty.brw |= BRW_NEW_INPUT_DIMENSIONS;
-
-   if (brw->vb.info.varying != old.varying)
-      brw->state.dirty.brw |= BRW_NEW_INPUT_VARYING;
 }
 
 /* XXX: could split the primitive list to fallback only on the
@@ -416,6 +410,8 @@ static GLboolean brw_try_draw_prims( GLcontext *ctx,
  out:
    UNLOCK_HARDWARE(intel);
 
+   brw_state_cache_check_size(brw);
+
    if (warn)
       fprintf(stderr, "i965: Single primitive emit potentially exceeded "
 	      "available aperture space\n");
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index b91b20bec6f..c29f1dd5c03 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -343,7 +343,7 @@ static void brw_prepare_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read; 
+   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read; 
    GLuint i;
    const unsigned char *ptr = NULL;
    GLuint interleave = 0;
@@ -362,11 +362,11 @@ static void brw_prepare_vertices(struct brw_context *brw)
       _mesa_printf("%s %d..%d\n", __FUNCTION__, min_index, max_index);
 
    /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      GLuint i = _mesa_ffsll(tmp)-1;
+   while (vs_inputs) {
+      GLuint i = _mesa_ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      tmp &= ~(1<<i);
+      vs_inputs &= ~(1 << i);
       enabled[nr_enabled++] = input;
    }
 
@@ -477,17 +477,17 @@ static void brw_emit_vertices(struct brw_context *brw)
 {
    GLcontext *ctx = &brw->intel.ctx;
    struct intel_context *intel = intel_context(ctx);
-   GLuint tmp = brw->vs.prog_data->inputs_read;
+   GLbitfield vs_inputs = brw->vs.prog_data->inputs_read;
    struct brw_vertex_element *enabled[VERT_ATTRIB_MAX];
    GLuint i;
    GLuint nr_enabled = 0;
 
   /* Accumulate the list of enabled arrays. */
-   while (tmp) {
-      i = _mesa_ffsll(tmp)-1;
+   while (vs_inputs) {
+      i = _mesa_ffsll(vs_inputs) - 1;
       struct brw_vertex_element *input = &brw->vb.inputs[i];
 
-      tmp &= ~(1<<i);
+      vs_inputs &= ~(1 << i);
       enabled[nr_enabled++] = input;
    }
 
@@ -512,7 +512,19 @@ static void brw_emit_vertices(struct brw_context *brw)
       OUT_RELOC(input->bo,
 		I915_GEM_DOMAIN_VERTEX, 0,
 		input->offset);
-      OUT_BATCH(brw->vb.max_index);
+      if (BRW_IS_IGDNG(brw)) {
+          if (input->stride) {
+              OUT_RELOC(input->bo,
+                        I915_GEM_DOMAIN_VERTEX, 0,
+                        input->offset + input->stride * input->count);
+          } else {
+              assert(input->count == 1);
+              OUT_RELOC(input->bo,
+                        I915_GEM_DOMAIN_VERTEX, 0,
+                        input->offset + input->element_size);
+          }
+      } else
+          OUT_BATCH(brw->vb.max_index);
       OUT_BATCH(0); /* Instance data step rate */
    }
    ADVANCE_BATCH();
@@ -542,11 +554,18 @@ static void brw_emit_vertices(struct brw_context *brw)
 		BRW_VE0_VALID |
 		(format << BRW_VE0_FORMAT_SHIFT) |
 		(0 << BRW_VE0_SRC_OFFSET_SHIFT));
-      OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
-		(comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
-		(comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
-		(comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
-		((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
+
+      if (BRW_IS_IGDNG(brw))
+          OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+                    (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+                    (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+                    (comp3 << BRW_VE1_COMPONENT_3_SHIFT));
+      else
+          OUT_BATCH((comp0 << BRW_VE1_COMPONENT_0_SHIFT) |
+                    (comp1 << BRW_VE1_COMPONENT_1_SHIFT) |
+                    (comp2 << BRW_VE1_COMPONENT_2_SHIFT) |
+                    (comp3 << BRW_VE1_COMPONENT_3_SHIFT) |
+                    ((i * 4) << BRW_VE1_DST_OFFSET_SHIFT));
    }
    ADVANCE_BATCH();
 }
@@ -635,7 +654,7 @@ static void brw_emit_indices(struct brw_context *brw)
    if (index_buffer == NULL)
       return;
 
-   ib_size = get_size(index_buffer->type) * index_buffer->count;
+   ib_size = get_size(index_buffer->type) * index_buffer->count - 1;
 
    /* Emit the indexbuffer packet:
     */
diff --git a/src/mesa/drivers/dri/i965/brw_eu.h b/src/mesa/drivers/dri/i965/brw_eu.h
index 003332f78ca..30603bdd0e6 100644
--- a/src/mesa/drivers/dri/i965/brw_eu.h
+++ b/src/mesa/drivers/dri/i965/brw_eu.h
@@ -97,7 +97,7 @@ struct brw_glsl_call;
 
 
 #define BRW_EU_MAX_INSN_STACK 5
-#define BRW_EU_MAX_INSN 4000
+#define BRW_EU_MAX_INSN 10000
 
 struct brw_compile {
    struct brw_instruction store[BRW_EU_MAX_INSN];
@@ -171,9 +171,9 @@ static INLINE struct brw_reg brw_reg( GLuint file,
 {
    struct brw_reg reg;
    if (type == BRW_GENERAL_REGISTER_FILE)
-      assert(nr < 128);
+      assert(nr < BRW_MAX_GRF);
    else if (type == BRW_MESSAGE_REGISTER_FILE)
-      assert(nr < 9);
+      assert(nr < BRW_MAX_MRF);
    else if (type == BRW_ARCHITECTURE_REGISTER_FILE)
       assert(nr <= BRW_ARF_IP);
 
@@ -538,6 +538,7 @@ static INLINE struct brw_reg brw_mask_reg( GLuint subnr )
 
 static INLINE struct brw_reg brw_message_reg( GLuint nr )
 {
+   assert(nr < BRW_MAX_MRF);
    return brw_vec8_reg(BRW_MESSAGE_REGISTER_FILE,
 		       nr,
 		       0);
@@ -815,6 +816,19 @@ void brw_urb_WRITE(struct brw_compile *p,
 		   GLuint offset,
 		   GLuint swizzle);
 
+void brw_ff_sync(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle);
+
 void brw_fb_WRITE(struct brw_compile *p,
 		   struct brw_reg dest,
 		   GLuint msg_reg_nr,
@@ -834,7 +848,9 @@ void brw_SAMPLE(struct brw_compile *p,
 		GLuint msg_type,
 		GLuint response_length,
 		GLuint msg_length,
-		GLboolean eot);
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode);
 
 void brw_math_16( struct brw_compile *p,
 		  struct brw_reg dest,
diff --git a/src/mesa/drivers/dri/i965/brw_eu_emit.c b/src/mesa/drivers/dri/i965/brw_eu_emit.c
index 2a147fb8c38..2412014248c 100644
--- a/src/mesa/drivers/dri/i965/brw_eu_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_eu_emit.c
@@ -241,7 +241,8 @@ void brw_set_src1( struct brw_instruction *insn,
 
 
 
-static void brw_set_math_message( struct brw_instruction *insn,
+static void brw_set_math_message( struct brw_context *brw,
+				  struct brw_instruction *insn,
 				  GLuint msg_length,
 				  GLuint response_length,
 				  GLuint function,
@@ -252,18 +253,35 @@ static void brw_set_math_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.math.function = function;
-   insn->bits3.math.int_type = integer_type;
-   insn->bits3.math.precision = low_precision;
-   insn->bits3.math.saturate = saturate;
-   insn->bits3.math.data_type = dataType;
-   insn->bits3.math.response_length = response_length;
-   insn->bits3.math.msg_length = msg_length;
-   insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
-   insn->bits3.math.end_of_thread = 0;
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.math_igdng.function = function;
+       insn->bits3.math_igdng.int_type = integer_type;
+       insn->bits3.math_igdng.precision = low_precision;
+       insn->bits3.math_igdng.saturate = saturate;
+       insn->bits3.math_igdng.data_type = dataType;
+       insn->bits3.math_igdng.snapshot = 0;
+       insn->bits3.math_igdng.header_present = 0;
+       insn->bits3.math_igdng.response_length = response_length;
+       insn->bits3.math_igdng.msg_length = msg_length;
+       insn->bits3.math_igdng.end_of_thread = 0;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_MATH;
+       insn->bits2.send_igdng.end_of_thread = 0;
+   } else {
+       insn->bits3.math.function = function;
+       insn->bits3.math.int_type = integer_type;
+       insn->bits3.math.precision = low_precision;
+       insn->bits3.math.saturate = saturate;
+       insn->bits3.math.data_type = dataType;
+       insn->bits3.math.response_length = response_length;
+       insn->bits3.math.msg_length = msg_length;
+       insn->bits3.math.msg_target = BRW_MESSAGE_TARGET_MATH;
+       insn->bits3.math.end_of_thread = 0;
+   }
 }
 
-static void brw_set_urb_message( struct brw_instruction *insn,
+
+static void brw_set_ff_sync_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
 				 GLboolean allocate,
 				 GLboolean used,
 				 GLuint msg_length,
@@ -273,21 +291,64 @@ static void brw_set_urb_message( struct brw_instruction *insn,
 				 GLuint offset,
 				 GLuint swizzle_control )
 {
-   brw_set_src1(insn, brw_imm_d(0));
+	brw_set_src1(insn, brw_imm_d(0));
+
+	insn->bits3.urb_igdng.opcode = 1;
+	insn->bits3.urb_igdng.offset = offset;
+	insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+	insn->bits3.urb_igdng.allocate = allocate;
+	insn->bits3.urb_igdng.used = used;
+	insn->bits3.urb_igdng.complete = complete;
+	insn->bits3.urb_igdng.header_present = 1;
+	insn->bits3.urb_igdng.response_length = response_length;
+	insn->bits3.urb_igdng.msg_length = msg_length;
+	insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+	insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+	insn->bits2.send_igdng.end_of_thread = end_of_thread;
+}
 
-   insn->bits3.urb.opcode = 0;	/* ? */
-   insn->bits3.urb.offset = offset;
-   insn->bits3.urb.swizzle_control = swizzle_control;
-   insn->bits3.urb.allocate = allocate;
-   insn->bits3.urb.used = used;	/* ? */
-   insn->bits3.urb.complete = complete;
-   insn->bits3.urb.response_length = response_length;
-   insn->bits3.urb.msg_length = msg_length;
-   insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+static void brw_set_urb_message( struct brw_context *brw,
+				 struct brw_instruction *insn,
+				 GLboolean allocate,
+				 GLboolean used,
+				 GLuint msg_length,
+				 GLuint response_length,
+				 GLboolean end_of_thread,
+				 GLboolean complete,
+				 GLuint offset,
+				 GLuint swizzle_control )
+{
+    brw_set_src1(insn, brw_imm_d(0));
+
+    if (BRW_IS_IGDNG(brw)) {
+        insn->bits3.urb_igdng.opcode = 0;	/* ? */
+        insn->bits3.urb_igdng.offset = offset;
+        insn->bits3.urb_igdng.swizzle_control = swizzle_control;
+        insn->bits3.urb_igdng.allocate = allocate;
+        insn->bits3.urb_igdng.used = used;	/* ? */
+        insn->bits3.urb_igdng.complete = complete;
+        insn->bits3.urb_igdng.header_present = 1;
+        insn->bits3.urb_igdng.response_length = response_length;
+        insn->bits3.urb_igdng.msg_length = msg_length;
+        insn->bits3.urb_igdng.end_of_thread = end_of_thread;
+        insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_URB;
+        insn->bits2.send_igdng.end_of_thread = end_of_thread;
+    } else {
+        insn->bits3.urb.opcode = 0;	/* ? */
+        insn->bits3.urb.offset = offset;
+        insn->bits3.urb.swizzle_control = swizzle_control;
+        insn->bits3.urb.allocate = allocate;
+        insn->bits3.urb.used = used;	/* ? */
+        insn->bits3.urb.complete = complete;
+        insn->bits3.urb.response_length = response_length;
+        insn->bits3.urb.msg_length = msg_length;
+        insn->bits3.urb.msg_target = BRW_MESSAGE_TARGET_URB;
+        insn->bits3.urb.end_of_thread = end_of_thread;
+    }
 }
 
-static void brw_set_dp_write_message( struct brw_instruction *insn,
+static void brw_set_dp_write_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
 				      GLuint binding_table_index,
 				      GLuint msg_control,
 				      GLuint msg_type,
@@ -298,18 +359,33 @@ static void brw_set_dp_write_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_write.binding_table_index = binding_table_index;
-   insn->bits3.dp_write.msg_control = msg_control;
-   insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
-   insn->bits3.dp_write.msg_type = msg_type;
-   insn->bits3.dp_write.send_commit_msg = 0;
-   insn->bits3.dp_write.response_length = response_length;
-   insn->bits3.dp_write.msg_length = msg_length;
-   insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
-   insn->bits3.urb.end_of_thread = end_of_thread;
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_write_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_write_igdng.msg_control = msg_control;
+       insn->bits3.dp_write_igdng.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write_igdng.msg_type = msg_type;
+       insn->bits3.dp_write_igdng.send_commit_msg = 0;
+       insn->bits3.dp_write_igdng.header_present = 1;
+       insn->bits3.dp_write_igdng.response_length = response_length;
+       insn->bits3.dp_write_igdng.msg_length = msg_length;
+       insn->bits3.dp_write_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_write.binding_table_index = binding_table_index;
+       insn->bits3.dp_write.msg_control = msg_control;
+       insn->bits3.dp_write.pixel_scoreboard_clear = pixel_scoreboard_clear;
+       insn->bits3.dp_write.msg_type = msg_type;
+       insn->bits3.dp_write.send_commit_msg = 0;
+       insn->bits3.dp_write.response_length = response_length;
+       insn->bits3.dp_write.msg_length = msg_length;
+       insn->bits3.dp_write.msg_target = BRW_MESSAGE_TARGET_DATAPORT_WRITE;
+       insn->bits3.dp_write.end_of_thread = end_of_thread;
+   }
 }
 
-static void brw_set_dp_read_message( struct brw_instruction *insn,
+static void brw_set_dp_read_message( struct brw_context *brw,
+				      struct brw_instruction *insn,
 				      GLuint binding_table_index,
 				      GLuint msg_control,
 				      GLuint msg_type,
@@ -320,15 +396,29 @@ static void brw_set_dp_read_message( struct brw_instruction *insn,
 {
    brw_set_src1(insn, brw_imm_d(0));
 
-   insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
-   insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
-   insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
-   insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
-   insn->bits3.dp_read.response_length = response_length;  /*16:19*/
-   insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
-   insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
-   insn->bits3.dp_read.pad1 = 0;  /*28:30*/
-   insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   if (BRW_IS_IGDNG(brw)) {
+       insn->bits3.dp_read_igdng.binding_table_index = binding_table_index;
+       insn->bits3.dp_read_igdng.msg_control = msg_control;
+       insn->bits3.dp_read_igdng.msg_type = msg_type;
+       insn->bits3.dp_read_igdng.target_cache = target_cache;
+       insn->bits3.dp_read_igdng.header_present = 1;
+       insn->bits3.dp_read_igdng.response_length = response_length;
+       insn->bits3.dp_read_igdng.msg_length = msg_length;
+       insn->bits3.dp_read_igdng.pad1 = 0;
+       insn->bits3.dp_read_igdng.end_of_thread = end_of_thread;
+       insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_DATAPORT_READ;
+       insn->bits2.send_igdng.end_of_thread = end_of_thread;
+   } else {
+       insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
+       insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
+       insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
+       insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
+       insn->bits3.dp_read.response_length = response_length;  /*16:19*/
+       insn->bits3.dp_read.msg_length = msg_length;  /*20:23*/
+       insn->bits3.dp_read.msg_target = BRW_MESSAGE_TARGET_DATAPORT_READ; /*24:27*/
+       insn->bits3.dp_read.pad1 = 0;  /*28:30*/
+       insn->bits3.dp_read.end_of_thread = end_of_thread;  /*31*/
+   }
 }
 
 static void brw_set_sampler_message(struct brw_context *brw,
@@ -338,11 +428,25 @@ static void brw_set_sampler_message(struct brw_context *brw,
                                     GLuint msg_type,
                                     GLuint response_length,
                                     GLuint msg_length,
-                                    GLboolean eot)
+                                    GLboolean eot,
+                                    GLuint header_present,
+                                    GLuint simd_mode)
 {
+   assert(eot == 0);
    brw_set_src1(insn, brw_imm_d(0));
 
-   if (BRW_IS_G4X(brw)) {
+   if (BRW_IS_IGDNG(brw)) {
+      insn->bits3.sampler_igdng.binding_table_index = binding_table_index;
+      insn->bits3.sampler_igdng.sampler = sampler;
+      insn->bits3.sampler_igdng.msg_type = msg_type;
+      insn->bits3.sampler_igdng.simd_mode = simd_mode;
+      insn->bits3.sampler_igdng.header_present = header_present;
+      insn->bits3.sampler_igdng.response_length = response_length;
+      insn->bits3.sampler_igdng.msg_length = msg_length;
+      insn->bits3.sampler_igdng.end_of_thread = eot;
+      insn->bits2.send_igdng.sfid = BRW_MESSAGE_TARGET_SAMPLER;
+      insn->bits2.send_igdng.end_of_thread = eot;
+   } else if (BRW_IS_G4X(brw)) {
       insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
       insn->bits3.sampler_g4x.sampler = sampler;
       insn->bits3.sampler_g4x.msg_type = msg_type;
@@ -484,6 +588,10 @@ struct brw_instruction *brw_JMPI(struct brw_compile *p,
 {
    struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
 
+   insn->header.execution_size = 1;
+   insn->header.compression_control = BRW_COMPRESSION_NONE;
+   insn->header.mask_control = BRW_MASK_DISABLE;
+
    p->current->header.predicate_control = BRW_PREDICATE_NONE;
 
    return insn;
@@ -540,6 +648,10 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 				 struct brw_instruction *if_insn)
 {
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
 
    if (p->single_program_flow) {
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -566,7 +678,7 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
    } else {
       assert(if_insn->header.opcode == BRW_OPCODE_IF);
 
-      if_insn->bits3.if_else.jump_count = insn - if_insn;
+      if_insn->bits3.if_else.jump_count = br * (insn - if_insn);
       if_insn->bits3.if_else.pop_count = 1;
       if_insn->bits3.if_else.pad0 = 0;
    }
@@ -577,6 +689,11 @@ struct brw_instruction *brw_ELSE(struct brw_compile *p,
 void brw_ENDIF(struct brw_compile *p, 
 	       struct brw_instruction *patch_insn)
 {
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2; 
+ 
    if (p->single_program_flow) {
       /* In single program flow mode, there's no need to execute an ENDIF,
        * since we don't need to do any stack operations, and if we're executing
@@ -608,11 +725,11 @@ void brw_ENDIF(struct brw_compile *p,
 	 /* Automagically turn it into an IFF:
 	  */
 	 patch_insn->header.opcode = BRW_OPCODE_IFF;
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
 	 patch_insn->bits3.if_else.pop_count = 0;
 	 patch_insn->bits3.if_else.pad0 = 0;
       } else if (patch_insn->header.opcode == BRW_OPCODE_ELSE) {
-	 patch_insn->bits3.if_else.jump_count = insn - patch_insn + 1;
+	 patch_insn->bits3.if_else.jump_count = br * (insn - patch_insn + 1);
 	 patch_insn->bits3.if_else.pop_count = 1;
 	 patch_insn->bits3.if_else.pad0 = 0;
       } else {
@@ -686,6 +803,10 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
                                   struct brw_instruction *do_insn)
 {
    struct brw_instruction *insn;
+   GLuint br = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+      br = 2;
 
    if (p->single_program_flow)
       insn = next_insn(p, BRW_OPCODE_ADD);
@@ -706,7 +827,7 @@ struct brw_instruction *brw_WHILE(struct brw_compile *p,
       insn->header.execution_size = do_insn->header.execution_size;
 
       assert(do_insn->header.opcode == BRW_OPCODE_DO);
-      insn->bits3.if_else.jump_count = do_insn - insn + 1;
+      insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
       insn->bits3.if_else.pop_count = 0;
       insn->bits3.if_else.pad0 = 0;
    }
@@ -725,11 +846,15 @@ void brw_land_fwd_jump(struct brw_compile *p,
 		       struct brw_instruction *jmp_insn)
 {
    struct brw_instruction *landing = &p->store[p->nr_insn];
+   GLuint jmpi = 1;
+
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
 
    assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
    assert(jmp_insn->bits1.da1.src1_reg_file = BRW_IMMEDIATE_VALUE);
 
-   jmp_insn->bits3.ud = (landing - jmp_insn) - 1; 
+   jmp_insn->bits3.ud = jmpi * ((landing - jmp_insn) - 1);
 }
 
 
@@ -794,7 +919,8 @@ void brw_math( struct brw_compile *p,
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw,
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -830,7 +956,8 @@ void brw_math_16( struct brw_compile *p,
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw,
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -846,7 +973,8 @@ void brw_math_16( struct brw_compile *p,
 
    brw_set_dest(insn, offset(dest,1));
    brw_set_src0(insn, src);
-   brw_set_math_message(insn, 
+   brw_set_math_message(p->brw, 
+			insn, 
 			msg_length, response_length, 
 			function,
 			BRW_MATH_INTEGER_UNSIGNED,
@@ -893,7 +1021,8 @@ void brw_dp_WRITE_16( struct brw_compile *p,
       brw_set_dest(insn, dest);
       brw_set_src0(insn, src);
 
-      brw_set_dp_write_message(insn,
+      brw_set_dp_write_message(p->brw,
+			       insn,
 			       255, /* binding table index (255=stateless) */
 			       BRW_DATAPORT_OWORD_BLOCK_4_OWORDS, /* msg_control */
 			       BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE, /* msg_type */
@@ -938,7 +1067,8 @@ void brw_dp_READ_16( struct brw_compile *p,
       brw_set_dest(insn, dest);	/* UW? */
       brw_set_src0(insn, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW));
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+			      insn,
 			      255, /* binding table index (255=stateless) */
 			      3,  /* msg_control (3 means 4 Owords) */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -995,7 +1125,8 @@ void brw_dp_READ_4( struct brw_compile *p,
       brw_set_dest(insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+			      insn,
 			      bind_table_index,
 			      0,  /* msg_control (0 means 1 Oword) */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -1066,7 +1197,8 @@ void brw_dp_READ_4_vs(struct brw_compile *p,
       brw_set_dest(insn, dest);
       brw_set_src0(insn, brw_null_reg());
 
-      brw_set_dp_read_message(insn,
+      brw_set_dp_read_message(p->brw,
+			      insn,
 			      bind_table_index,
 			      oword,  /* 0 = lower Oword, 1 = upper Oword */
 			      BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
@@ -1096,7 +1228,8 @@ void brw_fb_WRITE(struct brw_compile *p,
   
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
-   brw_set_dp_write_message(insn,
+   brw_set_dp_write_message(p->brw,
+			    insn,
 			    binding_table_index,
 			    BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE, /* msg_control */
 			    BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE, /* msg_type */
@@ -1122,7 +1255,9 @@ void brw_SAMPLE(struct brw_compile *p,
 		GLuint msg_type,
 		GLuint response_length,
 		GLuint msg_length,
-		GLboolean eot)
+		GLboolean eot,
+		GLuint header_present,
+		GLuint simd_mode)
 {
    GLboolean need_stall = 0;
    
@@ -1197,7 +1332,9 @@ void brw_SAMPLE(struct brw_compile *p,
 			      msg_type,
 			      response_length, 
 			      msg_length,
-			      eot);
+			      eot,
+			      header_present,
+			      simd_mode);
    }
 
    if (need_stall) {
@@ -1232,7 +1369,7 @@ void brw_urb_WRITE(struct brw_compile *p,
 {
    struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
 
-   assert(msg_length < 16);
+   assert(msg_length < BRW_MAX_MRF);
 
    brw_set_dest(insn, dest);
    brw_set_src0(insn, src0);
@@ -1240,7 +1377,8 @@ void brw_urb_WRITE(struct brw_compile *p,
 
    insn->header.destreg__conditonalmod = msg_reg_nr;
 
-   brw_set_urb_message(insn,
+   brw_set_urb_message(p->brw,
+		       insn,
 		       allocate,
 		       used,
 		       msg_length,
@@ -1251,3 +1389,37 @@ void brw_urb_WRITE(struct brw_compile *p,
 		       swizzle);
 }
 
+void brw_ff_sync(struct brw_compile *p,
+		   struct brw_reg dest,
+		   GLuint msg_reg_nr,
+		   struct brw_reg src0,
+		   GLboolean allocate,
+		   GLboolean used,
+		   GLuint msg_length,
+		   GLuint response_length,
+		   GLboolean eot,
+		   GLboolean writes_complete,
+		   GLuint offset,
+		   GLuint swizzle)
+{
+   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
+
+   assert(msg_length < 16);
+
+   brw_set_dest(insn, dest);
+   brw_set_src0(insn, src0);
+   brw_set_src1(insn, brw_imm_d(0));
+
+   insn->header.destreg__conditonalmod = msg_reg_nr;
+
+   brw_set_ff_sync_message(p->brw,
+		       insn,
+		       allocate,
+		       used,
+		       msg_length,
+		       response_length, 
+		       eot, 
+		       writes_complete, 
+		       offset,
+		       swizzle);
+}
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index 299357409ce..d27c6c24ca5 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -37,6 +37,9 @@
 #include "tnl/tnl.h"
 #include "brw_context.h"
 #include "brw_fallback.h"
+#include "intel_chipset.h"
+#include "intel_fbo.h"
+#include "intel_regions.h"
 
 #include "glapi/glapi.h"
 
@@ -44,6 +47,7 @@
 
 static GLboolean do_check_fallback(struct brw_context *brw)
 {
+   struct intel_context *intel = &brw->intel;
    GLcontext *ctx = &brw->intel.ctx;
    GLuint i;
 
@@ -81,6 +85,33 @@ static GLboolean do_check_fallback(struct brw_context *brw)
       return GL_TRUE;
    }
 
+   /* _NEW_BUFFERS */
+   if (IS_965(intel->intelScreen->deviceID) &&
+       !IS_G4X(intel->intelScreen->deviceID)) {
+      for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
+	 struct gl_renderbuffer *rb = ctx->DrawBuffer->_ColorDrawBuffers[i];
+	 struct intel_renderbuffer *irb = intel_renderbuffer(rb);
+
+	 /* The original gen4 hardware couldn't set up WM surfaces pointing
+	  * at an offset within a tile, which can happen when rendering to
+	  * anything but the base level of a texture or the +X face/0 depth.
+	  * This was fixed with the 4 Series hardware.
+	  *
+	  * For these original chips, you would have to make the depth and
+	  * color destination surfaces include information on the texture
+	  * type, LOD, face, and various limits to use them as a destination.
+	  * I would have done this, but there's also a nasty requirement that
+	  * the depth and the color surfaces all be of the same LOD, which
+	  * may be a worse requirement than this alignment.  (Also, we may
+	  * want to just demote the texture to untiled, instead).
+	  */
+	 if (irb->region && irb->region->tiling != I915_TILING_NONE &&
+	     (irb->region->draw_offset & 4095)) {
+	    DBG("FALLBACK: non-tile-aligned destination for tiled FBO\n");
+	    return GL_TRUE;
+	 }
+      }
+   }
 
    return GL_FALSE;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_gs.c b/src/mesa/drivers/dri/i965/brw_gs.c
index a8b74a0afe6..48c2b9a41ce 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.c
+++ b/src/mesa/drivers/dri/i965/brw_gs.c
@@ -54,12 +54,17 @@ static void compile_gs_prog( struct brw_context *brw,
    memset(&c, 0, sizeof(c));
    
    c.key = *key;
-
+   c.need_ff_sync = BRW_IS_IGDNG(brw);
    /* Need to locate the two positions present in vertex + header.
     * These are currently hardcoded:
     */
    c.nr_attrs = brw_count_bits(c.key.attrs);
-   c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
+   if (BRW_IS_IGDNG(brw))
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 3;  /* are vertices packed, or reg-aligned? */
+   else
+       c.nr_regs = (c.nr_attrs + 1) / 2 + 1;  /* are vertices packed, or reg-aligned? */
+
    c.nr_bytes = c.nr_regs * REG_SIZE;
 
    
diff --git a/src/mesa/drivers/dri/i965/brw_gs.h b/src/mesa/drivers/dri/i965/brw_gs.h
index 18a4537c323..bbb991ea2e5 100644
--- a/src/mesa/drivers/dri/i965/brw_gs.h
+++ b/src/mesa/drivers/dri/i965/brw_gs.h
@@ -62,6 +62,7 @@ struct brw_gs_compile {
    GLuint nr_attrs;
    GLuint nr_regs;
    GLuint nr_bytes;
+   GLboolean need_ff_sync;
 };
 
 #define ATTR_SIZE  (4*4)
diff --git a/src/mesa/drivers/dri/i965/brw_gs_emit.c b/src/mesa/drivers/dri/i965/brw_gs_emit.c
index 22e0d25c2e7..980eac7646d 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_emit.c
@@ -101,6 +101,23 @@ static void brw_gs_emit_vue(struct brw_gs_compile *c,
 		 BRW_URB_SWIZZLE_NONE);
 }
 
+void brw_gs_ff_sync(struct brw_gs_compile *c, int num_prim)
+{
+	struct brw_compile *p = &c->func;
+	brw_MOV(p, get_element_ud(c->reg.R0, 1), brw_imm_ud(num_prim));
+	brw_ff_sync(p, 
+				c->reg.R0,
+				0,
+				c->reg.R0,
+				1,	
+				1,		/* used */
+				1,  	/* msg length */
+				1,		/* response length */
+				0,		/* eot */
+				1,		/* write compelete */
+				0,		/* urb offset */
+				BRW_URB_SWIZZLE_NONE);
+}
 
 
 void brw_gs_quads( struct brw_gs_compile *c )
@@ -110,6 +127,8 @@ void brw_gs_quads( struct brw_gs_compile *c )
    /* Use polygons for correct edgeflag behaviour. Note that vertex 3
     * is the PV for quads, but vertex 0 for polygons:
     */
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);    
    brw_gs_emit_vue(c, c->reg.vertex[3], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_POLYGON << 2)); 
@@ -120,6 +139,8 @@ void brw_gs_quad_strip( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 4);
    
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[2], 0, ((_3DPRIM_POLYGON << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[3], 0, (_3DPRIM_POLYGON << 2));
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, (_3DPRIM_POLYGON << 2)); 
@@ -129,6 +150,9 @@ void brw_gs_quad_strip( struct brw_gs_compile *c )
 void brw_gs_tris( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 3);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_TRILIST << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 0, (_3DPRIM_TRILIST << 2));
    brw_gs_emit_vue(c, c->reg.vertex[2], 1, ((_3DPRIM_TRILIST << 2) | R02_PRIM_END));
@@ -137,6 +161,9 @@ void brw_gs_tris( struct brw_gs_compile *c )
 void brw_gs_lines( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 2);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 0, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_START));
    brw_gs_emit_vue(c, c->reg.vertex[1], 1, ((_3DPRIM_LINESTRIP << 2) | R02_PRIM_END));
 }
@@ -144,6 +171,9 @@ void brw_gs_lines( struct brw_gs_compile *c )
 void brw_gs_points( struct brw_gs_compile *c )
 {
    brw_gs_alloc_regs(c, 1);
+
+   if (c->need_ff_sync)
+	   brw_gs_ff_sync(c, 1);      
    brw_gs_emit_vue(c, c->reg.vertex[0], 1, ((_3DPRIM_POINTLIST << 2) | R02_PRIM_START | R02_PRIM_END));
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_gs_state.c b/src/mesa/drivers/dri/i965/brw_gs_state.c
index 27023cf0344..a761c03153c 100644
--- a/src/mesa/drivers/dri/i965/brw_gs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_gs_state.c
@@ -95,6 +95,9 @@ gs_unit_create_from_key(struct brw_context *brw, struct brw_gs_unit_key *key)
 
    gs.thread4.max_threads = 0; /* Hardware requirement */
 
+   if (BRW_IS_IGDNG(brw))
+      gs.thread4.rendering_enable = 1;
+
    if (INTEL_DEBUG & DEBUG_STATS)
       gs.thread4.stats_enable = 1;
 
diff --git a/src/mesa/drivers/dri/i965/brw_misc_state.c b/src/mesa/drivers/dri/i965/brw_misc_state.c
index 9bc5c35139c..85a7706404f 100644
--- a/src/mesa/drivers/dri/i965/brw_misc_state.c
+++ b/src/mesa/drivers/dri/i965/brw_misc_state.c
@@ -118,7 +118,10 @@ static void upload_binding_table_pointers(struct brw_context *brw)
 
    BEGIN_BATCH(6, IGNORE_CLIPRECTS);
    OUT_BATCH(CMD_BINDING_TABLE_PTRS << 16 | (6 - 2));
-   OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+   if (brw->vs.bind_bo != NULL)
+      OUT_RELOC(brw->vs.bind_bo, I915_GEM_DOMAIN_SAMPLER, 0, 0); /* vs */
+   else
+      OUT_BATCH(0);
    OUT_BATCH(0); /* gs */
    OUT_BATCH(0); /* clip */
    OUT_BATCH(0); /* sf */
@@ -208,7 +211,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 {
    struct intel_context *intel = &brw->intel;
    struct intel_region *region = brw->state.depth_region;
-   unsigned int len = BRW_IS_G4X(brw) ? 6 : 5;
+   unsigned int len = (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw)) ? 6 : 5;
 
    if (region == NULL) {
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
@@ -219,7 +222,7 @@ static void emit_depthbuffer(struct brw_context *brw)
       OUT_BATCH(0);
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -241,6 +244,8 @@ static void emit_depthbuffer(struct brw_context *brw)
 	 return;
       }
 
+      assert(region->tiling != I915_TILING_X);
+
       BEGIN_BATCH(len, IGNORE_CLIPRECTS);
       OUT_BATCH(CMD_DEPTH_BUFFER << 16 | (len - 2));
       OUT_BATCH(((region->pitch * region->cpp) - 1) |
@@ -256,7 +261,7 @@ static void emit_depthbuffer(struct brw_context *brw)
 		((region->height - 1) << 19));
       OUT_BATCH(0);
 
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_G4X(brw) || BRW_IS_IGDNG(brw))
          OUT_BATCH(0);
 
       ADVANCE_BATCH();
@@ -369,7 +374,7 @@ static void upload_aa_line_parameters(struct brw_context *brw)
 {
    struct brw_aa_line_parameters balp;
    
-   if (!BRW_IS_G4X(brw))
+   if (BRW_IS_965(brw))
       return;
 
    /* use legacy aa line coverage computation */
@@ -506,14 +511,27 @@ static void upload_state_base_address( struct brw_context *brw )
    /* Output the structure (brw_state_base_address) directly to the
     * batchbuffer, so we can emit relocations inline.
     */
-   BEGIN_BATCH(6, IGNORE_CLIPRECTS);
-   OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
-   OUT_BATCH(1); /* General state base address */
-   OUT_BATCH(1); /* Surface state base address */
-   OUT_BATCH(1); /* Indirect object base address */
-   OUT_BATCH(1); /* General state upper bound */
-   OUT_BATCH(1); /* Indirect object upper bound */
-   ADVANCE_BATCH();
+   if (BRW_IS_IGDNG(brw)) {
+       BEGIN_BATCH(8, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (8 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* Instruction base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       OUT_BATCH(1); /* Instruction access upper bound */
+       ADVANCE_BATCH();
+   } else {
+       BEGIN_BATCH(6, IGNORE_CLIPRECTS);
+       OUT_BATCH(CMD_STATE_BASE_ADDRESS << 16 | (6 - 2));
+       OUT_BATCH(1); /* General state base address */
+       OUT_BATCH(1); /* Surface state base address */
+       OUT_BATCH(1); /* Indirect object base address */
+       OUT_BATCH(1); /* General state upper bound */
+       OUT_BATCH(1); /* Indirect object upper bound */
+       ADVANCE_BATCH();
+   }
 }
 
 const struct brw_tracked_state brw_state_base_address = {
diff --git a/src/mesa/drivers/dri/i965/brw_sf.c b/src/mesa/drivers/dri/i965/brw_sf.c
index c3c85978f4d..e1c2c7777b5 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.c
+++ b/src/mesa/drivers/dri/i965/brw_sf.c
@@ -166,6 +166,9 @@ static void upload_sf_prog(struct brw_context *brw)
    key.do_flat_shading = (ctx->Light.ShadeModel == GL_FLAT);
    key.do_twoside_color = (ctx->Light.Enabled && ctx->Light.Model.TwoSide);
 
+   /* _NEW_HINT */
+   key.linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+
    /* _NEW_POLYGON */
    if (key.do_twoside_color) {
       /* If we're rendering to a FBO, we have to invert the polygon
@@ -188,7 +191,7 @@ static void upload_sf_prog(struct brw_context *brw)
 
 const struct brw_tracked_state brw_sf_prog = {
    .dirty = {
-      .mesa  = (_NEW_LIGHT|_NEW_POLYGON|_NEW_POINT),
+      .mesa  = (_NEW_HINT | _NEW_LIGHT | _NEW_POLYGON | _NEW_POINT),
       .brw   = (BRW_NEW_REDUCED_PRIMITIVE),
       .cache = CACHE_NEW_VS_PROG
    },
diff --git a/src/mesa/drivers/dri/i965/brw_sf.h b/src/mesa/drivers/dri/i965/brw_sf.h
index 1c0fb70fe06..6426b6df9ff 100644
--- a/src/mesa/drivers/dri/i965/brw_sf.h
+++ b/src/mesa/drivers/dri/i965/brw_sf.h
@@ -51,7 +51,8 @@ struct brw_sf_prog_key {
    GLuint do_flat_shading:1;
    GLuint frontface_ccw:1;
    GLuint do_point_sprite:1;
-   GLuint pad:10;
+   GLuint linear_color:1;  /**< linear interp vs. perspective interp */
+   GLuint pad:25;
    GLenum SpriteOrigin;
 };
 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_emit.c b/src/mesa/drivers/dri/i965/brw_sf_emit.c
index 862835f157a..ca8f97f9f9e 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_emit.c
@@ -151,6 +151,8 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
+   GLuint jmpi = 1;
+
    if (!nr)
       return;
 
@@ -159,18 +161,21 @@ static void do_flatshade_triangle( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
    brw_push_insn_state(p);
    
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr*2+1));
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr*2+1)));
    brw_JMPI(p, ip, ip, c->pv);
 
    copy_colors(c, c->vert[1], c->vert[0]);
    copy_colors(c, c->vert[2], c->vert[0]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*4+1));
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*(nr*4+1)));
 
    copy_colors(c, c->vert[0], c->vert[1]);
    copy_colors(c, c->vert[2], c->vert[1]);
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr*2));
+   brw_JMPI(p, ip, ip, brw_imm_d(jmpi*nr*2));
 
    copy_colors(c, c->vert[0], c->vert[2]);
    copy_colors(c, c->vert[1], c->vert[2]);
@@ -184,7 +189,8 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    struct brw_compile *p = &c->func;
    struct brw_reg ip = brw_ip_reg();
    GLuint nr = brw_count_bits(c->key.attrs & VERT_RESULT_COLOR_BITS);
-   
+   GLuint jmpi = 1;
+
    if (!nr)
       return;
 
@@ -193,13 +199,16 @@ static void do_flatshade_line( struct brw_sf_compile *c )
    if (c->key.primitive == SF_UNFILLED_TRIS)
       return;
 
+   if (BRW_IS_IGDNG(p->brw))
+       jmpi = 2;
+
    brw_push_insn_state(p);
    
-   brw_MUL(p, c->pv, c->pv, brw_imm_ud(nr+1));
+   brw_MUL(p, c->pv, c->pv, brw_imm_d(jmpi*(nr+1)));
    brw_JMPI(p, ip, ip, c->pv);
    copy_colors(c, c->vert[1], c->vert[0]);
 
-   brw_JMPI(p, ip, ip, brw_imm_ud(nr));
+   brw_JMPI(p, ip, ip, brw_imm_ud(jmpi*nr));
    copy_colors(c, c->vert[0], c->vert[1]);
 
    brw_pop_insn_state(p);
@@ -218,7 +227,7 @@ static void alloc_regs( struct brw_sf_compile *c )
 
    /* Values computed by fixed function unit:
     */
-   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_UD);
+   c->pv  = retype(brw_vec1_grf(1, 1), BRW_REGISTER_TYPE_D);
    c->det = brw_vec1_grf(1, 2);
    c->dx0 = brw_vec1_grf(1, 3);
    c->dx2 = brw_vec1_grf(1, 4);
@@ -295,9 +304,6 @@ static void invert_det( struct brw_sf_compile *c)
 
 }
 
-#define NON_PERPECTIVE_ATTRS  (FRAG_BIT_WPOS | \
-                               FRAG_BIT_COL0 | \
-			       FRAG_BIT_COL1)
 
 static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLuint reg,
@@ -306,9 +312,16 @@ static GLboolean calculate_masks( struct brw_sf_compile *c,
 				  GLushort *pc_linear)
 {
    GLboolean is_last_attr = (reg == c->nr_setup_regs - 1);
-   GLuint persp_mask = c->key.attrs & ~NON_PERPECTIVE_ATTRS;
+   GLuint persp_mask;
    GLuint linear_mask;
 
+   if (c->key.do_flat_shading || c->key.linear_color)
+      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS |
+                                    FRAG_BIT_COL0 |
+                                    FRAG_BIT_COL1);
+   else
+      persp_mask = c->key.attrs & ~(FRAG_BIT_WPOS);
+
    if (c->key.do_flat_shading)
       linear_mask = c->key.attrs & ~(FRAG_BIT_COL0|FRAG_BIT_COL1);
    else
@@ -674,7 +687,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 					       (1<<_3DPRIM_POLYGON) |
 					       (1<<_3DPRIM_RECTLIST) |
 					       (1<<_3DPRIM_TRIFAN_NOSTIPPLE)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
@@ -695,7 +708,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 					       (1<<_3DPRIM_LINESTRIP_CONT) |
 					       (1<<_3DPRIM_LINESTRIP_BF) |
 					       (1<<_3DPRIM_LINESTRIP_CONT_BF)));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
@@ -708,7 +721,7 @@ void brw_emit_anyprim_setup( struct brw_sf_compile *c )
 
    brw_set_conditionalmod(p, BRW_CONDITIONAL_Z);
    brw_AND(p, v1_null_ud, payload_attr, brw_imm_ud(1<<BRW_SPRITE_POINT_ENABLE));
-   jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+   jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
    {
       saveflag = p->flag_value;
       brw_push_insn_state(p); 
diff --git a/src/mesa/drivers/dri/i965/brw_sf_state.c b/src/mesa/drivers/dri/i965/brw_sf_state.c
index c99918724b3..b5f6371c82c 100644
--- a/src/mesa/drivers/dri/i965/brw_sf_state.c
+++ b/src/mesa/drivers/dri/i965/brw_sf_state.c
@@ -162,7 +162,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
 {
    struct brw_sf_unit_state sf;
    dri_bo *bo;
-
+   int chipset_max_threads;
    memset(&sf, 0, sizeof(sf));
 
    sf.thread0.grf_reg_count = ALIGN(key->total_grf, 16) / 16 - 1;
@@ -171,13 +171,26 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    sf.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
 
    sf.thread3.dispatch_grf_start_reg = 3;
-   sf.thread3.urb_entry_read_offset = 1;
+
+   if (BRW_IS_IGDNG(brw))
+       sf.thread3.urb_entry_read_offset = 3;
+   else
+       sf.thread3.urb_entry_read_offset = 1;
+
    sf.thread3.urb_entry_read_length = key->urb_entry_read_length;
 
    sf.thread4.nr_urb_entries = key->nr_urb_entries;
    sf.thread4.urb_entry_allocation_size = key->sfsize - 1;
-   /* Each SF thread produces 1 PUE, and there can be up to 24 threads */
-   sf.thread4.max_threads = MIN2(24, key->nr_urb_entries) - 1;
+
+   /* Each SF thread produces 1 PUE, and there can be up to 24(Pre-IGDNG) or 
+    * 48(IGDNG) threads 
+    */
+   if (BRW_IS_IGDNG(brw))
+      chipset_max_threads = 48;
+   else
+      chipset_max_threads = 24;
+
+   sf.thread4.max_threads = MIN2(chipset_max_threads, key->nr_urb_entries) - 1;
 
    if (INTEL_DEBUG & DEBUG_SINGLE_THREAD)
       sf.thread4.max_threads = 0;
@@ -233,7 +246,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    else if (sf.sf6.line_width <= 0x2)
        sf.sf6.line_width = 0;
 
-   /* _NEW_POINT */
+   /* _NEW_BUFFERS */
    key->render_to_fbo = brw->intel.ctx.DrawBuffer->Name != 0;
    if (!key->render_to_fbo) {
       /* Rendering to an OpenGL window */
@@ -263,6 +276,7 @@ sf_unit_create_from_key(struct brw_context *brw, struct brw_sf_unit_key *key,
    }
    /* XXX clamp max depends on AA vs. non-AA */
 
+   /* _NEW_POINT */
    sf.sf7.sprite_point = key->point_sprite;
    sf.sf7.point_size = CLAMP(rint(key->point_size), 1, 255) * (1<<3);
    sf.sf7.use_point_size_state = !key->point_attenuated;
@@ -328,7 +342,8 @@ const struct brw_tracked_state brw_sf_unit = {
       .mesa  = (_NEW_POLYGON | 
 		_NEW_LINE | 
 		_NEW_POINT | 
-		_NEW_SCISSOR),
+		_NEW_SCISSOR |
+		_NEW_BUFFERS),
       .brw   = BRW_NEW_URB_FENCE,
       .cache = (CACHE_NEW_SF_VP |
 		CACHE_NEW_SF_PROG)
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 81b0a45998f..bf9f6cae55e 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -72,11 +72,13 @@ const struct brw_tracked_state brw_sf_vp;
 const struct brw_tracked_state brw_state_base_address;
 const struct brw_tracked_state brw_urb_fence;
 const struct brw_tracked_state brw_vertex_state;
+const struct brw_tracked_state brw_vs_surfaces;
 const struct brw_tracked_state brw_vs_prog;
 const struct brw_tracked_state brw_vs_unit;
 const struct brw_tracked_state brw_wm_input_sizes;
 const struct brw_tracked_state brw_wm_prog;
 const struct brw_tracked_state brw_wm_samplers;
+const struct brw_tracked_state brw_wm_constant_surface;
 const struct brw_tracked_state brw_wm_surfaces;
 const struct brw_tracked_state brw_wm_unit;
 
@@ -91,6 +93,20 @@ const struct brw_tracked_state brw_drawing_rect;
 const struct brw_tracked_state brw_indices;
 const struct brw_tracked_state brw_vertices;
 
+/**
+ * Use same key for WM and VS surfaces.
+ */
+struct brw_surface_key {
+   GLenum target, depthmode;
+   dri_bo *bo;
+   GLint format, internal_format;
+   GLint first_level, last_level;
+   GLint width, height, depth;
+   GLint pitch, cpp;
+   uint32_t tiling;
+   GLuint offset;
+};
+
 /***********************************************************************
  * brw_state.c
  */
@@ -135,8 +151,8 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
 			  void *aux_return);
 void brw_state_cache_check_size( struct brw_context *brw );
 
-void brw_init_cache( struct brw_context *brw );
-void brw_destroy_cache( struct brw_context *brw );
+void brw_init_caches( struct brw_context *brw );
+void brw_destroy_caches( struct brw_context *brw );
 
 /***********************************************************************
  * brw_state_batch.c
@@ -150,4 +166,9 @@ GLboolean brw_cached_batch_struct( struct brw_context *brw,
 void brw_destroy_batch_cache( struct brw_context *brw );
 void brw_clear_batch_cache_flush( struct brw_context *brw );
 
+/* brw_wm_surface_state.c */
+dri_bo *
+brw_create_constant_surface( struct brw_context *brw,
+                             struct brw_surface_key *key );
+
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index d5b51664066..e40d7a04164 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -56,9 +56,9 @@
  * incorrect program is run for the other instance.
  */
 
+#include "main/imports.h"
 #include "brw_state.h"
 #include "intel_batchbuffer.h"
-#include "main/imports.h"
 
 /* XXX: Fixme - have to include these to get the sizes of the prog_key
  * structs:
@@ -69,8 +69,10 @@
 #include "brw_sf.h"
 #include "brw_gs.h"
 
-static GLuint hash_key( const void *key, GLuint key_size,
-			dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
+
+static GLuint
+hash_key(const void *key, GLuint key_size,
+         dri_bo **reloc_bufs, GLuint nr_reloc_bufs)
 {
    GLuint *ikey = (GLuint *)key;
    GLuint hash = 0, i;
@@ -95,6 +97,7 @@ static GLuint hash_key( const void *key, GLuint key_size,
    return hash;
 }
 
+
 /**
  * Marks a new buffer as being chosen for the given cache id.
  */
@@ -111,6 +114,7 @@ update_cache_last(struct brw_cache *cache, enum brw_cache_id cache_id,
    cache->brw->state.dirty.cache |= 1 << cache_id;
 }
 
+
 static struct brw_cache_item *
 search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 	     GLuint hash, const void *key, GLuint key_size,
@@ -143,7 +147,8 @@ search_cache(struct brw_cache *cache, enum brw_cache_id cache_id,
 }
 
 
-static void rehash( struct brw_cache *cache )
+static void
+rehash(struct brw_cache *cache)
 {
    struct brw_cache_item **items;
    struct brw_cache_item *c, *next;
@@ -164,15 +169,17 @@ static void rehash( struct brw_cache *cache )
    cache->size = size;
 }
 
+
 /**
  * Returns the buffer object matching cache_id and key, or NULL.
  */
-dri_bo *brw_search_cache( struct brw_cache *cache,
-			  enum brw_cache_id cache_id,
-			  const void *key,
-			  GLuint key_size,
-			  dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
-			  void *aux_return )
+dri_bo *
+brw_search_cache(struct brw_cache *cache,
+                 enum brw_cache_id cache_id,
+                 const void *key,
+                 GLuint key_size,
+                 dri_bo **reloc_bufs, GLuint nr_reloc_bufs,
+                 void *aux_return)
 {
    struct brw_cache_item *item;
    GLuint hash = hash_key(key, key_size, reloc_bufs, nr_reloc_bufs);
@@ -192,6 +199,7 @@ dri_bo *brw_search_cache( struct brw_cache *cache,
    return item->bo;
 }
 
+
 dri_bo *
 brw_upload_cache( struct brw_cache *cache,
 		  enum brw_cache_id cache_id,
@@ -265,7 +273,9 @@ brw_upload_cache( struct brw_cache *cache,
    return bo;
 }
 
-/* This doesn't really work with aux data.  Use search/upload instead
+
+/**
+ * This doesn't really work with aux data.  Use search/upload instead
  */
 dri_bo *
 brw_cache_data_sz(struct brw_cache *cache,
@@ -296,6 +306,7 @@ brw_cache_data_sz(struct brw_cache *cache,
    return bo;
 }
 
+
 /**
  * Wrapper around brw_cache_data_sz using the cache_id's canonical key size.
  *
@@ -319,21 +330,22 @@ enum pool_type {
    DW_GENERAL_STATE
 };
 
+
 static void
-brw_init_cache_id( struct brw_context *brw,
-		const char *name,
-		enum brw_cache_id id,
-		GLuint key_size,
-		GLuint aux_size)
+brw_init_cache_id(struct brw_cache *cache,
+                  const char *name,
+                  enum brw_cache_id id,
+                  GLuint key_size,
+                  GLuint aux_size)
 {
-   struct brw_cache *cache = &brw->cache;
-
    cache->name[id] = strdup(name);
    cache->key_size[id] = key_size;
    cache->aux_size[id] = aux_size;
 }
 
-void brw_init_cache( struct brw_context *brw )
+
+static void
+brw_init_non_surface_cache(struct brw_context *brw)
 {
    struct brw_cache *cache = &brw->cache;
 
@@ -342,114 +354,136 @@ void brw_init_cache( struct brw_context *brw )
    cache->size = 7;
    cache->n_items = 0;
    cache->items = (struct brw_cache_item **)
-      _mesa_calloc(cache->size * 
-		   sizeof(struct brw_cache_item));
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_VP",
 		     BRW_CC_VP,
 		     sizeof(struct brw_cc_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CC_UNIT",
 		     BRW_CC_UNIT,
 		     sizeof(struct brw_cc_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_PROG",
 		     BRW_WM_PROG,
 		     sizeof(struct brw_wm_prog_key),
 		     sizeof(struct brw_wm_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER_DEFAULT_COLOR",
 		     BRW_SAMPLER_DEFAULT_COLOR,
 		     sizeof(struct brw_sampler_default_color),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SAMPLER",
 		     BRW_SAMPLER,
 		     0,		/* variable key/data size */
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "WM_UNIT",
 		     BRW_WM_UNIT,
 		     sizeof(struct brw_wm_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_PROG",
 		     BRW_SF_PROG,
 		     sizeof(struct brw_sf_prog_key),
 		     sizeof(struct brw_sf_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_VP",
 		     BRW_SF_VP,
 		     sizeof(struct brw_sf_viewport),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SF_UNIT",
 		     BRW_SF_UNIT,
 		     sizeof(struct brw_sf_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_UNIT",
 		     BRW_VS_UNIT,
 		     sizeof(struct brw_vs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "VS_PROG",
 		     BRW_VS_PROG,
 		     sizeof(struct brw_vs_prog_key),
 		     sizeof(struct brw_vs_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_UNIT",
 		     BRW_CLIP_UNIT,
 		     sizeof(struct brw_clip_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "CLIP_PROG",
 		     BRW_CLIP_PROG,
 		     sizeof(struct brw_clip_prog_key),
 		     sizeof(struct brw_clip_prog_data));
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_UNIT",
 		     BRW_GS_UNIT,
 		     sizeof(struct brw_gs_unit_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "GS_PROG",
 		     BRW_GS_PROG,
 		     sizeof(struct brw_gs_prog_key),
 		     sizeof(struct brw_gs_prog_data));
+}
+
+
+static void
+brw_init_surface_cache(struct brw_context *brw)
+{
+   struct brw_cache *cache = &brw->surface_cache;
+
+   cache->brw = brw;
 
-   brw_init_cache_id(brw,
+   cache->size = 7;
+   cache->n_items = 0;
+   cache->items = (struct brw_cache_item **)
+      _mesa_calloc(cache->size * sizeof(struct brw_cache_item));
+
+   brw_init_cache_id(cache,
 		     "SS_SURFACE",
 		     BRW_SS_SURFACE,
 		     sizeof(struct brw_surface_state),
 		     0);
 
-   brw_init_cache_id(brw,
+   brw_init_cache_id(cache,
 		     "SS_SURF_BIND",
 		     BRW_SS_SURF_BIND,
 		     0,
 		     0);
 }
 
+
+void
+brw_init_caches(struct brw_context *brw)
+{
+   brw_init_non_surface_cache(brw);
+   brw_init_surface_cache(brw);
+}
+
+
 static void
-brw_clear_cache( struct brw_context *brw )
+brw_clear_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    struct brw_cache_item *c, *next;
    GLuint i;
@@ -457,8 +491,8 @@ brw_clear_cache( struct brw_context *brw )
    if (INTEL_DEBUG & DEBUG_STATE)
       _mesa_printf("%s\n", __FUNCTION__);
 
-   for (i = 0; i < brw->cache.size; i++) {
-      for (c = brw->cache.items[i]; c; c = next) {
+   for (i = 0; i < cache->size; i++) {
+      for (c = cache->items[i]; c; c = next) {
 	 int j;
 
 	 next = c->next;
@@ -468,10 +502,10 @@ brw_clear_cache( struct brw_context *brw )
 	 free((void *)c->key);
 	 free(c);
       }
-      brw->cache.items[i] = NULL;
+      cache->items[i] = NULL;
    }
 
-   brw->cache.n_items = 0;
+   cache->n_items = 0;
 
    if (brw->curbe.last_buf) {
       _mesa_free(brw->curbe.last_buf);
@@ -483,25 +517,46 @@ brw_clear_cache( struct brw_context *brw )
    brw->state.dirty.cache |= ~0;
 }
 
-void brw_state_cache_check_size( struct brw_context *brw )
+
+void
+brw_state_cache_check_size(struct brw_context *brw)
 {
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s (n_items=%d)\n", __FUNCTION__, brw->cache.n_items);
+
    /* un-tuned guess.  We've got around 20 state objects for a total of around
     * 32k, so 1000 of them is around 1.5MB.
     */
    if (brw->cache.n_items > 1000)
-      brw_clear_cache(brw);
+      brw_clear_cache(brw, &brw->cache);
+
+   if (brw->surface_cache.n_items > 1000)
+      brw_clear_cache(brw, &brw->surface_cache);
 }
 
-void brw_destroy_cache( struct brw_context *brw )
+
+static void
+brw_destroy_cache(struct brw_context *brw, struct brw_cache *cache)
 {
    GLuint i;
 
-   brw_clear_cache(brw);
+   if (INTEL_DEBUG & DEBUG_STATE)
+      _mesa_printf("%s\n", __FUNCTION__);
+
+   brw_clear_cache(brw, cache);
    for (i = 0; i < BRW_MAX_CACHE; i++) {
-      dri_bo_unreference(brw->cache.last_bo[i]);
-      free(brw->cache.name[i]);
+      dri_bo_unreference(cache->last_bo[i]);
+      free(cache->name[i]);
    }
-   free(brw->cache.items);
-   brw->cache.items = NULL;
-   brw->cache.size = 0;
+   free(cache->items);
+   cache->items = NULL;
+   cache->size = 0;
+}
+
+
+void
+brw_destroy_caches(struct brw_context *brw)
+{
+   brw_destroy_cache(brw, &brw->cache);
+   brw_destroy_cache(brw, &brw->surface_cache);
 }
diff --git a/src/mesa/drivers/dri/i965/brw_state_dump.c b/src/mesa/drivers/dri/i965/brw_state_dump.c
index a7132622696..e94fa7d2b4c 100644
--- a/src/mesa/drivers/dri/i965/brw_state_dump.c
+++ b/src/mesa/drivers/dri/i965/brw_state_dump.c
@@ -126,6 +126,8 @@ static void dump_wm_surface_state(struct brw_context *brw)
 		surf->ss3.pitch + 1, surf->ss3.tiled_surface ? "" : "not ");
       state_out(name, surf, surfoff, 4, "mip base %d\n",
 		surf->ss4.min_lod);
+      state_out(name, surf, surfoff, 5, "x,y offset: %d,%d\n",
+		surf->ss5.x_offset, surf->ss5.y_offset);
 
       dri_bo_unmap(surf_bo);
    }
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 5de1450e612..38d9dd8991e 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -59,11 +59,12 @@ const struct brw_tracked_state *atoms[] =
    &brw_curbe_offsets,
    &brw_recalculate_urb_fence,
 
-
    &brw_cc_vp,
    &brw_cc_unit,
 
-   &brw_wm_surfaces,		/* must do before samplers */
+   &brw_vs_surfaces,		/* must do before unit */
+   &brw_wm_constant_surface,	/* must do before wm surfaces/bind bo */
+   &brw_wm_surfaces,		/* must do before samplers and unit */
    &brw_wm_samplers,
 
    &brw_wm_unit,
@@ -88,54 +89,26 @@ const struct brw_tracked_state *atoms[] =
 
    &brw_line_stipple,
    &brw_aa_line_parameters,
-   /* Ordering of the commands below is documented as fixed.  
-    */
-#if 0
-   &brw_pipelined_state_pointers,
-   &brw_urb_fence,
-   &brw_constant_buffer_state,
-#else
+
    &brw_psp_urb_cbs,
-#endif
 
    &brw_drawing_rect,
    &brw_indices,
    &brw_vertices,
 
-   NULL,			/* brw_constant_buffer */
+   &brw_constant_buffer
 };
 
 
 void brw_init_state( struct brw_context *brw )
 {
-   GLuint i;
-
-   brw_init_cache(brw);
-
-   brw->state.atoms = _mesa_malloc(sizeof(atoms));
-   brw->state.nr_atoms = sizeof(atoms)/sizeof(*atoms);
-   _mesa_memcpy(brw->state.atoms, atoms, sizeof(atoms));
-
-   /* Patch in a pointer to the dynamic state atom:
-    */
-   for (i = 0; i < brw->state.nr_atoms; i++)
-      if (brw->state.atoms[i] == NULL)
-	 brw->state.atoms[i] = &brw->curbe.tracked_state;
-
-   _mesa_memcpy(&brw->curbe.tracked_state, 
-		&brw_constant_buffer,
-		sizeof(brw_constant_buffer));
+   brw_init_caches(brw);
 }
 
 
 void brw_destroy_state( struct brw_context *brw )
 {
-   if (brw->state.atoms) {
-      _mesa_free(brw->state.atoms);
-      brw->state.atoms = NULL;
-   }
-
-   brw_destroy_cache(brw);
+   brw_destroy_caches(brw);
    brw_destroy_batch_cache(brw);
 }
 
@@ -218,6 +191,7 @@ static struct dirty_bit_map mesa_bits[] = {
    DEFINE_BIT(_NEW_MULTISAMPLE),
    DEFINE_BIT(_NEW_TRACK_MATRIX),
    DEFINE_BIT(_NEW_PROGRAM),
+   DEFINE_BIT(_NEW_PROGRAM_CONSTANTS),
    {0, 0, 0}
 };
 
@@ -231,7 +205,6 @@ static struct dirty_bit_map brw_bits[] = {
    DEFINE_BIT(BRW_NEW_PRIMITIVE),
    DEFINE_BIT(BRW_NEW_CONTEXT),
    DEFINE_BIT(BRW_NEW_WM_INPUT_DIMENSIONS),
-   DEFINE_BIT(BRW_NEW_INPUT_VARYING),
    DEFINE_BIT(BRW_NEW_PSP),
    DEFINE_BIT(BRW_NEW_FENCE),
    DEFINE_BIT(BRW_NEW_INDICES),
@@ -336,7 +309,7 @@ void brw_validate_state( struct brw_context *brw )
 
    /* do prepare stage for all atoms */
    for (i = 0; i < Elements(atoms); i++) {
-      const struct brw_tracked_state *atom = brw->state.atoms[i];
+      const struct brw_tracked_state *atom = atoms[i];
 
       if (brw->intel.Fallback)
          break;
@@ -367,8 +340,8 @@ void brw_upload_state(struct brw_context *brw)
       _mesa_memset(&examined, 0, sizeof(examined));
       prev = *state;
 
-      for (i = 0; i < brw->state.nr_atoms; i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+      for (i = 0; i < Elements(atoms); i++) {	 
+	 const struct brw_tracked_state *atom = atoms[i];
 	 struct brw_state_flags generated;
 
 	 assert(atom->dirty.mesa ||
@@ -397,7 +370,7 @@ void brw_upload_state(struct brw_context *brw)
    }
    else {
       for (i = 0; i < Elements(atoms); i++) {	 
-	 const struct brw_tracked_state *atom = brw->state.atoms[i];
+	 const struct brw_tracked_state *atom = atoms[i];
 
 	 if (brw->intel.Fallback)
 	    break;
diff --git a/src/mesa/drivers/dri/i965/brw_structs.h b/src/mesa/drivers/dri/i965/brw_structs.h
index 89e29812034..8ba7eb27b36 100644
--- a/src/mesa/drivers/dri/i965/brw_structs.h
+++ b/src/mesa/drivers/dri/i965/brw_structs.h
@@ -33,6 +33,14 @@
 #ifndef BRW_STRUCTS_H
 #define BRW_STRUCTS_H
 
+
+/** Number of general purpose registers (VS, WM, etc) */
+#define BRW_MAX_GRF 128
+
+/** Number of message register file registers */
+#define BRW_MAX_MRF 16
+
+
 /* Command packets:
  */
 struct header 
@@ -815,7 +823,9 @@ struct brw_gs_unit_state
 
    struct
    {
-      GLuint pad0:10;
+      GLuint pad0:8;
+      GLuint rendering_enable:1; /* for IGDNG */
+      GLuint pad4:1;
       GLuint stats_enable:1; 
       GLuint nr_urb_entries:7; 
       GLuint pad1:1;
@@ -923,6 +933,28 @@ struct brw_wm_unit_state
    
    GLfloat global_depth_offset_constant;  
    GLfloat global_depth_offset_scale;   
+   
+   /* for IGDNG only */
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_1:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_1:26;
+   } wm8;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_2:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_2:26;
+   } wm9;       
+
+   struct {
+      GLuint pad0:1;
+      GLuint grf_reg_count_3:3; 
+      GLuint pad1:2;
+      GLuint kernel_start_pointer_3:26;
+   } wm10;       
 };
 
 struct brw_sampler_default_color {
@@ -1075,7 +1107,7 @@ struct brw_surface_state
       GLuint y_offset:4;
       GLuint pad0:1;
       GLuint x_offset:7;
-   } ss5;   /* NEW in Integrated Graphics Device */
+   } ss5;   /* New in G4X */
 
 };
 
@@ -1298,6 +1330,14 @@ struct brw_instruction
 	 GLuint pad1:6;
       } ia16;
 
+       struct 
+       {
+           GLuint pad:26;
+           GLuint end_of_thread:1;
+           GLuint pad1:1;
+           GLuint sfid:4;
+       } send_igdng;  /* for IGDNG only */
+
    } bits2;
 
    union
@@ -1385,6 +1425,21 @@ struct brw_instruction
       } math;
 
       struct {
+	 GLuint function:4;
+	 GLuint int_type:1;
+	 GLuint precision:1;
+	 GLuint saturate:1;
+	 GLuint data_type:1;
+	 GLuint snapshot:1;
+	 GLuint pad0:10;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } math_igdng;
+
+      struct {
 	 GLuint binding_table_index:8;
 	 GLuint sampler:4;
 	 GLuint return_format:2; 
@@ -1407,9 +1462,38 @@ struct brw_instruction
          GLuint end_of_thread:1;
       } sampler_g4x;
 
+      struct {
+	 GLuint binding_table_index:8;
+	 GLuint sampler:4;
+	 GLuint msg_type:4;
+	 GLuint simd_mode:2;
+	 GLuint pad0:1;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } sampler_igdng;
+
       struct brw_urb_immediate urb;
 
       struct {
+	 GLuint opcode:4;
+	 GLuint offset:6;
+	 GLuint swizzle_control:2; 
+	 GLuint pad:1;
+	 GLuint allocate:1;
+	 GLuint used:1;
+	 GLuint complete:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } urb_igdng;
+
+      struct {
 	 GLuint binding_table_index:8;
 	 GLuint msg_control:4;  
 	 GLuint msg_type:2;  
@@ -1423,6 +1507,19 @@ struct brw_instruction
 
       struct {
 	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;  
+	 GLuint msg_type:3;  
+	 GLuint target_cache:2;    
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_read_igdng;
+
+      struct {
+	 GLuint binding_table_index:8;
 	 GLuint msg_control:3;
 	 GLuint pixel_scoreboard_clear:1;
 	 GLuint msg_type:3;    
@@ -1435,6 +1532,20 @@ struct brw_instruction
       } dp_write;
 
       struct {
+	 GLuint binding_table_index:8;
+	 GLuint msg_control:3;
+	 GLuint pixel_scoreboard_clear:1;
+	 GLuint msg_type:3;    
+	 GLuint send_commit_msg:1;
+	 GLuint pad0:3;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } dp_write_igdng;
+
+      struct {
 	 GLuint pad:16;
 	 GLuint response_length:4;
 	 GLuint msg_length:4;
@@ -1443,6 +1554,15 @@ struct brw_instruction
 	 GLuint end_of_thread:1;
       } generic;
 
+      struct {
+	 GLuint pad:19;
+	 GLuint header_present:1;
+	 GLuint response_length:5;
+	 GLuint msg_length:4;
+	 GLuint pad1:2;
+	 GLuint end_of_thread:1;
+      } generic_igdng;
+
       GLint d;
       GLuint ud;
    } bits3;
diff --git a/src/mesa/drivers/dri/i965/brw_tex_layout.c b/src/mesa/drivers/dri/i965/brw_tex_layout.c
index 51a617fcb40..7f9b2535340 100644
--- a/src/mesa/drivers/dri/i965/brw_tex_layout.c
+++ b/src/mesa/drivers/dri/i965/brw_tex_layout.c
@@ -28,7 +28,6 @@
   * Authors:
   *   Keith Whitwell <[email protected]>
   */
-        
 
 /* Code to layout images in a mipmap tree for i965.
  */
@@ -37,17 +36,90 @@
 #include "intel_tex_layout.h"
 #include "intel_context.h"
 #include "main/macros.h"
+#include "intel_chipset.h"
 
 #define FILE_DEBUG_FLAG DEBUG_MIPTREE
 
-GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_tree *mt )
+GLboolean brw_miptree_layout(struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling)
 {
-   /* XXX: these vary depending on image format: 
-    */
-/*    GLint align_w = 4; */
+   /* XXX: these vary depending on image format: */
+   /* GLint align_w = 4; */
 
    switch (mt->target) {
-   case GL_TEXTURE_CUBE_MAP: 
+   case GL_TEXTURE_CUBE_MAP:
+      if (IS_IGDNG(intel->intelScreen->deviceID)) {
+          GLuint align_h = 2, align_w = 4;
+          GLuint level;
+          GLuint x = 0;
+          GLuint y = 0;
+          GLuint width = mt->width0;
+          GLuint height = mt->height0;
+          GLuint qpitch = 0;
+          GLuint y_pitch = 0;
+
+          mt->pitch = mt->width0;
+          intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+          y_pitch = ALIGN(height, align_h);
+
+          if (mt->compressed) {
+              mt->pitch = ALIGN(mt->width0, align_w);
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) / 4 * 6;
+          } else {
+              qpitch = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * mt->pitch * mt->cpp;
+              mt->total_height = (y_pitch + ALIGN(minify(y_pitch), align_h) + 11 * align_h) * 6;
+          }
+
+          if (mt->first_level != mt->last_level) {
+              GLuint mip1_width;
+
+              if (mt->compressed) {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + ALIGN(minify(minify(mt->width0)), align_w);
+              } else {
+                  mip1_width = ALIGN(minify(mt->width0), align_w)
+                      + minify(minify(mt->width0));
+              }
+
+              if (mip1_width > mt->pitch) {
+                  mt->pitch = mip1_width;
+              }
+          }
+
+          mt->pitch = intel_miptree_pitch_align(intel, mt, tiling, mt->pitch);
+
+          for (level = mt->first_level; level <= mt->last_level; level++) {
+              GLuint img_height;
+              GLuint nr_images = 6;
+              GLuint q = 0;
+
+              intel_miptree_set_level_info(mt, level, nr_images, x, y, width, 
+                                           height, 1);
+
+              for (q = 0; q < nr_images; q++)
+                  intel_miptree_set_image_offset_ex(mt, level, q, x, y, q * qpitch);
+
+              if (mt->compressed)
+                  img_height = MAX2(1, height/4);
+              else
+                  img_height = ALIGN(height, align_h);
+
+              if (level == mt->first_level + 1) {
+                  x += ALIGN(width, align_w);
+              }
+              else {
+                  y += img_height;
+              }
+
+              width  = minify(width);
+              height = minify(height);
+          }
+
+          break;
+      }
+
    case GL_TEXTURE_3D: {
       GLuint width  = mt->width0;
       GLuint height = mt->height0;
@@ -59,25 +131,25 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
       GLuint align_w = 4;
 
       mt->total_height = 0;
-      
+      intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
+
       if (mt->compressed) {
-          align_w = intel_compressed_alignment(mt->internal_format);
           mt->pitch = ALIGN(width, align_w);
           pack_y_pitch = (height + 3) / 4;
       } else {
-          mt->pitch = intel_miptree_pitch_align (intel, mt, mt->width0);
-          pack_y_pitch = ALIGN(mt->height0, align_h);
+	 mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->width0);
+	 pack_y_pitch = ALIGN(mt->height0, align_h);
       }
 
-      pack_x_pitch = mt->pitch;
+      pack_x_pitch = width;
       pack_x_nr = 1;
 
-      for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
+      for (level = mt->first_level ; level <= mt->last_level ; level++) {
 	 GLuint nr_images = mt->target == GL_TEXTURE_3D ? depth : 6;
 	 GLint x = 0;
 	 GLint y = 0;
 	 GLint q, j;
-	    
+
 	 intel_miptree_set_level_info(mt, level, nr_images,
 				      0, mt->total_height,
 				      width, height, depth);
@@ -89,7 +161,7 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	    }
 
 	    x = 0;
-	    y += pack_y_pitch;	    
+	    y += pack_y_pitch;
 	 }
 
 
@@ -98,40 +170,40 @@ GLboolean brw_miptree_layout( struct intel_context *intel, struct intel_mipmap_t
 	 height = minify(height);
 	 depth  = minify(depth);
 
-    if (mt->compressed) {
-        pack_y_pitch = (height + 3) / 4;
-        
-        if (pack_x_pitch > ALIGN(width, align_w)) {
-            pack_x_pitch = ALIGN(width, align_w);
-            pack_x_nr <<= 1;
-        }
-    } else {
-        if (pack_x_pitch > 4) {
-            pack_x_pitch >>= 1;
-            pack_x_nr <<= 1;
-            assert(pack_x_pitch * pack_x_nr <= mt->pitch);
-        }
-
-        if (pack_y_pitch > 2) {
-            pack_y_pitch >>= 1;
-            pack_y_pitch = ALIGN(pack_y_pitch, align_h);
-        }
-    }
+	 if (mt->compressed) {
+	    pack_y_pitch = (height + 3) / 4;
+
+	    if (pack_x_pitch > ALIGN(width, align_w)) {
+	       pack_x_pitch = ALIGN(width, align_w);
+	       pack_x_nr <<= 1;
+	    }
+	 } else {
+	    if (pack_x_pitch > 4) {
+	       pack_x_pitch >>= 1;
+	       pack_x_nr <<= 1;
+	       assert(pack_x_pitch * pack_x_nr <= mt->pitch);
+	    }
+
+	    if (pack_y_pitch > 2) {
+	       pack_y_pitch >>= 1;
+	       pack_y_pitch = ALIGN(pack_y_pitch, align_h);
+	    }
+	 }
 
       }
       break;
    }
 
    default:
-      i945_miptree_layout_2d(intel, mt);
+      i945_miptree_layout_2d(intel, mt, tiling);
       break;
    }
-   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__, 
-		mt->pitch, 
+   DBG("%s: %dx%dx%d - sz 0x%x\n", __FUNCTION__,
+		mt->pitch,
 		mt->total_height,
 		mt->cpp,
 		mt->pitch * mt->total_height * mt->cpp );
-		
+
    return GL_TRUE;
 }
 
diff --git a/src/mesa/drivers/dri/i965/brw_urb.c b/src/mesa/drivers/dri/i965/brw_urb.c
index 7673dd36eb9..8c6f4355a6e 100644
--- a/src/mesa/drivers/dri/i965/brw_urb.c
+++ b/src/mesa/drivers/dri/i965/brw_urb.c
@@ -143,7 +143,29 @@ static void recalculate_urb_fence( struct brw_context *brw )
       brw->urb.nr_clip_entries = limits[CLP].preferred_nr_entries;
       brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;	
       brw->urb.nr_cs_entries = limits[CS].preferred_nr_entries;	
-      
+
+      brw->urb.constrained = 0;
+
+      if (BRW_IS_IGDNG(brw)) {
+         brw->urb.nr_vs_entries = 128;
+         brw->urb.nr_sf_entries = 48;
+         if (check_urb_layout(brw)) {
+            goto done;
+         } else {
+            brw->urb.constrained = 1;
+            brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+            brw->urb.nr_sf_entries = limits[SF].preferred_nr_entries;
+         }
+      } else if (BRW_IS_G4X(brw)) {
+	 brw->urb.nr_vs_entries = 64;
+	 if (check_urb_layout(brw)) {
+	    goto done;
+	 } else {
+	    brw->urb.constrained = 1;
+	    brw->urb.nr_vs_entries = limits[VS].preferred_nr_entries;
+	 }
+      }
+
       if (!check_urb_layout(brw)) {
 	 brw->urb.nr_vs_entries = limits[VS].min_nr_entries;	
 	 brw->urb.nr_gs_entries = limits[GS].min_nr_entries;	
@@ -169,9 +191,8 @@ static void recalculate_urb_fence( struct brw_context *brw )
 	 if (INTEL_DEBUG & (DEBUG_URB|DEBUG_FALLBACKS))
 	    _mesa_printf("URB CONSTRAINED\n");
       }
-      else 
-	 brw->urb.constrained = 0;
 
+done:
       if (INTEL_DEBUG & DEBUG_URB)
 	 _mesa_printf("URB fence: %d ..VS.. %d ..GS.. %d ..CLP.. %d ..SF.. %d ..CS.. %d\n",
 		      brw->urb.vs_start,
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 1e4f66091e3..4a591365c98 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -58,6 +58,7 @@ struct brw_vs_compile {
 
    GLuint first_output;
    GLuint nr_outputs;
+   GLuint first_overflow_output; /**< VERT_ATTRIB_x */
 
    GLuint first_tmp;
    GLuint last_tmp;
diff --git a/src/mesa/drivers/dri/i965/brw_vs_emit.c b/src/mesa/drivers/dri/i965/brw_vs_emit.c
index b69616d6e52..514f15d5e3a 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_emit.c
@@ -69,13 +69,18 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 {
    GLuint i, reg = 0, mrf;
 
-#if 0
-   if (c->vp->program.Base.Parameters->NumParameters >= 6)
-      c->vp->use_const_buffer = 1;
+   /* Determine whether to use a real constant buffer or use a block
+    * of GRF registers for constants.  The later is faster but only
+    * works if everything fits in the GRF.
+    * XXX this heuristic/check may need some fine tuning...
+    */
+   if (c->vp->program.Base.Parameters->NumParameters +
+       c->vp->program.Base.NumTemporaries + 20 > BRW_MAX_GRF)
+      c->vp->use_const_buffer = GL_TRUE;
    else
-#endif
       c->vp->use_const_buffer = GL_FALSE;
-   /*printf("use_const_buffer = %d\n", c->use_const_buffer);*/
+
+   /*printf("use_const_buffer = %d\n", c->vp->use_const_buffer);*/
 
    /* r0 -- reserved as usual
     */
@@ -124,15 +129,21 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
       }
    }
 
-   /* Allocate outputs: TODO: could organize the non-position outputs
-    * to go straight into message regs.
+   /* Allocate outputs.  The non-position outputs go straight into message regs.
     */
    c->nr_outputs = 0;
    c->first_output = reg;
-   mrf = 4;
+   c->first_overflow_output = 0;
+
+   if (BRW_IS_IGDNG(c->func.brw))
+       mrf = 8;
+   else
+       mrf = 4;
+
    for (i = 0; i < VERT_RESULT_MAX; i++) {
       if (c->prog_data.outputs_written & (1 << i)) {
 	 c->nr_outputs++;
+         assert(i < Elements(c->regs[PROGRAM_OUTPUT]));
 	 if (i == VERT_RESULT_HPOS) {
 	    c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
 	    reg++;
@@ -143,8 +154,17 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
 	    mrf++;		/* just a placeholder?  XXX fix later stages & remove this */
 	 }
 	 else {
-	    c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
-	    mrf++;
+            if (mrf < 16) {
+               c->regs[PROGRAM_OUTPUT][i] = brw_message_reg(mrf);
+               mrf++;
+            }
+            else {
+               /* too many vertex results to fit in MRF, use GRF for overflow */
+               if (!c->first_overflow_output)
+                  c->first_overflow_output = i;
+               c->regs[PROGRAM_OUTPUT][i] = brw_vec8_grf(reg, 0);
+               reg++;
+            }
 	 }
       }
    }     
@@ -201,7 +221,11 @@ static void brw_vs_alloc_regs( struct brw_vs_compile *c )
     */
    c->prog_data.urb_read_length = (c->nr_inputs + 1) / 2;
 
-   c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
+   if (BRW_IS_IGDNG(c->func.brw))
+       c->prog_data.urb_entry_size = (c->nr_outputs + 6 + 3) / 4;
+   else
+       c->prog_data.urb_entry_size = (c->nr_outputs + 2 + 3) / 4;
+
    c->prog_data.total_grf = reg;
 
    if (INTEL_DEBUG & DEBUG_VS) {
@@ -709,10 +733,11 @@ get_constant(struct brw_vs_compile *c,
    struct brw_compile *p = &c->func;
    struct brw_reg const_reg;
    struct brw_reg const2_reg;
+   const GLboolean relAddr = src->RelAddr;
 
    assert(argIndex < 3);
 
-   if (c->current_const[argIndex].index != src->Index || src->RelAddr) {
+   if (c->current_const[argIndex].index != src->Index || relAddr) {
       struct brw_reg addrReg = c->regs[PROGRAM_ADDRESS][0];
 
       c->current_const[argIndex].index = src->Index;
@@ -725,13 +750,13 @@ get_constant(struct brw_vs_compile *c,
       brw_dp_READ_4_vs(p,
                        c->current_const[argIndex].reg,/* writeback dest */
                        0,                             /* oword */
-                       src->RelAddr,                  /* relative indexing? */
+                       relAddr,                       /* relative indexing? */
                        addrReg,                       /* address register */
                        16 * src->Index,               /* byte offset */
                        SURF_INDEX_VERT_CONST_BUFFER   /* binding table index */
                        );
 
-      if (src->RelAddr) {
+      if (relAddr) {
          /* second read */
          const2_reg = get_tmp(c);
 
@@ -742,7 +767,7 @@ get_constant(struct brw_vs_compile *c,
          brw_dp_READ_4_vs(p,
                           const2_reg,              /* writeback dest */
                           1,                       /* oword */
-                          src->RelAddr,            /* relative indexing? */
+                          relAddr,                 /* relative indexing? */
                           addrReg,                 /* address register */
                           16 * src->Index,         /* byte offset */
                           SURF_INDEX_VERT_CONST_BUFFER
@@ -752,7 +777,7 @@ get_constant(struct brw_vs_compile *c,
 
    const_reg = c->current_const[argIndex].reg;
 
-   if (src->RelAddr) {
+   if (relAddr) {
       /* merge the two Owords into the constant register */
       /* const_reg[7..4] = const2_reg[7..4] */
       brw_MOV(p,
@@ -1061,6 +1086,8 @@ static void emit_vertex_write( struct brw_vs_compile *c)
    struct brw_reg m0 = brw_message_reg(0);
    struct brw_reg pos = c->regs[PROGRAM_OUTPUT][VERT_RESULT_HPOS];
    struct brw_reg ndc;
+   int eot;
+   GLuint len_vertext_header = 2;
 
    if (c->key.copy_edgeflag) {
       brw_MOV(p, 
@@ -1070,14 +1097,16 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 
    /* Build ndc coords */
    ndc = get_tmp(c);
+   /* ndc = 1.0 / pos.w */
    emit_math1(c, BRW_MATH_FUNCTION_INV, ndc, brw_swizzle1(pos, 3), BRW_MATH_PRECISION_FULL);
+   /* ndc.xyz = pos * ndc */
    brw_MUL(p, brw_writemask(ndc, WRITEMASK_XYZ), pos, ndc);
 
    /* Update the header for point size, user clipping flags, and -ve rhw
     * workaround.
     */
    if ((c->prog_data.outputs_written & (1<<VERT_RESULT_PSIZ)) ||
-       c->key.nr_userclip || !BRW_IS_G4X(p->brw))
+       c->key.nr_userclip || BRW_IS_965(p->brw))
    {
       struct brw_reg header1 = retype(get_tmp(c), BRW_REGISTER_TYPE_UD);
       GLuint i;
@@ -1108,7 +1137,7 @@ static void emit_vertex_write( struct brw_vs_compile *c)
        * Later, clipping will detect ucp[6] and ensure the primitive is
        * clipped against all fixed planes.
        */
-      if (!BRW_IS_G4X(p->brw)) {
+      if (BRW_IS_965(p->brw)) {
 	 brw_CMP(p,
 		 vec8(brw_null_reg()),
 		 BRW_CONDITIONAL_L,
@@ -1135,7 +1164,23 @@ static void emit_vertex_write( struct brw_vs_compile *c)
     */
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_MOV(p, offset(m0, 2), ndc);
-   brw_MOV(p, offset(m0, 3), pos);
+
+   if (BRW_IS_IGDNG(p->brw)) {
+       /* There are 20 DWs (D0-D19) in VUE vertex header on IGDNG */
+       brw_MOV(p, offset(m0, 3), pos); /* a portion of vertex header */
+       /* m4, m5 contain the distances from vertex to the user clip planeXXX. 
+        * Seems it is useless for us.
+        * m6 is used for aligning, so that the remainder of vertex element is 
+        * reg-aligned.
+        */
+       brw_MOV(p, offset(m0, 7), pos); /* the remainder of vertex element */
+       len_vertext_header = 6;
+   } else {
+       brw_MOV(p, offset(m0, 3), pos);
+       len_vertext_header = 2;
+   }
+
+   eot = (c->first_overflow_output == 0);
 
    brw_urb_WRITE(p, 
 		 brw_null_reg(), /* dest */
@@ -1143,12 +1188,43 @@ static void emit_vertex_write( struct brw_vs_compile *c)
 		 c->r0,		/* src */
 		 0,		/* allocate */
 		 1,		/* used */
-		 c->nr_outputs + 3, /* msg len */
+		 MIN2(c->nr_outputs + 1 + len_vertext_header, (BRW_MAX_MRF-1)), /* msg len */
 		 0,		/* response len */
-		 1, 		/* eot */
+		 eot, 		/* eot */
 		 1, 		/* writes complete */
 		 0, 		/* urb destination offset */
 		 BRW_URB_SWIZZLE_INTERLEAVE);
+
+   if (c->first_overflow_output > 0) {
+      /* Not all of the vertex outputs/results fit into the MRF.
+       * Move the overflowed attributes from the GRF to the MRF and
+       * issue another brw_urb_WRITE().
+       */
+      /* XXX I'm not 100% sure about which MRF regs to use here.  Starting
+       * at mrf[4] atm...
+       */
+      GLuint i, mrf = 0;
+      for (i = c->first_overflow_output; i < VERT_RESULT_MAX; i++) {
+         if (c->prog_data.outputs_written & (1 << i)) {
+            /* move from GRF to MRF */
+            brw_MOV(p, brw_message_reg(4+mrf), c->regs[PROGRAM_OUTPUT][i]);
+            mrf++;
+         }
+      }
+
+      brw_urb_WRITE(p,
+                    brw_null_reg(), /* dest */
+                    4,              /* starting mrf reg nr */
+                    c->r0,          /* src */
+                    0,              /* allocate */
+                    1,              /* used */
+                    mrf+1,          /* msg len */
+                    0,              /* response len */
+                    1,              /* eot */
+                    1,              /* writes complete */
+                    BRW_MAX_MRF-1,  /* urb destination offset */
+                    BRW_URB_SWIZZLE_INTERLEAVE);
+   }
 }
 
 
@@ -1177,15 +1253,15 @@ post_vs_emit( struct brw_vs_compile *c,
  */
 void brw_vs_emit(struct brw_vs_compile *c )
 {
-#define MAX_IFSN 32
+#define MAX_IF_DEPTH 32
+#define MAX_LOOP_DEPTH 32
    struct brw_compile *p = &c->func;
-   GLuint nr_insns = c->vp->program.Base.NumInstructions;
-   GLuint insn, if_insn = 0;
+   const GLuint nr_insns = c->vp->program.Base.NumInstructions;
+   GLuint insn, if_depth = 0, loop_depth = 0;
    GLuint end_offset = 0;
    struct brw_instruction *end_inst, *last_inst;
-   struct brw_instruction *if_inst[MAX_IFSN];
-   struct brw_indirect stack_index = brw_indirect(0, 0);   
-
+   struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+   const struct brw_indirect stack_index = brw_indirect(0, 0);   
    GLuint index;
    GLuint file;
 
@@ -1219,7 +1295,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
 
    for (insn = 0; insn < nr_insns; insn++) {
 
-      struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
+      const struct prog_instruction *inst = &c->vp->program.Base.Instructions[insn];
       struct brw_reg args[3], dst;
       GLuint i;
       
@@ -1232,7 +1308,7 @@ void brw_vs_emit(struct brw_vs_compile *c )
        */
       if (inst->Opcode != OPCODE_SWZ)
 	  for (i = 0; i < 3; i++) {
-	      struct prog_src_register *src = &inst->SrcReg[i];
+	      const struct prog_src_register *src = &inst->SrcReg[i];
 	      index = src->Index;
 	      file = src->File;	
 	      if (file == PROGRAM_OUTPUT && c->output_regs[index].used_in_src)
@@ -1376,16 +1452,51 @@ void brw_vs_emit(struct brw_vs_compile *c )
 	 emit_xpd(p, dst, args[0], args[1]);
 	 break;
       case OPCODE_IF:
-	 assert(if_insn < MAX_IFSN);
-         if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+	 assert(if_depth < MAX_IF_DEPTH);
+         if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
 	 break;
       case OPCODE_ELSE:
-	 if_inst[if_insn-1] = brw_ELSE(p, if_inst[if_insn-1]);
+	 if_inst[if_depth-1] = brw_ELSE(p, if_inst[if_depth-1]);
 	 break;
       case OPCODE_ENDIF:
-         assert(if_insn > 0);
-	 brw_ENDIF(p, if_inst[--if_insn]);
+         assert(if_depth > 0);
+	 brw_ENDIF(p, if_inst[--if_depth]);
 	 break;			
+#if 0
+      case OPCODE_BGNLOOP:
+         loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
+         break;
+      case OPCODE_BRK:
+         brw_BREAK(p);
+         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case OPCODE_CONT:
+         brw_CONT(p);
+         brw_set_predicate_control(p, BRW_PREDICATE_NONE);
+         break;
+      case OPCODE_ENDLOOP: 
+         {
+            struct brw_instruction *inst0, *inst1;
+            loop_depth--;
+            inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+            /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+            while (inst0 > loop_inst[loop_depth]) {
+               inst0--;
+               if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+                  inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
+                  inst0->bits3.if_else.pop_count = 0;
+               }
+               else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+                  inst0->bits3.if_else.jump_count = inst1 - inst0;
+                  inst0->bits3.if_else.pop_count = 0;
+               }
+            }
+         }
+         break;
+#else
+         (void) loop_inst;
+         (void) loop_depth;
+#endif
       case OPCODE_BRA:
          brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
          brw_ADD(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(1*16));
diff --git a/src/mesa/drivers/dri/i965/brw_vs_state.c b/src/mesa/drivers/dri/i965/brw_vs_state.c
index 3d295388437..d790ab65553 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_state.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_state.c
@@ -97,7 +97,11 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
     * brw_urb_WRITE() results.
     */
    vs.thread1.single_program_flow = 0;
-   vs.thread1.binding_table_entry_count = key->nr_surfaces;
+
+   if (BRW_IS_IGDNG(brw))
+      vs.thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else
+      vs.thread1.binding_table_entry_count = key->nr_surfaces;
 
    vs.thread3.urb_entry_read_length = key->urb_entry_read_length;
    vs.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
@@ -105,10 +109,16 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
    vs.thread3.urb_entry_read_offset = 0;
    vs.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   vs.thread4.nr_urb_entries = key->nr_urb_entries;
+   if (BRW_IS_IGDNG(brw))
+       vs.thread4.nr_urb_entries = key->nr_urb_entries >> 2;
+   else
+       vs.thread4.nr_urb_entries = key->nr_urb_entries;
+
    vs.thread4.urb_entry_allocation_size = key->urb_size - 1;
 
-   if (BRW_IS_G4X(brw))
+   if (BRW_IS_IGDNG(brw))
+      chipset_max_threads = 72;
+   else if (BRW_IS_G4X(brw))
       chipset_max_threads = 32;
    else
       chipset_max_threads = 16;
@@ -120,6 +130,8 @@ vs_unit_create_from_key(struct brw_context *brw, struct brw_vs_unit_key *key)
 
    /* No samplers for ARB_vp programs:
     */
+   /* It has to be set to 0 for IGDNG
+    */
    vs.vs5.sampler_count = 0;
 
    if (INTEL_DEBUG & DEBUG_STATS)
diff --git a/src/mesa/drivers/dri/i965/brw_vs_surface_state.c b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
new file mode 100644
index 00000000000..89f47522a1c
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_vs_surface_state.c
@@ -0,0 +1,226 @@
+/*
+ Copyright (C) Intel Corp.  2006.  All Rights Reserved.
+ Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
+ develop this 3D driver.
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice (including the
+ next paragraph) shall be included in all copies or substantial
+ portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+ **********************************************************************/
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "main/mtypes.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "shader/prog_parameter.h"
+
+#include "brw_context.h"
+#include "brw_state.h"
+#include "brw_defines.h"
+
+/* Creates a new VS constant buffer reflecting the current VS program's
+ * constants, if needed by the VS program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+static drm_intel_bo *
+brw_vs_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_VERTEX_PROGRAM */
+   if (!vp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer",
+				     size, 64);
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   return const_buffer;
+}
+
+/**
+ * Update the surface state for a VS constant buffer.
+ *
+ * Sets brw->vs.surf_bo[surf] and brw->vp->const_buffer.
+ */
+static void
+brw_update_vs_constant_surface( GLcontext *ctx,
+                                GLuint surf)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct brw_surface_key key;
+   struct brw_vertex_program *vp =
+      (struct brw_vertex_program *) brw->vertex_program;
+   const struct gl_program_parameter_list *params = vp->program.Base.Parameters;
+
+   assert(surf == 0);
+
+   /* If we're in this state update atom, we need to update VS constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   dri_bo_unreference(vp->const_buffer);
+   vp->const_buffer = brw_vs_update_constant_buffer(brw);
+
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (vp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+      brw->vs.surf_bo[surf] = NULL;
+      return;
+   }
+
+   memset(&key, 0, sizeof(key));
+
+   key.format = MESA_FORMAT_RGBA_FLOAT32;
+   key.internal_format = GL_RGBA;
+   key.bo = vp->const_buffer;
+   key.depthmode = GL_NONE;
+   key.pitch = params->NumParameters;
+   key.width = params->NumParameters;
+   key.height = 1;
+   key.depth = 1;
+   key.cpp = 16;
+
+   /*
+   printf("%s:\n", __FUNCTION__);
+   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
+          key.width, key.height, key.depth, key.cpp, key.pitch);
+   */
+
+   drm_intel_bo_unreference(brw->vs.surf_bo[surf]);
+   brw->vs.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
+   if (brw->vs.surf_bo[surf] == NULL) {
+      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   }
+}
+
+
+/**
+ * Constructs the binding table for the VS surface state.
+ */
+static dri_bo *
+brw_vs_get_binding_table(struct brw_context *brw)
+{
+   dri_bo *bind_bo;
+
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
+			      NULL, 0,
+			      brw->vs.surf_bo, BRW_VS_MAX_SURF,
+			      NULL);
+
+   if (bind_bo == NULL) {
+      GLuint data_size = BRW_VS_MAX_SURF * sizeof(GLuint);
+      uint32_t *data = malloc(data_size);
+      int i;
+
+      for (i = 0; i < BRW_VS_MAX_SURF; i++)
+         if (brw->vs.surf_bo[i])
+            data[i] = brw->vs.surf_bo[i]->offset;
+         else
+            data[i] = 0;
+
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
+				  NULL, 0,
+				  brw->vs.surf_bo, BRW_VS_MAX_SURF,
+				  data, data_size,
+				  NULL, NULL);
+
+      /* Emit binding table relocations to surface state */
+      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+	 if (brw->vs.surf_bo[i] != NULL) {
+	    /* The presumed offsets were set in the data values for
+	     * brw_upload_cache.
+	     */
+	    drm_intel_bo_emit_reloc(bind_bo, i * 4,
+				    brw->vs.surf_bo[i], 0,
+				    I915_GEM_DOMAIN_INSTRUCTION, 0);
+	 }
+      }
+
+      free(data);
+   }
+
+   return bind_bo;
+}
+
+/**
+ * Vertex shader surfaces (constant buffer).
+ *
+ * This consumes the state updates for the constant buffer needing
+ * to be updated, and produces BRW_NEW_NR_VS_SURFACES for the VS unit and
+ * CACHE_NEW_SURF_BIND for the binding table upload.
+ */
+static void prepare_vs_surfaces(struct brw_context *brw )
+{
+   GLcontext *ctx = &brw->intel.ctx;
+   int i;
+   int nr_surfaces = 0;
+
+   brw_update_vs_constant_surface(ctx, SURF_INDEX_VERT_CONST_BUFFER);
+
+   for (i = 0; i < BRW_VS_MAX_SURF; i++) {
+      if (brw->vs.surf_bo[i] != NULL) {
+	 nr_surfaces = i + 1;
+      }
+   }
+
+   if (brw->vs.nr_surfaces != nr_surfaces) {
+      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
+      brw->vs.nr_surfaces = nr_surfaces;
+   }
+
+   /* Note that we don't end up updating the bind_bo if we don't have a
+    * surface to be pointing at.  This should be relatively harmless, as it
+    * just slightly increases our working set size.
+    */
+   if (brw->vs.nr_surfaces != 0) {
+      dri_bo_unreference(brw->vs.bind_bo);
+      brw->vs.bind_bo = brw_vs_get_binding_table(brw);
+   }
+}
+
+const struct brw_tracked_state brw_vs_surfaces = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_VERTEX_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_vs_surfaces,
+};
+
+
+
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index ba03afd6c13..ac117901515 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -177,14 +177,6 @@ static void brw_note_fence( struct intel_context *intel, GLuint fence )
    brw_context(&intel->ctx)->state.dirty.brw |= BRW_NEW_FENCE;
 }
 
-
-static void brw_note_unlock( struct intel_context *intel )
-{
-   struct brw_context *brw = brw_context(&intel->ctx);
-   brw_state_cache_check_size(brw);
-}
-
-
 /* called from intelWaitForIdle() and intelFlush()
  *
  * For now, just flush everything.  Could be smarter later.
@@ -194,7 +186,7 @@ static GLuint brw_flush_cmd( void )
    struct brw_mi_flush flush;
    flush.opcode = CMD_MI_FLUSH;
    flush.pad = 0;
-   flush.flags = BRW_FLUSH_READ_CACHE | BRW_FLUSH_STATE_CACHE;
+   flush.flags = BRW_FLUSH_STATE_CACHE;
    return *(GLuint *)&flush;
 }
 
@@ -215,7 +207,6 @@ void brwInitVtbl( struct brw_context *brw )
 
    brw->intel.vtbl.invalidate_state = brw_invalidate_state;
    brw->intel.vtbl.note_fence = brw_note_fence;
-   brw->intel.vtbl.note_unlock = brw_note_unlock;
    brw->intel.vtbl.new_batch = brw_new_batch;
    brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
diff --git a/src/mesa/drivers/dri/i965/brw_wm.c b/src/mesa/drivers/dri/i965/brw_wm.c
index 8a3b7df9c78..14e05be4f6c 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.c
+++ b/src/mesa/drivers/dri/i965/brw_wm.c
@@ -272,6 +272,9 @@ static void brw_wm_populate_key( struct brw_context *brw,
    /* _NEW_LIGHT */
    key->flat_shade = (ctx->Light.ShadeModel == GL_FLAT);
 
+   /* _NEW_HINT */
+   key->linear_color = (ctx->Hint.PerspectiveCorrection == GL_FASTEST);
+
    /* _NEW_TEXTURE */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
       const struct gl_texture_unit *unit = &ctx->Texture.Unit[i];
@@ -351,6 +354,7 @@ const struct brw_tracked_state brw_wm_prog = {
    .dirty = {
       .mesa  = (_NEW_COLOR |
 		_NEW_DEPTH |
+                _NEW_HINT |
 		_NEW_STENCIL |
 		_NEW_POLYGON |
 		_NEW_LINE |
diff --git a/src/mesa/drivers/dri/i965/brw_wm.h b/src/mesa/drivers/dri/i965/brw_wm.h
index 295fed851bb..ba497432c60 100644
--- a/src/mesa/drivers/dri/i965/brw_wm.h
+++ b/src/mesa/drivers/dri/i965/brw_wm.h
@@ -63,6 +63,7 @@ struct brw_wm_prog_key {
    GLuint computes_depth:1;	/* could be derived from program string */
    GLuint source_depth_to_render_target:1;
    GLuint flat_shade:1;
+   GLuint linear_color:1;  /**< linear interpolation vs perspective interp */
    GLuint runtime_check_aads_emit:1;
    
    GLbitfield proj_attrib_mask; /**< one bit per fragment program attribute */
@@ -241,15 +242,20 @@ struct brw_wm_compile {
    GLuint max_wm_grf;
    GLuint last_scratch;
 
+   GLuint cur_inst;  /**< index of current instruction */
+
+   GLboolean out_of_regs;  /**< ran out of GRF registers? */
+
    /** Mapping from Mesa registers to hardware registers */
    struct {
       GLboolean inited;
       struct brw_reg reg;
    } wm_regs[PROGRAM_PAYLOAD+1][256][4];
 
+   GLboolean used_grf[BRW_WM_MAX_GRF];
+   GLuint first_free_grf;
    struct brw_reg stack;
    struct brw_reg emit_mask_reg;
-   GLuint reg_index;  /**< Index of next free GRF register */
    GLuint tmp_regs[BRW_WM_MAX_GRF];
    GLuint tmp_index;
    GLuint tmp_max;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_emit.c b/src/mesa/drivers/dri/i965/brw_wm_emit.c
index a870e75e523..9f82916c025 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_emit.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_emit.c
@@ -353,6 +353,19 @@ static void emit_mad( struct brw_compile *p,
    }
 }
 
+static void emit_trunc( struct brw_compile *p,
+		      const struct brw_reg *dst,
+		      GLuint mask,
+		      const struct brw_reg *arg0)
+{
+   GLuint i;
+
+   for (i = 0; i < 4; i++) {
+      if (mask & (1<<i)) {
+	 brw_RNDZ(p, dst[i], arg0[i]);
+      }
+   }
+}
 
 static void emit_lrp( struct brw_compile *p, 
 		      const struct brw_reg *dst,
@@ -701,6 +714,7 @@ static void emit_tex( struct brw_wm_compile *c,
    GLuint msgLength, responseLength;
    GLuint i, nr;
    GLuint emit;
+   GLuint msg_type;
 
    /* How many input regs are there?
     */
@@ -738,6 +752,18 @@ static void emit_tex( struct brw_wm_compile *c,
 
    responseLength = 8;		/* always */
 
+   if (BRW_IS_IGDNG(p->brw)) {
+       if (inst->tex_shadow)
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE_IGDNG;
+       else
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_IGDNG;
+   } else {
+       if (inst->tex_shadow)
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE;
+       else
+           msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE;
+   }
+
    brw_SAMPLE(p, 
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
@@ -745,12 +771,12 @@ static void emit_tex( struct brw_wm_compile *c,
               SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      (inst->tex_shadow ? 
-	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE : 
-	       BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE),
+	      msg_type, 
 	      responseLength,
 	      msgLength,
-	      0);	
+	      0,	
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
 }
 
 
@@ -762,7 +788,7 @@ static void emit_txb( struct brw_wm_compile *c,
 {
    struct brw_compile *p = &c->func;
    GLuint msgLength;
-
+   GLuint msg_type;
    /* Shadow ignored for txb.
     */
    switch (inst->tex_idx) {
@@ -787,6 +813,11 @@ static void emit_txb( struct brw_wm_compile *c,
    brw_MOV(p, brw_message_reg(8), arg[3]);
    msgLength = 9;
 
+   if (BRW_IS_IGDNG(p->brw))
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS_IGDNG;
+   else
+       msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+
    brw_SAMPLE(p, 
 	      retype(vec16(dst[0]), BRW_REGISTER_TYPE_UW),
 	      1,
@@ -794,10 +825,12 @@ static void emit_txb( struct brw_wm_compile *c,
               SURF_INDEX_TEXTURE(inst->tex_unit),
 	      inst->tex_unit,	  /* sampler */
 	      inst->writemask,
-	      BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,
+	      msg_type,
 	      8,		/* responseLength */
 	      msgLength,
-	      0);	
+	      0,	
+	      1,
+	      BRW_SAMPLER_SIMD_MODE_SIMD16);	
 }
 
 
@@ -1009,7 +1042,7 @@ static void emit_fb_write( struct brw_wm_compile *c,
 	      get_element_ud(brw_vec8_grf(1,0), 6), 
 	      brw_imm_ud(1<<26)); 
 
-      jmp = brw_JMPI(p, ip, ip, brw_imm_w(0));
+      jmp = brw_JMPI(p, ip, ip, brw_imm_d(0));
       {
 	 emit_aa(c, arg1, 2);
 	 fire_fb_write(c, 0, nr, target, eot);
@@ -1222,6 +1255,10 @@ void brw_wm_emit( struct brw_wm_compile *c )
 	 emit_dph(p, dst, dst_flags, args[0], args[1]);
 	 break;
 
+      case OPCODE_TRUNC:
+	 emit_trunc(p, dst, dst_flags, args[0]);
+	 break;
+
       case OPCODE_LRP:
 	 emit_lrp(p, dst, dst_flags, args[0], args[1], args[2]);
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_fp.c b/src/mesa/drivers/dri/i965/brw_wm_fp.c
index 49aad281d7a..b9e8dd2e96e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_fp.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_fp.c
@@ -354,13 +354,25 @@ static void emit_interp( struct brw_wm_compile *c,
 		 src_undef());
       }
       else {
-	 emit_op(c,
-		 WM_LINTERP,
-		 dst,
-		 0,
-		 interp,
-		 deltas,
-		 src_undef());
+         if (c->key.linear_color) {
+            emit_op(c,
+                    WM_LINTERP,
+                    dst,
+                    0,
+                    interp,
+                    deltas,
+                    src_undef());
+         }
+         else {
+            /* perspective-corrected color interpolation */
+            emit_op(c,
+                    WM_PINTERP,
+                    dst,
+                    0,
+                    interp,
+                    deltas,
+                    get_pixel_w(c));
+         }
       }
       break;
    case FRAG_ATTRIB_FOGC:
diff --git a/src/mesa/drivers/dri/i965/brw_wm_glsl.c b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
index a907e1b7887..19f777fe32e 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_glsl.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_glsl.c
@@ -1,5 +1,7 @@
 #include "main/macros.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+#include "shader/prog_optimize.h"
 #include "brw_context.h"
 #include "brw_eu.h"
 #include "brw_wm.h"
@@ -21,7 +23,6 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 	const struct prog_instruction *inst = &fp->Base.Instructions[i];
 	switch (inst->Opcode) {
 	    case OPCODE_IF:
-	    case OPCODE_TRUNC:
 	    case OPCODE_ENDIF:
 	    case OPCODE_CAL:
 	    case OPCODE_BRK:
@@ -42,6 +43,83 @@ GLboolean brw_wm_is_glsl(const struct gl_fragment_program *fp)
 }
 
 
+
+static void
+reclaim_temps(struct brw_wm_compile *c);
+
+
+/** Mark GRF register as used. */
+static void
+prealloc_grf(struct brw_wm_compile *c, int r)
+{
+   c->used_grf[r] = GL_TRUE;
+}
+
+
+/** Mark given GRF register as not in use. */
+static void
+release_grf(struct brw_wm_compile *c, int r)
+{
+   /*assert(c->used_grf[r]);*/
+   c->used_grf[r] = GL_FALSE;
+   c->first_free_grf = MIN2(c->first_free_grf, r);
+}
+
+
+/** Return index of a free GRF, mark it as used. */
+static int
+alloc_grf(struct brw_wm_compile *c)
+{
+   GLuint r;
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   /* no free temps, try to reclaim some */
+   reclaim_temps(c);
+   c->first_free_grf = 0;
+
+   /* try alloc again */
+   for (r = c->first_free_grf; r < BRW_WM_MAX_GRF; r++) {
+      if (!c->used_grf[r]) {
+         c->used_grf[r] = GL_TRUE;
+         c->first_free_grf = r + 1;  /* a guess */
+         return r;
+      }
+   }
+
+   for (r = 0; r < BRW_WM_MAX_GRF; r++) {
+      assert(c->used_grf[r]);
+   }
+
+   /* really, no free GRF regs found */
+   if (!c->out_of_regs) {
+      /* print warning once per compilation */
+      _mesa_warning(NULL, "i965: ran out of registers for fragment program");
+      c->out_of_regs = GL_TRUE;
+   }
+
+   return -1;
+}
+
+
+/** Return number of GRF registers used */
+static int
+num_grf_used(const struct brw_wm_compile *c)
+{
+   int r;
+   for (r = BRW_WM_MAX_GRF - 1; r >= 0; r--)
+      if (c->used_grf[r])
+         return r + 1;
+   return 0;
+}
+
+
+
 /**
  * Record the mapping of a Mesa register to a hardware register.
  */
@@ -68,11 +146,23 @@ static int get_scalar_dst_index(const struct prog_instruction *inst)
 static struct brw_reg alloc_tmp(struct brw_wm_compile *c)
 {
     struct brw_reg reg;
-    if(c->tmp_index == c->tmp_max)
-	c->tmp_regs[ c->tmp_max++ ] = c->reg_index++;
-    
+
+    /* if we need to allocate another temp, grow the tmp_regs[] array */
+    if (c->tmp_index == c->tmp_max) {
+       int r = alloc_grf(c);
+       if (r < 0) {
+          /*printf("Out of temps in %s\n", __FUNCTION__);*/
+          r = 50; /* XXX random register! */
+       }
+       c->tmp_regs[ c->tmp_max++ ] = r;
+    }
+
+    /* form the GRF register */
     reg = brw_vec8_grf(c->tmp_regs[ c->tmp_index++ ], 0);
+    /*printf("alloc_temp %d\n", reg.nr);*/
+    assert(reg.nr < BRW_WM_MAX_GRF);
     return reg;
+
 }
 
 /**
@@ -130,35 +220,29 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 	    return brw_null_reg();
     }
 
+    assert(index < 256);
+    assert(component < 4);
+
     /* see if we've already allocated a HW register for this Mesa register */
     if (c->wm_regs[file][index][component].inited) {
-	/* yes, re-use */
-	reg = c->wm_regs[file][index][component].reg;
+       /* yes, re-use */
+       reg = c->wm_regs[file][index][component].reg;
     }
     else {
 	/* no, allocate new register */
-	reg = brw_vec8_grf(c->reg_index, 0);
-    }
+       int grf = alloc_grf(c);
+       /*printf("alloc grf %d for reg %d:%d.%d\n", grf, file, index, component);*/
+       if (grf < 0) {
+          /* totally out of temps */
+          grf = 51; /* XXX random register! */
+       }
 
-    /* if this is a new register allocation, record it in the table */
-    if (!c->wm_regs[file][index][component].inited) {
-	set_reg(c, file, index, component, reg);
-	c->reg_index++;
-    }
+       reg = brw_vec8_grf(grf, 0);
+       /*printf("Alloc new grf %d for %d.%d\n", reg.nr, index, component);*/
 
-    if (c->reg_index >= BRW_WM_MAX_GRF - 12) {
-	/* ran out of temporary registers! */
-#if 1
-        /* This is a big hack for now.
-         * Return bad register index, just don't hang the GPU.
-         */
-        _mesa_fprintf(stderr, "out of regs %d\n", c->reg_index);
-        c->reg_index = BRW_WM_MAX_GRF - 13;
-#else
-	return brw_null_reg();
-#endif
+       set_reg(c, file, index, component, reg);
     }
- 
+
     if (neg & (1 << component)) {
 	reg = negate(reg);
     }
@@ -168,6 +252,46 @@ get_reg(struct brw_wm_compile *c, int file, int index, int component,
 }
 
 
+
+/**
+ * This is called if we run out of GRF registers.  Examine the live intervals
+ * of temp regs in the program and free those which won't be used again.
+ */
+static void
+reclaim_temps(struct brw_wm_compile *c)
+{
+   GLint intBegin[MAX_PROGRAM_TEMPS];
+   GLint intEnd[MAX_PROGRAM_TEMPS];
+   int index;
+
+   /*printf("Reclaim temps:\n");*/
+
+   _mesa_find_temp_intervals(c->prog_instructions, c->nr_fp_insns,
+                             intBegin, intEnd);
+
+   for (index = 0; index < MAX_PROGRAM_TEMPS; index++) {
+      if (intEnd[index] != -1 && intEnd[index] < c->cur_inst) {
+         /* program temp[i] can be freed */
+         int component;
+         /*printf("  temp[%d] is dead\n", index);*/
+         for (component = 0; component < 4; component++) {
+            if (c->wm_regs[PROGRAM_TEMPORARY][index][component].inited) {
+               int r = c->wm_regs[PROGRAM_TEMPORARY][index][component].reg.nr;
+               release_grf(c, r);
+               /*
+               printf("  Reclaim temp %d, reg %d at inst %d\n",
+                      index, r, c->cur_inst);
+               */
+               c->wm_regs[PROGRAM_TEMPORARY][index][component].inited = GL_FALSE;
+            }
+         }
+      }
+   }
+}
+
+
+
+
 /**
  * Preallocate registers.  This sets up the Mesa to hardware register
  * mapping for certain registers, such as constants (uniforms/state vars)
@@ -179,6 +303,10 @@ static void prealloc_reg(struct brw_wm_compile *c)
     struct brw_reg reg;
     int urb_read_length = 0;
     GLuint inputs = FRAG_BIT_WPOS | c->fp_interp_emitted | c->fp_deriv_emitted;
+    GLuint reg_index = 0;
+
+    memset(c->used_grf, GL_FALSE, sizeof(c->used_grf));
+    c->first_free_grf = 0;
 
     for (i = 0; i < 4; i++) {
         if (i < c->key.nr_depth_regs) 
@@ -187,14 +315,20 @@ static void prealloc_reg(struct brw_wm_compile *c)
             reg = brw_vec8_grf(0, 0);
 	set_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, i, reg);
     }
-    c->reg_index += 2 * c->key.nr_depth_regs;
+    reg_index += 2 * c->key.nr_depth_regs;
 
     /* constants */
     {
-        const int nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_params = c->fp->program.Base.Parameters->NumParameters;
+        const GLuint nr_temps = c->fp->program.Base.NumTemporaries;
 
         /* use a real constant buffer, or just use a section of the GRF? */
-        c->fp->use_const_buffer = GL_FALSE; /* (nr_params > 8);*/
+        /* XXX this heuristic may need adjustment... */
+        if ((nr_params + nr_temps) * 4 + reg_index > 80)
+           c->fp->use_const_buffer = GL_TRUE;
+        else
+           c->fp->use_const_buffer = GL_FALSE;
+        /*printf("WM use_const_buffer = %d\n", c->fp->use_const_buffer);*/
 
         if (c->fp->use_const_buffer) {
            /* We'll use a real constant buffer and fetch constants from
@@ -216,7 +350,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
            for (i = 0; i < nr_params; i++) {
               /* loop over XYZW channels */
               for (j = 0; j < 4; j++, index++) {
-                 reg = brw_vec1_grf(c->reg_index + index / 8, index % 8);
+                 reg = brw_vec1_grf(reg_index + index / 8, index % 8);
                  /* Save pointer to parameter/constant value.
                   * Constants will be copied in prepare_constant_buffer()
                   */
@@ -226,7 +360,7 @@ static void prealloc_reg(struct brw_wm_compile *c)
            }
            /* number of constant regs used (each reg is float[8]) */
            c->nr_creg = 2 * ((4 * nr_params + 15) / 16);
-           c->reg_index += c->nr_creg;
+           reg_index += c->nr_creg;
         }
     }
 
@@ -242,23 +376,31 @@ static void prealloc_reg(struct brw_wm_compile *c)
 	  fp_input = -1;
 
        if (fp_input >= 0 && inputs & (1 << fp_input)) {
-	  urb_read_length = c->reg_index;
-	  reg = brw_vec8_grf(c->reg_index, 0);
+	  urb_read_length = reg_index;
+	  reg = brw_vec8_grf(reg_index, 0);
 	  for (j = 0; j < 4; j++)
 	     set_reg(c, PROGRAM_PAYLOAD, fp_input, j, reg);
        }
        if (c->key.vp_outputs_written & (1 << i)) {
-	  c->reg_index += 2;
+	  reg_index += 2;
        }
     }
 
     c->prog_data.first_curbe_grf = c->key.nr_depth_regs * 2;
     c->prog_data.urb_read_length = urb_read_length;
     c->prog_data.curb_read_length = c->nr_creg;
-    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index++;
-    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, c->reg_index, 0);
-    c->reg_index += 2;
+    c->emit_mask_reg = brw_uw1_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index++;
+    c->stack =  brw_uw16_reg(BRW_GENERAL_REGISTER_FILE, reg_index, 0);
+    reg_index += 2;
+
+    /* mark GRF regs [0..reg_index-1] as in-use */
+    for (i = 0; i < reg_index; i++)
+       prealloc_grf(c, i);
+
+    /* Don't use GRF 126, 127.  Using them seems to lead to GPU lock-ups */
+    prealloc_grf(c, 126);
+    prealloc_grf(c, 127);
 
     /* An instruction may reference up to three constants.
      * They'll be found in these registers.
@@ -267,12 +409,12 @@ static void prealloc_reg(struct brw_wm_compile *c)
     if (c->fp->use_const_buffer) {
        for (i = 0; i < 3; i++) {
           c->current_const[i].index = -1;
-          c->current_const[i].reg = alloc_tmp(c);
+          c->current_const[i].reg = brw_vec8_grf(alloc_grf(c), 0);
        }
     }
 #if 0
     printf("USE CONST BUFFER? %d\n", c->fp->use_const_buffer);
-    printf("AFTER PRE_ALLOC, reg_index = %d\n", c->reg_index);
+    printf("AFTER PRE_ALLOC, reg_index = %d\n", reg_index);
 #endif
 }
 
@@ -376,6 +518,14 @@ static struct brw_reg get_src_reg(struct brw_wm_compile *c,
     const GLuint nr = 1;
     const GLuint component = GET_SWZ(src->Swizzle, channel);
 
+    /* Extended swizzle terms */
+    if (component == SWIZZLE_ZERO) {
+       return brw_imm_f(0.0F);
+    }
+    else if (component == SWIZZLE_ONE) {
+       return brw_imm_f(1.0F);
+    }
+
     if (c->fp->use_const_buffer &&
         (src->File == PROGRAM_STATE_VAR ||
          src->File == PROGRAM_CONSTANT ||
@@ -673,27 +823,26 @@ static void emit_fb_write(struct brw_wm_compile *c,
     }
 
     if (c->key.dest_depth_reg) {
-        GLuint comp = c->key.dest_depth_reg / 2;
-        GLuint off = c->key.dest_depth_reg % 2;
+        const GLuint comp = c->key.dest_depth_reg / 2;
+        const GLuint off = c->key.dest_depth_reg % 2;
 
-        assert(comp == 1);
-        assert(off == 0);
-#if 0
-        /* XXX do we need this code?   comp always 1, off always 0, it seems */
         if (off != 0) {
+            /* XXX this code needs review/testing */
+            struct brw_reg arg1_0 = get_src_reg(c, inst, 1, comp);
+            struct brw_reg arg1_1 = get_src_reg(c, inst, 1, comp+1);
+
             brw_push_insn_state(p);
             brw_set_compression_control(p, BRW_COMPRESSION_NONE);
 
-            brw_MOV(p, brw_message_reg(nr), offset(arg1[comp],1));
+            brw_MOV(p, brw_message_reg(nr), offset(arg1_0, 1));
             /* 2nd half? */
-            brw_MOV(p, brw_message_reg(nr+1), arg1[comp+1]);
+            brw_MOV(p, brw_message_reg(nr+1), arg1_1);
             brw_pop_insn_state(p);
         }
         else
-#endif
         {
-           struct brw_reg src =  get_src_reg(c, inst, 1, 1);
-           brw_MOV(p, brw_message_reg(nr), src);
+            struct brw_reg src =  get_src_reg(c, inst, 1, 1);
+            brw_MOV(p, brw_message_reg(nr), src);
         }
         nr += 2;
    }
@@ -2476,6 +2625,7 @@ static void emit_txb(struct brw_wm_compile *c,
     struct brw_reg dst[4], src[4], payload_reg;
     GLuint unit = c->fp->program.Base.SamplerUnits[inst->TexSrcUnit];
     GLuint i;
+    GLuint msg_type;
 
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
@@ -2504,6 +2654,14 @@ static void emit_txb(struct brw_wm_compile *c,
     }
     brw_MOV(p, brw_message_reg(5), src[3]);          /* bias */
     brw_MOV(p, brw_message_reg(6), brw_imm_f(0));    /* ref (unused?) */
+
+    if (BRW_IS_IGDNG(p->brw)) {
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_IGDNG;
+    } else {
+        /* Does it work well on SIMD8? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
+    }
+
     brw_SAMPLE(p,
                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW),  /* dest */
                1,                                           /* msg_reg_nr */
@@ -2511,10 +2669,12 @@ static void emit_txb(struct brw_wm_compile *c,
                SURF_INDEX_TEXTURE(unit),
                unit,                                        /* sampler */
                inst->DstReg.WriteMask,                      /* writemask */
-               BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS,      /* msg_type */
+               msg_type,                                    /* msg_type */
                4,                                           /* response_length */
                4,                                           /* msg_length */
-               0);                                          /* eot */
+               0,                                           /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
 }
 
 
@@ -2528,6 +2688,7 @@ static void emit_tex(struct brw_wm_compile *c,
     GLuint i, nr;
     GLuint emit;
     GLboolean shadow = (c->key.shadowtex_mask & (1<<unit)) ? 1 : 0;
+    GLuint msg_type;
 
     payload_reg = get_reg(c, PROGRAM_PAYLOAD, PAYLOAD_DEPTH, 0, 1, 0, 0);
 
@@ -2568,6 +2729,16 @@ static void emit_tex(struct brw_wm_compile *c,
        brw_MOV(p, brw_message_reg(6), src[2]);        /* ref value / R coord */
     }
 
+    if (BRW_IS_IGDNG(p->brw)) {
+        if (shadow)
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_COMPARE_IGDNG;
+        else
+            msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_IGDNG;
+    } else {
+        /* Does it work for shadow on SIMD8 ? */
+        msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+    }
+    
     brw_SAMPLE(p,
                retype(vec8(dst[0]), BRW_REGISTER_TYPE_UW), /* dest */
                1,                                          /* msg_reg_nr */
@@ -2575,10 +2746,12 @@ static void emit_tex(struct brw_wm_compile *c,
                SURF_INDEX_TEXTURE(unit),
                unit,                                       /* sampler */
                inst->DstReg.WriteMask,                     /* writemask */
-               BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE,           /* msg_type */
+               msg_type,                                   /* msg_type */
                4,                                          /* response_length */
                shadow ? 6 : 4,                             /* msg_length */
-               0);                                         /* eot */
+               0,                                          /* eot */
+               1,
+               BRW_SAMPLER_SIMD_MODE_SIMD8);	
 
     if (shadow)
 	brw_MOV(p, dst[3], brw_imm_f(1.0));
@@ -2595,15 +2768,15 @@ static void post_wm_emit( struct brw_wm_compile *c )
 
 static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 {
-#define MAX_IFSN 32
+#define MAX_IF_DEPTH 32
 #define MAX_LOOP_DEPTH 32
-    struct brw_instruction *if_inst[MAX_IFSN], *loop_inst[MAX_LOOP_DEPTH];
-    struct brw_instruction *inst0, *inst1;
-    int i, if_insn = 0, loop_insn = 0;
+    struct brw_instruction *if_inst[MAX_IF_DEPTH], *loop_inst[MAX_LOOP_DEPTH];
+    GLuint i, if_depth = 0, loop_depth = 0;
     struct brw_compile *p = &c->func;
     struct brw_indirect stack_index = brw_indirect(0, 0);
 
-    c->reg_index = 0;
+    c->out_of_regs = GL_FALSE;
+
     prealloc_reg(c);
     brw_set_compression_control(p, BRW_COMPRESSION_NONE);
     brw_MOV(p, get_addr_reg(stack_index), brw_address(c->stack));
@@ -2611,6 +2784,8 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
     for (i = 0; i < c->nr_fp_insns; i++) {
         const struct prog_instruction *inst = &c->prog_instructions[i];
 
+        c->cur_inst = i;
+
 #if 0
         _mesa_printf("Inst %d: ", i);
         _mesa_print_instruction(inst);
@@ -2770,15 +2945,15 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		emit_kil(c);
 		break;
 	    case OPCODE_IF:
-		assert(if_insn < MAX_IFSN);
-		if_inst[if_insn++] = brw_IF(p, BRW_EXECUTE_8);
+		assert(if_depth < MAX_IF_DEPTH);
+		if_inst[if_depth++] = brw_IF(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_ELSE:
-		if_inst[if_insn-1]  = brw_ELSE(p, if_inst[if_insn-1]);
+		if_inst[if_depth-1]  = brw_ELSE(p, if_inst[if_depth-1]);
 		break;
 	    case OPCODE_ENDIF:
-		assert(if_insn > 0);
-		brw_ENDIF(p, if_inst[--if_insn]);
+		assert(if_depth > 0);
+		brw_ENDIF(p, if_inst[--if_depth]);
 		break;
 	    case OPCODE_BGNSUB:
 		brw_save_label(p, inst->Comment, p->nr_insn);
@@ -2812,7 +2987,7 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		break;
 	    case OPCODE_BGNLOOP:
                 /* XXX may need to invalidate the current_constant regs */
-		loop_inst[loop_insn++] = brw_DO(p, BRW_EXECUTE_8);
+		loop_inst[loop_depth++] = brw_DO(p, BRW_EXECUTE_8);
 		break;
 	    case OPCODE_BRK:
 		brw_BREAK(p);
@@ -2823,36 +2998,40 @@ static void brw_wm_emit_glsl(struct brw_context *brw, struct brw_wm_compile *c)
 		brw_set_predicate_control(p, BRW_PREDICATE_NONE);
 		break;
 	    case OPCODE_ENDLOOP: 
-		loop_insn--;
-		inst0 = inst1 = brw_WHILE(p, loop_inst[loop_insn]);
-		/* patch all the BREAK instructions from
-		   last BEGINLOOP */
-		while (inst0 > loop_inst[loop_insn]) {
-		    inst0--;
-		    if (inst0->header.opcode == BRW_OPCODE_BREAK) {
-			inst0->bits3.if_else.jump_count = inst1 - inst0 + 1;
+               {
+                  struct brw_instruction *inst0, *inst1;
+                  GLuint br = 1;
+
+                  if (BRW_IS_IGDNG(brw))
+                     br = 2;
+ 
+                  loop_depth--;
+                  inst0 = inst1 = brw_WHILE(p, loop_inst[loop_depth]);
+                  /* patch all the BREAK/CONT instructions from last BEGINLOOP */
+                  while (inst0 > loop_inst[loop_depth]) {
+                     inst0--;
+                     if (inst0->header.opcode == BRW_OPCODE_BREAK) {
+			inst0->bits3.if_else.jump_count = br * (inst1 - inst0 + 1);
 			inst0->bits3.if_else.pop_count = 0;
-		    } else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
-                        inst0->bits3.if_else.jump_count = inst1 - inst0;
+                     }
+                     else if (inst0->header.opcode == BRW_OPCODE_CONTINUE) {
+                        inst0->bits3.if_else.jump_count = br * (inst1 - inst0);
                         inst0->bits3.if_else.pop_count = 0;
-                    }
-		}
-		break;
+                     }
+                  }
+               }
+               break;
 	    default:
 		_mesa_printf("unsupported IR in fragment shader %d\n",
 			inst->Opcode);
 	}
+
 	if (inst->CondUpdate)
 	    brw_set_predicate_control(p, BRW_PREDICATE_NORMAL);
 	else
 	    brw_set_predicate_control(p, BRW_PREDICATE_NONE);
     }
     post_wm_emit(c);
-
-    if (c->reg_index >= BRW_WM_MAX_GRF) {
-        _mesa_problem(NULL, "Ran out of registers in brw_wm_emit_glsl()");
-        /* XXX we need to do some proper error recovery here */
-    }
 }
 
 
@@ -2876,6 +3055,6 @@ void brw_wm_glsl_emit(struct brw_context *brw, struct brw_wm_compile *c)
         brw_wm_print_program(c, "brw_wm_glsl_emit done");
     }
 
-    c->prog_data.total_grf = c->reg_index;
+    c->prog_data.total_grf = num_grf_used(c);
     c->prog_data.total_scratch = 0;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_wm_iz.c b/src/mesa/drivers/dri/i965/brw_wm_iz.c
index bd60ac9b315..8fd067abe7d 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_iz.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_iz.c
@@ -116,6 +116,10 @@ const struct {
  { C, 0, 1, 1, 1 } 
 };
 
+/**
+ * \param line_aa  AA_NEVER, AA_ALWAYS or AA_SOMETIMES
+ * \param lookup  bitmask of IZ_* flags
+ */
 void brw_wm_lookup_iz( GLuint line_aa,
 		       GLuint lookup,
 		       struct brw_wm_prog_key *key )
diff --git a/src/mesa/drivers/dri/i965/brw_wm_pass1.c b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
index ab9aa2f10d0..3436a247170 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_pass1.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_pass1.c
@@ -159,6 +159,7 @@ void brw_wm_pass1( struct brw_wm_compile *c )
       case OPCODE_FRC:
       case OPCODE_MOV:
       case OPCODE_SWZ:
+      case OPCODE_TRUNC:
 	 read0 = writemask;
 	 break;
 
diff --git a/src/mesa/drivers/dri/i965/brw_wm_state.c b/src/mesa/drivers/dri/i965/brw_wm_state.c
index 67b41173fb2..39f8c6d522b 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_state.c
@@ -71,7 +71,9 @@ wm_unit_populate_key(struct brw_context *brw, struct brw_wm_unit_key *key)
       key->max_threads = 1;
    else {
       /* WM maximum threads is number of EUs times number of threads per EU. */
-      if (BRW_IS_G4X(brw))
+      if (BRW_IS_IGDNG(brw))
+         key->max_threads = 12 * 6;
+      else if (BRW_IS_G4X(brw))
 	 key->max_threads = 10 * 5;
       else
 	 key->max_threads = 8 * 4;
@@ -141,7 +143,11 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread0.kernel_start_pointer = brw->wm.prog_bo->offset >> 6; /* reloc */
    wm.thread1.depth_coef_urb_read_offset = 1;
    wm.thread1.floating_point_mode = BRW_FLOATING_POINT_NON_IEEE_754;
-   wm.thread1.binding_table_entry_count = key->nr_surfaces;
+
+   if (BRW_IS_IGDNG(brw))
+      wm.thread1.binding_table_entry_count = 0; /* hardware requirement */
+   else
+      wm.thread1.binding_table_entry_count = key->nr_surfaces;
 
    if (key->total_scratch != 0) {
       wm.thread2.scratch_space_base_pointer =
@@ -158,7 +164,11 @@ wm_unit_create_from_key(struct brw_context *brw, struct brw_wm_unit_key *key,
    wm.thread3.const_urb_entry_read_length = key->curb_entry_read_length;
    wm.thread3.const_urb_entry_read_offset = key->curbe_offset * 2;
 
-   wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+   if (BRW_IS_IGDNG(brw)) 
+      wm.wm4.sampler_count = 0; /* hardware requirement */
+   else
+      wm.wm4.sampler_count = (key->sampler_count + 1) / 4;
+
    if (brw->wm.sampler_bo != NULL) {
       /* reloc */
       wm.wm4.sampler_state_pointer = brw->wm.sampler_bo->offset >> 5;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 805df8a4af4..096f74394eb 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -176,22 +176,6 @@ static GLuint translate_tex_format( GLuint mesa_format, GLenum internal_format,
    }
 }
 
-
-/**
- * Use same key for WM and VS surfaces.
- */
-struct brw_surface_key {
-   GLenum target, depthmode;
-   dri_bo *bo;
-   GLint format, internal_format;
-   GLint first_level, last_level;
-   GLint width, height, depth;
-   GLint pitch, cpp;
-   uint32_t tiling;
-   GLuint offset;
-};
-
-
 static void
 brw_set_surface_tiling(struct brw_surface_state *surf, uint32_t tiling)
 {
@@ -268,7 +252,7 @@ brw_create_texture_surface( struct brw_context *brw,
       surf.ss0.cube_neg_z = 1;
    }
 
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@@ -321,10 +305,11 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.tiling = intelObj->mt->region->tiling;
 
    dri_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-                                         &key, sizeof(key),
-                                         &key.bo, key.bo ? 1 : 0,
-                                         NULL);
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
+                                            &key, sizeof(key),
+                                            &key.bo, key.bo ? 1 : 0,
+                                            NULL);
    if (brw->wm.surf_bo[surf] == NULL) {
       brw->wm.surf_bo[surf] = brw_create_texture_surface(brw, &key);
    }
@@ -336,7 +321,7 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
  * Create the constant buffer surface.  Vertex/fragment shader constants will be
  * read from this buffer with Data Port Read instructions/messages.
  */
-static dri_bo *
+dri_bo *
 brw_create_constant_surface( struct brw_context *brw,
                              struct brw_surface_key *key )
 {
@@ -362,7 +347,7 @@ brw_create_constant_surface( struct brw_context *brw,
    surf.ss3.pitch = (key->pitch * key->cpp) - 1; /* ignored?? */
    brw_set_surface_tiling(&surf, key->tiling); /* tiling now allowed */
  
-   bo = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+   bo = brw_upload_cache(&brw->surface_cache, BRW_SS_SURFACE,
 			 key, sizeof(*key),
 			 &key->bo, key->bo ? 1 : 0,
 			 &surf, sizeof(surf),
@@ -380,39 +365,70 @@ brw_create_constant_surface( struct brw_context *brw,
    return bo;
 }
 
+/* Creates a new WM constant buffer reflecting the current fragment program's
+ * constants, if needed by the fragment program.
+ *
+ * Otherwise, constants go through the CURBEs using the brw_constant_buffer
+ * state atom.
+ */
+static drm_intel_bo *
+brw_wm_update_constant_buffer(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params = fp->program.Base.Parameters;
+   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   drm_intel_bo *const_buffer;
+
+   /* BRW_NEW_FRAGMENT_PROGRAM */
+   if (!fp->use_const_buffer)
+      return NULL;
+
+   const_buffer = drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer",
+				     size, 64);
+
+   /* _NEW_PROGRAM_CONSTANTS */
+   dri_bo_subdata(const_buffer, 0, size, params->ParameterValues);
+
+   return const_buffer;
+}
 
 /**
  * Update the surface state for a WM constant buffer.
  * The constant buffer will be (re)allocated here if needed.
  */
-static dri_bo *
+static void
 brw_update_wm_constant_surface( GLcontext *ctx,
-                                GLuint surf,
-                                dri_bo *const_buffer,
-                                const struct gl_program_parameter_list *params)
+                                GLuint surf)
 {
    struct brw_context *brw = brw_context(ctx);
    struct brw_surface_key key;
-   struct intel_context *intel = &brw->intel;
-   const int size = params->NumParameters * 4 * sizeof(GLfloat);
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   const struct gl_program_parameter_list *params =
+      fp->program.Base.Parameters;
 
-   /* free old const buffer if too small */
-   if (const_buffer && const_buffer->size < size) {
-      dri_bo_unreference(const_buffer);
-      const_buffer = NULL;
-   }
+   /* If we're in this state update atom, we need to update WM constants, so
+    * free the old buffer and create a new one for the new contents.
+    */
+   dri_bo_unreference(fp->const_buffer);
+   fp->const_buffer = brw_wm_update_constant_buffer(brw);
 
-   /* alloc new buffer if needed */
-   if (!const_buffer) {
-      const_buffer =
-         drm_intel_bo_alloc(intel->bufmgr, "fp_const_buffer", size, 64);
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+      brw->wm.surf_bo[surf] = NULL;
+      return;
    }
 
    memset(&key, 0, sizeof(key));
 
    key.format = MESA_FORMAT_RGBA_FLOAT32;
    key.internal_format = GL_RGBA;
-   key.bo = const_buffer;
+   key.bo = fp->const_buffer;
    key.depthmode = GL_NONE;
    key.pitch = params->NumParameters;
    key.width = params->NumParameters;
@@ -427,77 +443,59 @@ brw_update_wm_constant_surface( GLcontext *ctx,
    */
 
    dri_bo_unreference(brw->wm.surf_bo[surf]);
-   brw->wm.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
+   brw->wm.surf_bo[surf] = brw_search_cache(&brw->surface_cache,
+                                            BRW_SS_SURFACE,
                                             &key, sizeof(key),
                                             &key.bo, key.bo ? 1 : 0,
                                             NULL);
    if (brw->wm.surf_bo[surf] == NULL) {
       brw->wm.surf_bo[surf] = brw_create_constant_surface(brw, &key);
    }
-
-   return const_buffer;
+   brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
 }
 
-
 /**
- * Update the surface state for a VS constant buffer.
- * The constant buffer will be (re)allocated here if needed.
+ * Updates surface / buffer for fragment shader constant buffer, if
+ * one is required.
+ *
+ * This consumes the state updates for the constant buffer, and produces
+ * BRW_NEW_WM_SURFACES to get picked up by brw_prepare_wm_surfaces for
+ * inclusion in the binding table.
  */
-static dri_bo *
-brw_update_vs_constant_surface( GLcontext *ctx,
-                                GLuint surf,
-                                dri_bo *const_buffer,
-                                const struct gl_program_parameter_list *params)
+static void prepare_wm_constant_surface(struct brw_context *brw )
 {
-   struct brw_context *brw = brw_context(ctx);
-   struct brw_surface_key key;
-   struct intel_context *intel = &brw->intel;
-   const int size = params->NumParameters * 4 * sizeof(GLfloat);
-
-   assert(surf == 0);
-
-   /* free old const buffer if too small */
-   if (const_buffer && const_buffer->size < size) {
-      dri_bo_unreference(const_buffer);
-      const_buffer = NULL;
-   }
-
-   /* alloc new buffer if needed */
-   if (!const_buffer) {
-      const_buffer =
-         drm_intel_bo_alloc(intel->bufmgr, "vp_const_buffer", size, 64);
-   }
-
-   memset(&key, 0, sizeof(key));
-
-   key.format = MESA_FORMAT_RGBA_FLOAT32;
-   key.internal_format = GL_RGBA;
-   key.bo = const_buffer;
-   key.depthmode = GL_NONE;
-   key.pitch = params->NumParameters;
-   key.width = params->NumParameters;
-   key.height = 1;
-   key.depth = 1;
-   key.cpp = 16;
+   GLcontext *ctx = &brw->intel.ctx;
+   struct brw_fragment_program *fp =
+      (struct brw_fragment_program *) brw->fragment_program;
+   GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
 
-   /*
-   printf("%s:\n", __FUNCTION__);
-   printf("  width %d  height %d  depth %d  cpp %d  pitch %d\n",
-          key.width, key.height, key.depth, key.cpp, key.pitch);
-   */
+   drm_intel_bo_unreference(fp->const_buffer);
+   fp->const_buffer = brw_wm_update_constant_buffer(brw);
 
-   dri_bo_unreference(brw->vs.surf_bo[surf]);
-   brw->vs.surf_bo[surf] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-                                            &key, sizeof(key),
-                                            &key.bo, key.bo ? 1 : 0,
-                                            NULL);
-   if (brw->vs.surf_bo[surf] == NULL) {
-      brw->vs.surf_bo[surf] = brw_create_constant_surface(brw, &key);
+   /* If there's no constant buffer, then no surface BO is needed to point at
+    * it.
+    */
+   if (fp->const_buffer == 0) {
+      if (brw->wm.surf_bo[surf] != NULL) {
+	 drm_intel_bo_unreference(brw->wm.surf_bo[surf]);
+	 brw->wm.surf_bo[surf] = NULL;
+	 brw->state.dirty.brw |= BRW_NEW_WM_SURFACES;
+      }
+      return;
    }
 
-   return const_buffer;
+   brw_update_wm_constant_surface(ctx, surf);
 }
 
+const struct brw_tracked_state brw_wm_constant_surface = {
+   .dirty = {
+      .mesa = (_NEW_PROGRAM_CONSTANTS),
+      .brw = (BRW_NEW_FRAGMENT_PROGRAM),
+      .cache = 0
+   },
+   .prepare = prepare_wm_constant_surface,
+};
+
 
 /**
  * Sets up a surface state structure to point at the given region.
@@ -507,7 +505,7 @@ brw_update_vs_constant_surface( GLcontext *ctx,
 static void
 brw_update_renderbuffer_surface(struct brw_context *brw,
 				struct gl_renderbuffer *rb,
-				unsigned int unit, GLboolean cached)
+				unsigned int unit)
 {
    GLcontext *ctx = &brw->intel.ctx;
    dri_bo *region_bo = NULL;
@@ -567,12 +565,11 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 		      ctx->Color.BlendEnabled);
 
    dri_bo_unreference(brw->wm.surf_bo[unit]);
-   brw->wm.surf_bo[unit] = NULL;
-   if (cached) 
-       brw->wm.surf_bo[unit] = brw_search_cache(&brw->cache, BRW_SS_SURFACE,
-	       &key, sizeof(key),
-	       &region_bo, 1,
-	       NULL);
+   brw->wm.surf_bo[unit] = brw_search_cache(&brw->surface_cache,
+					    BRW_SS_SURFACE,
+					    &key, sizeof(key),
+					    &region_bo, 1,
+					    NULL);
 
    if (brw->wm.surf_bo[unit] == NULL) {
       struct brw_surface_state surf;
@@ -581,7 +578,27 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 
       surf.ss0.surface_format = key.surface_format;
       surf.ss0.surface_type = key.surface_type;
-      surf.ss1.base_addr =  key.draw_offset;
+      if (key.tiling == I915_TILING_NONE) {
+	 surf.ss1.base_addr = key.draw_offset;
+      } else {
+	 uint32_t tile_offset = key.draw_offset % 4096;
+
+	 surf.ss1.base_addr = key.draw_offset - tile_offset;
+
+	 assert(BRW_IS_G4X(brw) || tile_offset == 0);
+	 if (BRW_IS_G4X(brw)) {
+	    if (key.tiling == I915_TILING_X) {
+	       /* Note that the low bits of these fields are missing, so
+		* there's the possibility of getting in trouble.
+		*/
+	       surf.ss5.x_offset = (tile_offset % 512) / key.cpp / 4;
+	       surf.ss5.y_offset = tile_offset / 512 / 2;
+	    } else {
+	       surf.ss5.x_offset = (tile_offset % 128) / key.cpp / 4;
+	       surf.ss5.y_offset = tile_offset / 128 / 2;
+	    }
+	 }
+      }
       if (region_bo != NULL)
 	 surf.ss1.base_addr += region_bo->offset; /* reloc */
 
@@ -598,7 +615,8 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
       surf.ss0.writedisable_alpha = !key.color_mask[3];
 
       /* Key size will never match key size for textures, so we're safe. */
-      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->cache, BRW_SS_SURFACE,
+      brw->wm.surf_bo[unit] = brw_upload_cache(&brw->surface_cache,
+                                               BRW_SS_SURFACE,
                                                &key, sizeof(key),
 					       &region_bo, 1,
 					       &surf, sizeof(surf),
@@ -611,7 +629,7 @@ brw_update_renderbuffer_surface(struct brw_context *brw,
 	 drm_intel_bo_emit_reloc(brw->wm.surf_bo[unit],
 				 offsetof(struct brw_surface_state, ss1),
 				 region_bo,
-				 key.draw_offset,
+				 surf.ss1.base_addr,
 				 I915_GEM_DOMAIN_RENDER,
 				 I915_GEM_DOMAIN_RENDER);
       }
@@ -630,7 +648,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
 
    assert(brw->wm.nr_surfaces <= BRW_WM_MAX_SURF);
 
-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
+   bind_bo = brw_search_cache(&brw->surface_cache, BRW_SS_SURF_BIND,
 			      NULL, 0,
 			      brw->wm.surf_bo, brw->wm.nr_surfaces,
 			      NULL);
@@ -646,7 +664,7 @@ brw_wm_get_binding_table(struct brw_context *brw)
          else
             data[i] = 0;
 
-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
+      bind_bo = brw_upload_cache( &brw->surface_cache, BRW_SS_SURF_BIND,
 				  NULL, 0,
 				  brw->wm.surf_bo, brw->wm.nr_surfaces,
 				  data, data_size,
@@ -682,27 +700,17 @@ static void prepare_wm_surfaces(struct brw_context *brw )
       for (i = 0; i < ctx->DrawBuffer->_NumColorDrawBuffers; i++) {
          brw_update_renderbuffer_surface(brw,
 					 ctx->DrawBuffer->_ColorDrawBuffers[i],
-					 i,
-					 GL_FALSE);
+					 i);
       }
    } else {
-      brw_update_renderbuffer_surface(brw, NULL, 0, GL_TRUE);
+      brw_update_renderbuffer_surface(brw, NULL, 0);
    }
 
    old_nr_surfaces = brw->wm.nr_surfaces;
    brw->wm.nr_surfaces = MAX_DRAW_BUFFERS;
 
-   /* Update surface / buffer for fragment shader constant buffer */
-   {
-      const GLuint surf = SURF_INDEX_FRAG_CONST_BUFFER;
-      struct brw_fragment_program *fp =
-         (struct brw_fragment_program *) brw->fragment_program;
-      fp->const_buffer =
-         brw_update_wm_constant_surface(ctx, surf, fp->const_buffer,
-                                     fp->program.Base.Parameters);
-
-      brw->wm.nr_surfaces = surf + 1;
-   }
+   if (brw->wm.surf_bo[SURF_INDEX_FRAG_CONST_BUFFER] != NULL)
+       brw->wm.nr_surfaces = SURF_INDEX_FRAG_CONST_BUFFER + 1;
 
    /* Update surfaces for textures */
    for (i = 0; i < BRW_MAX_TEX_UNIT; i++) {
@@ -735,100 +743,16 @@ static void prepare_wm_surfaces(struct brw_context *brw )
       brw->state.dirty.brw |= BRW_NEW_NR_WM_SURFACES;
 }
 
-
-/**
- * Constructs the binding table for the VS surface state.
- */
-static dri_bo *
-brw_vs_get_binding_table(struct brw_context *brw)
-{
-   dri_bo *bind_bo;
-
-   assert(brw->vs.nr_surfaces <= BRW_VS_MAX_SURF);
-
-   bind_bo = brw_search_cache(&brw->cache, BRW_SS_SURF_BIND,
-			      NULL, 0,
-			      brw->vs.surf_bo, brw->vs.nr_surfaces,
-			      NULL);
-
-   if (bind_bo == NULL) {
-      GLuint data_size = brw->vs.nr_surfaces * sizeof(GLuint);
-      uint32_t *data = malloc(data_size);
-      int i;
-
-      for (i = 0; i < brw->vs.nr_surfaces; i++)
-         if (brw->vs.surf_bo[i])
-            data[i] = brw->vs.surf_bo[i]->offset;
-         else
-            data[i] = 0;
-
-      bind_bo = brw_upload_cache( &brw->cache, BRW_SS_SURF_BIND,
-				  NULL, 0,
-				  brw->vs.surf_bo, brw->vs.nr_surfaces,
-				  data, data_size,
-				  NULL, NULL);
-
-      /* Emit binding table relocations to surface state */
-      for (i = 0; i < BRW_VS_MAX_SURF; i++) {
-	 if (brw->vs.surf_bo[i] != NULL) {
-	    dri_bo_emit_reloc(bind_bo,
-			      I915_GEM_DOMAIN_INSTRUCTION, 0,
-			      0,
-			      i * sizeof(GLuint),
-			      brw->vs.surf_bo[i]);
-	 }
-      }
-
-      free(data);
-   }
-
-   return bind_bo;
-}
-
-
-/**
- * Vertex shader surfaces.  Just constant buffer for now.  Could add vertex 
- * shader textures in the future.
- */
-static void prepare_vs_surfaces(struct brw_context *brw )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-
-   /* Update surface / buffer for vertex shader constant buffer */
-   {
-      const GLuint surf = SURF_INDEX_VERT_CONST_BUFFER;
-      struct brw_vertex_program *vp =
-         (struct brw_vertex_program *) brw->vertex_program;
-      vp->const_buffer =
-         brw_update_vs_constant_surface(ctx, surf, vp->const_buffer,
-                                        vp->program.Base.Parameters);
-
-      brw->vs.nr_surfaces = 1;
-   }
-
-   dri_bo_unreference(brw->vs.bind_bo);
-   brw->vs.bind_bo = brw_vs_get_binding_table(brw);
-
-   if (1)
-      brw->state.dirty.brw |= BRW_NEW_NR_VS_SURFACES;
-}
-
-
-static void
-prepare_surfaces(struct brw_context *brw)
-{
-   prepare_wm_surfaces(brw);
-   prepare_vs_surfaces(brw);
-}
-
-
 const struct brw_tracked_state brw_wm_surfaces = {
    .dirty = {
-      .mesa = _NEW_COLOR | _NEW_TEXTURE | _NEW_BUFFERS | _NEW_PROGRAM,
-      .brw = BRW_NEW_CONTEXT,
+      .mesa = (_NEW_COLOR |
+               _NEW_TEXTURE |
+               _NEW_BUFFERS),
+      .brw = (BRW_NEW_CONTEXT |
+	      BRW_NEW_WM_SURFACES),
       .cache = 0
    },
-   .prepare = prepare_surfaces,
+   .prepare = prepare_wm_surfaces,
 };
 
 
diff --git a/src/mesa/drivers/dri/i965/intel_generatemipmap.c b/src/mesa/drivers/dri/i965/intel_generatemipmap.c
new file mode 120000
index 00000000000..4c6b37ada01
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_generatemipmap.c
@@ -0,0 +1 @@
+../intel/intel_generatemipmap.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_read.c b/src/mesa/drivers/dri/i965/intel_pixel_read.c
new file mode 120000
index 00000000000..cc4589f4d42
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_pixel_read.c
@@ -0,0 +1 @@
+../intel/intel_pixel_read.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/intel/intel_batchbuffer.c b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
index 29dc05c518e..0f87fc46a44 100644
--- a/src/mesa/drivers/dri/intel/intel_batchbuffer.c
+++ b/src/mesa/drivers/dri/intel/intel_batchbuffer.c
@@ -195,7 +195,6 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
 {
    struct intel_context *intel = batch->intel;
    GLuint used = batch->ptr - batch->map;
-   GLboolean was_locked = intel->locked;
 
    if (used == 0) {
       batch->cliprect_mode = IGNORE_CLIPRECTS;
@@ -243,13 +242,9 @@ _intel_batchbuffer_flush(struct intel_batchbuffer *batch, const char *file,
    /* TODO: Just pass the relocation list and dma buffer up to the
     * kernel.
     */
-   if (!was_locked)
-      LOCK_HARDWARE(intel);
-
+   LOCK_HARDWARE(intel);
    do_flush_locked(batch, used, GL_FALSE);
-
-   if (!was_locked)
-      UNLOCK_HARDWARE(intel);
+   UNLOCK_HARDWARE(intel);
 
    if (INTEL_DEBUG & DEBUG_SYNC) {
       fprintf(stderr, "waiting for idle\n");
diff --git a/src/mesa/drivers/dri/intel/intel_blit.c b/src/mesa/drivers/dri/intel/intel_blit.c
index 49198281316..2e95bd1013f 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.c
+++ b/src/mesa/drivers/dri/intel/intel_blit.c
@@ -108,6 +108,8 @@ intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
 	 CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       }
 
+      assert(src->tiling != I915_TILING_Y);
+      assert(dst->tiling != I915_TILING_Y);
 #ifndef I915
       if (src->tiling != I915_TILING_NONE) {
 	 CMD |= XY_SRC_TILED;
@@ -175,66 +177,6 @@ intelCopyBuffer(const __DRIdrawablePrivate * dPriv,
    UNLOCK_HARDWARE(intel);
 }
 
-
-
-
-void
-intelEmitFillBlit(struct intel_context *intel,
-		  GLuint cpp,
-		  GLshort dst_pitch,
-		  dri_bo *dst_buffer,
-		  GLuint dst_offset,
-		  uint32_t dst_tiling,
-		  GLshort x, GLshort y,
-		  GLshort w, GLshort h,
-		  GLuint color)
-{
-   GLuint BR13, CMD;
-   BATCH_LOCALS;
-
-   dst_pitch *= cpp;
-
-   switch (cpp) {
-   case 1:
-      BR13 = (0xF0 << 16);
-      CMD = XY_COLOR_BLT_CMD;
-      break;
-   case 2:
-      BR13 = (0xF0 << 16) | BR13_565;
-      CMD = XY_COLOR_BLT_CMD;
-      break;
-   case 4:
-      BR13 = (0xF0 << 16) | BR13_8888;
-      CMD = XY_COLOR_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
-      break;
-   default:
-      return;
-   }
-#ifndef I915
-   if (dst_tiling != I915_TILING_NONE) {
-      CMD |= XY_DST_TILED;
-      dst_pitch /= 4;
-   }
-#endif
-
-   DBG("%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
-       __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h);
-
-   assert(w > 0);
-   assert(h > 0);
-
-   BEGIN_BATCH(6, NO_LOOP_CLIPRECTS);
-   OUT_BATCH(CMD);
-   OUT_BATCH(BR13 | dst_pitch);
-   OUT_BATCH((y << 16) | x);
-   OUT_BATCH(((y + h) << 16) | (x + w));
-   OUT_RELOC(dst_buffer,
-	     I915_GEM_DOMAIN_RENDER, I915_GEM_DOMAIN_RENDER,
-	     dst_offset);
-   OUT_BATCH(color);
-   ADVANCE_BATCH();
-}
-
 static GLuint translate_raster_op(GLenum logicop)
 {
    switch(logicop) {
@@ -261,7 +203,7 @@ static GLuint translate_raster_op(GLenum logicop)
 
 /* Copy BitBlt
  */
-void
+GLboolean
 intelEmitCopyBlit(struct intel_context *intel,
 		  GLuint cpp,
 		  GLshort src_pitch,
@@ -283,6 +225,19 @@ intelEmitCopyBlit(struct intel_context *intel,
    dri_bo *aper_array[3];
    BATCH_LOCALS;
 
+   if (dst_tiling != I915_TILING_NONE) {
+      if (dst_offset & 4095)
+	 return GL_FALSE;
+      if (dst_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+   if (src_tiling != I915_TILING_NONE) {
+      if (src_offset & 4095)
+	 return GL_FALSE;
+      if (src_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+
    /* do space/cliprects check before going any further */
    do {
        aper_array[0] = intel->batch->buf;
@@ -297,12 +252,7 @@ intelEmitCopyBlit(struct intel_context *intel,
    } while (pass < 2);
 
    if (pass >= 2) {
-       GLboolean locked = GL_FALSE;       
-       if (!intel->locked) {
-           LOCK_HARDWARE(intel);
-           locked = GL_TRUE;
-       }
-
+       LOCK_HARDWARE(intel);
        dri_bo_map(dst_buffer, GL_TRUE);
        dri_bo_map(src_buffer, GL_FALSE);
        _mesa_copy_rect((GLubyte *)dst_buffer->virtual + dst_offset,
@@ -316,11 +266,9 @@ intelEmitCopyBlit(struct intel_context *intel,
        
        dri_bo_unmap(src_buffer);
        dri_bo_unmap(dst_buffer);
-       
-       if (locked)
-           UNLOCK_HARDWARE(intel);
+       UNLOCK_HARDWARE(intel);
 
-       return;
+       return GL_TRUE;
    }
 
    intel_batchbuffer_require_space(intel->batch, 8 * 4, NO_LOOP_CLIPRECTS);
@@ -347,7 +295,7 @@ intelEmitCopyBlit(struct intel_context *intel,
       CMD = XY_SRC_COPY_BLT_CMD | XY_BLT_WRITE_ALPHA | XY_BLT_WRITE_RGB;
       break;
    default:
-      return;
+      return GL_FALSE;
    }
 
 #ifndef I915
@@ -362,7 +310,7 @@ intelEmitCopyBlit(struct intel_context *intel,
 #endif
 
    if (dst_y2 <= dst_y || dst_x2 <= dst_x) {
-      return;
+      return GL_TRUE;
    }
 
    assert(dst_x < dst_x2);
@@ -384,6 +332,8 @@ intelEmitCopyBlit(struct intel_context *intel,
    ADVANCE_BATCH();
 
    intel_batchbuffer_emit_mi_flush(intel->batch);
+
+   return GL_TRUE;
 }
 
 
@@ -596,7 +546,7 @@ intelClearWithBlit(GLcontext *ctx, GLbitfield mask)
    UNLOCK_HARDWARE(intel);
 }
 
-void
+GLboolean
 intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLuint cpp,
 				  GLubyte *src_bits, GLuint src_size,
@@ -612,11 +562,18 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
    int dwords = ALIGN(src_size, 8) / 4;
    uint32_t opcode, br13, blit_cmd;
 
+   if (dst_tiling != I915_TILING_NONE) {
+      if (dst_offset & 4095)
+	 return GL_FALSE;
+      if (dst_tiling == I915_TILING_Y)
+	 return GL_FALSE;
+   }
+
    assert( logic_op - GL_CLEAR >= 0 );
    assert( logic_op - GL_CLEAR < 0x10 );
 
    if (w < 0 || h < 0)
-      return;
+      return GL_TRUE;
 
    dst_pitch *= cpp;
 
@@ -673,4 +630,6 @@ intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 			   REFERENCES_CLIPRECTS );
 
    intel_batchbuffer_emit_mi_flush(intel->batch);
+
+   return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_blit.h b/src/mesa/drivers/dri/intel/intel_blit.h
index 52065b13ed7..152fa3f17bf 100644
--- a/src/mesa/drivers/dri/intel/intel_blit.h
+++ b/src/mesa/drivers/dri/intel/intel_blit.h
@@ -35,7 +35,8 @@ extern void intelCopyBuffer(const __DRIdrawablePrivate * dpriv,
 
 extern void intelClearWithBlit(GLcontext * ctx, GLbitfield mask);
 
-extern void intelEmitCopyBlit(struct intel_context *intel,
+GLboolean
+intelEmitCopyBlit(struct intel_context *intel,
                               GLuint cpp,
                               GLshort src_pitch,
                               dri_bo *src_buffer,
@@ -50,16 +51,7 @@ extern void intelEmitCopyBlit(struct intel_context *intel,
                               GLshort w, GLshort h,
 			      GLenum logicop );
 
-extern void intelEmitFillBlit(struct intel_context *intel,
-                              GLuint cpp,
-                              GLshort dst_pitch,
-                              dri_bo *dst_buffer,
-                              GLuint dst_offset,
-			      uint32_t dst_tiling,
-                              GLshort x, GLshort y,
-                              GLshort w, GLshort h, GLuint color);
-
-void
+GLboolean
 intelEmitImmediateColorExpandBlit(struct intel_context *intel,
 				  GLuint cpp,
 				  GLubyte *src_bits, GLuint src_size,
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.c b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
index 2e6b77824df..9600557f2cc 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.c
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.c
@@ -35,6 +35,9 @@
 #include "intel_batchbuffer.h"
 #include "intel_regions.h"
 
+static GLboolean
+intel_bufferobj_unmap(GLcontext * ctx,
+                      GLenum target, struct gl_buffer_object *obj);
 
 /** Allocates a new dri_bo to store the data for the buffer object. */
 static void
@@ -100,7 +103,13 @@ intel_bufferobj_free(GLcontext * ctx, struct gl_buffer_object *obj)
    struct intel_buffer_object *intel_obj = intel_buffer_object(obj);
 
    assert(intel_obj);
-   assert(!obj->Pointer); /* Mesa should have unmapped it */
+
+   /* Buffer objects are automatically unmapped when deleting according
+    * to the spec, but Mesa doesn't do UnmapBuffer for us at context destroy
+    * (though it does if you call glDeleteBuffers)
+    */
+   if (obj->Pointer)
+      intel_bufferobj_unmap(ctx, 0, obj);
 
    _mesa_free(intel_obj->sys_buffer);
    if (intel_obj->region) {
@@ -234,6 +243,9 @@ intel_bufferobj_map(GLcontext * ctx,
       return obj->Pointer;
    }
 
+   if (!read_only)
+      intelFlush(ctx);
+
    if (intel_obj->region)
       intel_bufferobj_cow(intel, intel_obj);
 
@@ -294,14 +306,19 @@ intel_bufferobj_buffer(struct intel_context *intel,
    }
 
    if (intel_obj->buffer == NULL) {
+      void *sys_buffer = intel_obj->sys_buffer;
+
+      /* only one of buffer and sys_buffer could be non-NULL */
       intel_bufferobj_alloc_buffer(intel, intel_obj);
+      intel_obj->sys_buffer = NULL;
+
       intel_bufferobj_subdata(&intel->ctx,
 			      GL_ARRAY_BUFFER_ARB,
 			      0,
 			      intel_obj->Base.Size,
-			      intel_obj->sys_buffer,
+			      sys_buffer,
 			      &intel_obj->Base);
-      _mesa_free(intel_obj->sys_buffer);
+      _mesa_free(sys_buffer);
       intel_obj->sys_buffer = NULL;
    }
 
@@ -309,15 +326,13 @@ intel_bufferobj_buffer(struct intel_context *intel,
 }
 
 void
-intel_bufferobj_init(struct intel_context *intel)
+intelInitBufferObjectFuncs(struct dd_function_table *functions)
 {
-   GLcontext *ctx = &intel->ctx;
-
-   ctx->Driver.NewBufferObject = intel_bufferobj_alloc;
-   ctx->Driver.DeleteBuffer = intel_bufferobj_free;
-   ctx->Driver.BufferData = intel_bufferobj_data;
-   ctx->Driver.BufferSubData = intel_bufferobj_subdata;
-   ctx->Driver.GetBufferSubData = intel_bufferobj_get_subdata;
-   ctx->Driver.MapBuffer = intel_bufferobj_map;
-   ctx->Driver.UnmapBuffer = intel_bufferobj_unmap;
+   functions->NewBufferObject = intel_bufferobj_alloc;
+   functions->DeleteBuffer = intel_bufferobj_free;
+   functions->BufferData = intel_bufferobj_data;
+   functions->BufferSubData = intel_bufferobj_subdata;
+   functions->GetBufferSubData = intel_bufferobj_get_subdata;
+   functions->MapBuffer = intel_bufferobj_map;
+   functions->UnmapBuffer = intel_bufferobj_unmap;
 }
diff --git a/src/mesa/drivers/dri/intel/intel_buffer_objects.h b/src/mesa/drivers/dri/intel/intel_buffer_objects.h
index 04310156319..8164407f079 100644
--- a/src/mesa/drivers/dri/intel/intel_buffer_objects.h
+++ b/src/mesa/drivers/dri/intel/intel_buffer_objects.h
@@ -60,7 +60,7 @@ dri_bo *intel_bufferobj_buffer(struct intel_context *intel,
 
 /* Hook the bufferobject implementation into mesa: 
  */
-void intel_bufferobj_init(struct intel_context *intel);
+void intelInitBufferObjectFuncs(struct dd_function_table *functions);
 
 
 
@@ -72,10 +72,7 @@ void intel_bufferobj_init(struct intel_context *intel);
 static INLINE struct intel_buffer_object *
 intel_buffer_object(struct gl_buffer_object *obj)
 {
-   if (obj->Name)
-      return (struct intel_buffer_object *) obj;
-   else
-      return NULL;
+   return (struct intel_buffer_object *) obj;
 }
 
 /* Helpers for zerocopy image uploads.  See also intel_regions.h:
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index d2fad9e4ea1..e7357e78c53 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -157,7 +157,7 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
    /* Do this here, not core Mesa, since this function is called from
     * many places within the driver.
     */
-   if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+   if (ctx->NewState & _NEW_BUFFERS) {
       /* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
       _mesa_update_framebuffer(ctx);
       /* this updates the DrawBuffer's Width/Height if it's a FBO */
@@ -345,6 +345,23 @@ intelDrawBuffer(GLcontext * ctx, GLenum mode)
 static void
 intelReadBuffer(GLcontext * ctx, GLenum mode)
 {
+   if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+      struct intel_context *const intel = intel_context(ctx);
+      const GLboolean was_front_buffer_reading =
+	intel->is_front_buffer_reading;
+
+      intel->is_front_buffer_reading = (mode == GL_FRONT_LEFT)
+	|| (mode == GL_FRONT);
+
+      /* If we weren't front-buffer reading before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+      if (!was_front_buffer_reading && intel->is_front_buffer_reading) {
+	 intel_update_renderbuffers(intel->driContext,
+				    intel->driContext->driDrawablePriv);
+      }
+   }
+
    if (ctx->ReadBuffer == ctx->DrawBuffer) {
       /* This will update FBO completeness status.
        * A framebuffer will be incomplete if the GL_READ_BUFFER setting
diff --git a/src/mesa/drivers/dri/intel/intel_chipset.h b/src/mesa/drivers/dri/intel/intel_chipset.h
index 4593d90df3d..3c38f1676c9 100644
--- a/src/mesa/drivers/dri/intel/intel_chipset.h
+++ b/src/mesa/drivers/dri/intel/intel_chipset.h
@@ -67,13 +67,18 @@
 #define PCI_CHIP_G45_G                  0x2E22
 #define PCI_CHIP_G41_G                  0x2E32
 
+#define PCI_CHIP_ILD_G                  0x0042
+#define PCI_CHIP_ILM_G                  0x0046
+
 #define IS_MOBILE(devid)	(devid == PCI_CHIP_I855_GM || \
 				 devid == PCI_CHIP_I915_GM || \
 				 devid == PCI_CHIP_I945_GM || \
 				 devid == PCI_CHIP_I945_GME || \
 				 devid == PCI_CHIP_I965_GM || \
 				 devid == PCI_CHIP_I965_GME || \
-				 devid == PCI_CHIP_GM45_GM || IS_IGD(devid))
+				 devid == PCI_CHIP_GM45_GM || \
+				 IS_IGD(devid) || \
+				 devid == PCI_CHIP_ILM_G)
 
 #define IS_G45(devid)           (devid == PCI_CHIP_IGD_E_G || \
                                  devid == PCI_CHIP_Q45_G || \
@@ -82,6 +87,10 @@
 #define IS_GM45(devid)          (devid == PCI_CHIP_GM45_GM)
 #define IS_G4X(devid)		(IS_G45(devid) || IS_GM45(devid))
 
+#define IS_ILD(devid)           (devid == PCI_CHIP_ILD_G)
+#define IS_ILM(devid)           (devid == PCI_CHIP_ILM_G)
+#define IS_IGDNG(devid)           (IS_ILD(devid) || IS_ILM(devid))
+
 #define IS_915(devid)		(devid == PCI_CHIP_I915_G || \
 				 devid == PCI_CHIP_E7221_G || \
 				 devid == PCI_CHIP_I915_GM)
@@ -99,7 +108,8 @@
 				 devid == PCI_CHIP_I965_GM || \
 				 devid == PCI_CHIP_I965_GME || \
 				 devid == PCI_CHIP_I946_GZ || \
-				 IS_G4X(devid))
+				 IS_G4X(devid) || \
+				 IS_IGDNG(devid))
 
 #define IS_9XX(devid)		(IS_915(devid) || \
 				 IS_945(devid) || \
diff --git a/src/mesa/drivers/dri/intel/intel_clear.c b/src/mesa/drivers/dri/intel/intel_clear.c
index 19f47632ac0..cfddabd3182 100644
--- a/src/mesa/drivers/dri/intel/intel_clear.c
+++ b/src/mesa/drivers/dri/intel/intel_clear.c
@@ -53,240 +53,10 @@
 #include "intel_clear.h"
 #include "intel_fbo.h"
 #include "intel_pixel.h"
+#include "intel_regions.h"
 
 #define FILE_DEBUG_FLAG DEBUG_BLIT
 
-#define TRI_CLEAR_COLOR_BITS (BUFFER_BIT_BACK_LEFT |			\
-			      BUFFER_BIT_FRONT_LEFT |			\
-			      BUFFER_BIT_COLOR0 |			\
-			      BUFFER_BIT_COLOR1 |			\
-			      BUFFER_BIT_COLOR2 |			\
-			      BUFFER_BIT_COLOR3 |			\
-			      BUFFER_BIT_COLOR4 |			\
-			      BUFFER_BIT_COLOR5 |			\
-			      BUFFER_BIT_COLOR6 |			\
-			      BUFFER_BIT_COLOR7)
-
-
-/**
- * Per-context one-time init of things for intl_clear_tris().
- * Basically set up a private array object for vertex/color arrays.
- */
-static void
-init_clear(GLcontext *ctx)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct gl_array_object *arraySave = NULL;
-   const GLuint arrayBuffer = ctx->Array.ArrayBufferObj->Name;
-   const GLuint elementBuffer = ctx->Array.ElementArrayBufferObj->Name;
-
-   /* create new array object */
-   intel->clear.arrayObj = _mesa_new_array_object(ctx, ~0);
-
-   /* save current array object, bind new one */
-   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
-
-   /* one-time setup of vertex arrays (pos, color) */
-   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, 0);
-   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, 0);
-   _mesa_ColorPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), intel->clear.color);
-   _mesa_VertexPointer(3, GL_FLOAT, 3 * sizeof(GLfloat), intel->clear.vertices);
-   _mesa_Enable(GL_COLOR_ARRAY);
-   _mesa_Enable(GL_VERTEX_ARRAY);
-
-   /* restore original array object */
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
-   _mesa_reference_array_object(ctx, &arraySave, NULL);
-
-   /* restore original buffer objects */
-   _mesa_BindBufferARB(GL_ARRAY_BUFFER_ARB, arrayBuffer);
-   _mesa_BindBufferARB(GL_ELEMENT_ARRAY_BUFFER_ARB, elementBuffer);
-}
-
-
-
-/**
- * Perform glClear where mask contains only color, depth, and/or stencil.
- *
- * The implementation is based on calling into Mesa to set GL state and
- * performing normal triangle rendering.  The intent of this path is to
- * have as generic a path as possible, so that any driver could make use of
- * it.
- */
-void
-intel_clear_tris(GLcontext *ctx, GLbitfield mask)
-{
-   struct intel_context *intel = intel_context(ctx);
-   GLfloat dst_z;
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   int i;
-   GLboolean saved_fp_enable = GL_FALSE, saved_vp_enable = GL_FALSE;
-   GLuint saved_shader_program = 0;
-   unsigned int saved_active_texture;
-   struct gl_array_object *arraySave = NULL;
-
-   if (!intel->clear.arrayObj)
-      init_clear(ctx);
-
-   assert((mask & ~(TRI_CLEAR_COLOR_BITS | BUFFER_BIT_DEPTH |
-		    BUFFER_BIT_STENCIL)) == 0);
-
-   _mesa_PushAttrib(GL_COLOR_BUFFER_BIT |
-		    GL_CURRENT_BIT |
-		    GL_DEPTH_BUFFER_BIT |
-		    GL_ENABLE_BIT |
-		    GL_POLYGON_BIT |
-		    GL_STENCIL_BUFFER_BIT |
-		    GL_TRANSFORM_BIT |
-		    GL_CURRENT_BIT);
-   saved_active_texture = ctx->Texture.CurrentUnit;
-
-   /* Disable existing GL state we don't want to apply to a clear. */
-   _mesa_Disable(GL_ALPHA_TEST);
-   _mesa_Disable(GL_BLEND);
-   _mesa_Disable(GL_CULL_FACE);
-   _mesa_Disable(GL_FOG);
-   _mesa_Disable(GL_POLYGON_SMOOTH);
-   _mesa_Disable(GL_POLYGON_STIPPLE);
-   _mesa_Disable(GL_POLYGON_OFFSET_FILL);
-   _mesa_Disable(GL_LIGHTING);
-   _mesa_Disable(GL_CLIP_PLANE0);
-   _mesa_Disable(GL_CLIP_PLANE1);
-   _mesa_Disable(GL_CLIP_PLANE2);
-   _mesa_Disable(GL_CLIP_PLANE3);
-   _mesa_Disable(GL_CLIP_PLANE4);
-   _mesa_Disable(GL_CLIP_PLANE5);
-   _mesa_PolygonMode(GL_FRONT_AND_BACK, GL_FILL);
-   if (ctx->Extensions.ARB_fragment_program && ctx->FragmentProgram.Enabled) {
-      saved_fp_enable = GL_TRUE;
-      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
-   }
-   if (ctx->Extensions.ARB_vertex_program && ctx->VertexProgram.Enabled) {
-      saved_vp_enable = GL_TRUE;
-      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
-   }
-   if (ctx->Extensions.ARB_shader_objects && ctx->Shader.CurrentProgram) {
-      saved_shader_program = ctx->Shader.CurrentProgram->Name;
-      _mesa_UseProgramObjectARB(0);
-   }
-
-   if (ctx->Texture._EnabledUnits != 0) {
-      int i;
-
-      for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-	 _mesa_ActiveTextureARB(GL_TEXTURE0 + i);
-	 _mesa_Disable(GL_TEXTURE_1D);
-	 _mesa_Disable(GL_TEXTURE_2D);
-	 _mesa_Disable(GL_TEXTURE_3D);
-	 if (ctx->Extensions.ARB_texture_cube_map)
-	    _mesa_Disable(GL_TEXTURE_CUBE_MAP_ARB);
-	 if (ctx->Extensions.NV_texture_rectangle)
-	    _mesa_Disable(GL_TEXTURE_RECTANGLE_NV);
-	 if (ctx->Extensions.MESA_texture_array) {
-	    _mesa_Disable(GL_TEXTURE_1D_ARRAY_EXT);
-	    _mesa_Disable(GL_TEXTURE_2D_ARRAY_EXT);
-	 }
-      }
-   }
-
-   /* save current array object, bind our private one */
-   _mesa_reference_array_object(ctx, &arraySave, ctx->Array.ArrayObj);
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, intel->clear.arrayObj);
-
-   intel_meta_set_passthrough_transform(intel);
-
-   for (i = 0; i < 4; i++) {
-      COPY_4FV(intel->clear.color[i], ctx->Color.ClearColor);
-   }
-
-   /* convert clear Z from [0,1] to NDC coord in [-1,1] */
-   dst_z = -1.0 + 2.0 * ctx->Depth.Clear;
-
-   /* Prepare the vertices, which are the same regardless of which buffer we're
-    * drawing to.
-    */
-   intel->clear.vertices[0][0] = fb->_Xmin;
-   intel->clear.vertices[0][1] = fb->_Ymin;
-   intel->clear.vertices[0][2] = dst_z;
-   intel->clear.vertices[1][0] = fb->_Xmax;
-   intel->clear.vertices[1][1] = fb->_Ymin;
-   intel->clear.vertices[1][2] = dst_z;
-   intel->clear.vertices[2][0] = fb->_Xmax;
-   intel->clear.vertices[2][1] = fb->_Ymax;
-   intel->clear.vertices[2][2] = dst_z;
-   intel->clear.vertices[3][0] = fb->_Xmin;
-   intel->clear.vertices[3][1] = fb->_Ymax;
-   intel->clear.vertices[3][2] = dst_z;
-
-   while (mask != 0) {
-      GLuint this_mask = 0;
-      GLuint color_bit;
-
-      color_bit = _mesa_ffs(mask & TRI_CLEAR_COLOR_BITS);
-      if (color_bit != 0)
-	 this_mask |= (1 << (color_bit - 1));
-
-      /* Clear depth/stencil in the same pass as color. */
-      this_mask |= (mask & (BUFFER_BIT_DEPTH | BUFFER_BIT_STENCIL));
-
-      /* Select the current color buffer and use the color write mask if
-       * we have one, otherwise don't write any color channels.
-       */
-      if (this_mask & BUFFER_BIT_FRONT_LEFT)
-	 _mesa_DrawBuffer(GL_FRONT_LEFT);
-      else if (this_mask & BUFFER_BIT_BACK_LEFT)
-	 _mesa_DrawBuffer(GL_BACK_LEFT);
-      else if (color_bit != 0)
-	 _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0 +
-			  (color_bit - BUFFER_COLOR0 - 1));
-      else
-	 _mesa_ColorMask(GL_FALSE, GL_FALSE, GL_FALSE, GL_FALSE);
-
-      /* Control writing of the depth clear value to depth. */
-      if (this_mask & BUFFER_BIT_DEPTH) {
-	 _mesa_DepthFunc(GL_ALWAYS);
-	 _mesa_Enable(GL_DEPTH_TEST);
-      } else {
-	 _mesa_Disable(GL_DEPTH_TEST);
-	 _mesa_DepthMask(GL_FALSE);
-      }
-
-      /* Control writing of the stencil clear value to stencil. */
-      if (this_mask & BUFFER_BIT_STENCIL) {
-	 _mesa_Enable(GL_STENCIL_TEST);
-	 _mesa_StencilOpSeparate(GL_FRONT_AND_BACK,
-				 GL_REPLACE, GL_REPLACE, GL_REPLACE);
-	 _mesa_StencilFuncSeparate(GL_FRONT_AND_BACK, GL_ALWAYS,
-				   ctx->Stencil.Clear,
-				   ctx->Stencil.WriteMask[0]);
-      } else {
-	 _mesa_Disable(GL_STENCIL_TEST);
-      }
-
-      _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
-
-      mask &= ~this_mask;
-   }
-
-   intel_meta_restore_transform(intel);
-
-   _mesa_ActiveTextureARB(GL_TEXTURE0 + saved_active_texture);
-   if (saved_fp_enable)
-      _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
-   if (saved_vp_enable)
-      _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
-
-   if (saved_shader_program)
-      _mesa_UseProgramObjectARB(saved_shader_program);
-
-   _mesa_PopAttrib();
-
-   /* restore current array object */
-   _mesa_reference_array_object(ctx, &ctx->Array.ArrayObj, arraySave);
-   _mesa_reference_array_object(ctx, &arraySave, NULL);
-}
-
 static const char *buffer_names[] = {
    [BUFFER_FRONT_LEFT] = "front",
    [BUFFER_BACK_LEFT] = "back",
@@ -340,7 +110,7 @@ intelClear(GLcontext *ctx, GLbitfield mask)
          = intel_get_rb_region(fb, BUFFER_STENCIL);
       if (stencilRegion) {
          /* have hw stencil */
-         if (IS_965(intel->intelScreen->deviceID) ||
+         if (stencilRegion->tiling == I915_TILING_Y ||
 	     (ctx->Stencil.WriteMask[0] & 0xff) != 0xff) {
 	    /* We have to use the 3D engine if we're clearing a partial mask
 	     * of the stencil buffer, or if we're on a 965 which has a tiled
@@ -357,9 +127,10 @@ intelClear(GLcontext *ctx, GLbitfield mask)
 
    /* HW depth */
    if (mask & BUFFER_BIT_DEPTH) {
+      const struct intel_region *irb = intel_get_rb_region(fb, BUFFER_DEPTH);
+
       /* clear depth with whatever method is used for stencil (see above) */
-      if (IS_965(intel->intelScreen->deviceID) ||
-	  tri_mask & BUFFER_BIT_STENCIL)
+      if (irb->tiling == I915_TILING_Y || tri_mask & BUFFER_BIT_STENCIL)
          tri_mask |= BUFFER_BIT_DEPTH;
       else
          blit_mask |= BUFFER_BIT_DEPTH;
@@ -411,7 +182,7 @@ intelClear(GLcontext *ctx, GLbitfield mask)
 	 }
 	 DBG("\n");
       }
-      intel_clear_tris(ctx, tri_mask);
+      meta_clear_tris(&intel->meta, tri_mask);
    }
 
    if (swrast_mask) {
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index cfd983d3682..4abb525f78e 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -161,6 +161,12 @@ intelGetString(GLcontext * ctx, GLenum name)
       case PCI_CHIP_G41_G:
          chipset = "Intel(R) G41";
          break;
+      case PCI_CHIP_ILD_G:
+         chipset = "Intel(R) IGDNG_D";
+         break;
+      case PCI_CHIP_ILM_G:
+         chipset = "Intel(R) IGDNG_M";
+         break;
       default:
          chipset = "Unknown Intel Chipset";
          break;
@@ -220,7 +226,9 @@ intel_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
       struct intel_renderbuffer *stencil_rb;
 
       i = 0;
-      if ((intel->is_front_buffer_rendering || !intel_fb->color_rb[1])
+      if ((intel->is_front_buffer_rendering ||
+	   intel->is_front_buffer_reading ||
+	   !intel_fb->color_rb[1])
 	   && intel_fb->color_rb[0]) {
 	 attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
 	 attachments[i++] = intel_bits_per_pixel(intel_fb->color_rb[0]);
@@ -396,7 +404,7 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
     if (!driContext->driScreenPriv->dri2.enabled)
 	return;
 
-    if (!intel->internal_viewport_call && ctx->DrawBuffer->Name == 0) {
+    if (!intel->meta.internal_viewport_call && ctx->DrawBuffer->Name == 0) {
        /* If we're rendering to the fake front buffer, make sure all the pending
 	* drawing has landed on the real front buffer.  Otherwise when we
 	* eventually get to DRI2GetBuffersWithFormat the stale real front
@@ -561,10 +569,14 @@ intelInitDriverFunctions(struct dd_function_table *functions)
    functions->CopyConvolutionFilter2D = _swrast_CopyConvolutionFilter2D;
 
    intelInitTextureFuncs(functions);
+   intelInitTextureImageFuncs(functions);
+   intelInitTextureSubImageFuncs(functions);
+   intelInitTextureCopyImageFuncs(functions);
    intelInitStateFuncs(functions);
    intelInitClearFuncs(functions);
    intelInitBufferFuncs(functions);
    intelInitPixelFuncs(functions);
+   intelInitBufferObjectFuncs(functions);
 }
 
 
@@ -660,7 +672,15 @@ intelInitContext(struct intel_context *intel,
     */
    _mesa_init_point(ctx);
 
+   meta_init_metaops(ctx, &intel->meta);
    ctx->Const.MaxColorAttachments = 4;  /* XXX FBO: review this */
+   if (IS_965(intelScreen->deviceID)) {
+      if (MAX_WIDTH > 8192)
+	 ctx->Const.MaxRenderbufferSize = 8192;
+   } else {
+      if (MAX_WIDTH > 2048)
+	 ctx->Const.MaxRenderbufferSize = 2048;
+   }
 
    /* Initialize the software rasterizer and helper modules. */
    _swrast_CreateContext(ctx);
@@ -718,7 +738,6 @@ intelInitContext(struct intel_context *intel,
 
    intel->batch = intel_batchbuffer_alloc(intel);
 
-   intel_bufferobj_init(intel);
    intel_fbo_init(intel);
 
    if (intel->ctx.Mesa_DXTn) {
@@ -728,6 +747,15 @@ intelInitContext(struct intel_context *intel,
    else if (driQueryOptionb(&intel->optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
    }
+   intel->use_texture_tiling = driQueryOptionb(&intel->optionCache,
+					       "texture_tiling");
+   if (intel->use_texture_tiling &&
+       !intel->intelScreen->kernel_exec_fencing) {
+      fprintf(stderr, "No kernel support for execution fencing, "
+	      "disabling texture tiling");
+      intel->use_texture_tiling = GL_FALSE;
+   }
+   intel->use_early_z = driQueryOptionb(&intel->optionCache, "early_z");
 
    intel->prim.primitive = ~0;
 
@@ -767,8 +795,7 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       INTEL_FIREVERTICES(intel);
 
-      if (intel->clear.arrayObj)
-         _mesa_delete_array_object(&intel->ctx, intel->clear.arrayObj);
+      meta_destroy_metaops(&intel->meta);
 
       intel->vtbl.destroy(intel);
 
@@ -789,13 +816,64 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
       intel->prim.vb_bo = NULL;
 
       if (release_texture_heaps) {
-         /* This share group is about to go away, free our private
-          * texture object data.
+         /* Nothing is currently done here to free texture heaps;
+          * but we're not using the texture heap utilities, so I
+          * rather think we shouldn't.  I've taken a look, and can't
+          * find any private texture data hanging around anywhere, but
+          * I'm not yet certain there isn't any at all...
           */
-         if (INTEL_DEBUG & DEBUG_TEXTURE)
+         /* if (INTEL_DEBUG & DEBUG_TEXTURE)
             fprintf(stderr, "do something to free texture heaps\n");
+          */
       }
 
+      /* XXX In intelMakeCurrent() below, the context's static regions are 
+       * referenced inside the frame buffer; it's listed as a hack,
+       * with a comment of "XXX FBO temporary fix-ups!", but
+       * as long as it's there, we should release the regions here.
+       * The do/while loop around the block is used to allow the
+       * "continue" statements inside the block to exit the block,
+       * to avoid many layers of "if" constructs.
+       */
+      do {
+         __DRIdrawablePrivate * driDrawPriv = intel->driDrawable;
+         struct intel_framebuffer *intel_fb;
+         struct intel_renderbuffer *irbDepth, *irbStencil;
+         if (!driDrawPriv) {
+            /* We're already detached from the drawable; exit this block. */
+            continue;
+         }
+         intel_fb = (struct intel_framebuffer *) driDrawPriv->driverPrivate;
+         if (!intel_fb) {
+            /* The frame buffer is already gone; exit this block. */
+            continue;
+         }
+         irbDepth = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
+         irbStencil = intel_get_renderbuffer(&intel_fb->Base, BUFFER_STENCIL);
+
+         /* If the regions of the frame buffer still match the regions
+          * of the context, release them.  If they've changed somehow,
+          * leave them alone.
+          */
+         if (intel_fb->color_rb[0] && intel_fb->color_rb[0]->region == intel->front_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[0], NULL);
+         }
+         if (intel_fb->color_rb[1] && intel_fb->color_rb[1]->region == intel->back_region) {
+	    intel_renderbuffer_set_region(intel_fb->color_rb[1], NULL);
+         }
+
+         if (irbDepth && irbDepth->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbDepth, NULL);
+         }
+         /* Usually, the stencil buffer is the same as the depth buffer;
+          * but they're handled separately in MakeCurrent, so we'll
+          * handle them separately here.
+          */
+         if (irbStencil && irbStencil->region == intel->depth_region) {
+	    intel_renderbuffer_set_region(irbStencil, NULL);
+         }
+      } while (0);
+
       intel_region_release(&intel->front_region);
       intel_region_release(&intel->back_region);
       intel_region_release(&intel->depth_region);
@@ -804,6 +882,9 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
 
       /* free the Mesa context */
       _mesa_free_context_data(&intel->ctx);
+
+      FREE(intel);
+      driContextPriv->driverPrivate = NULL;
    }
 }
 
@@ -832,7 +913,10 @@ intelMakeCurrent(__DRIcontextPrivate * driContextPriv,
           if (driDrawPriv != driReadPriv)
               intel_update_renderbuffers(driContextPriv, driReadPriv);
       } else {
-          /* XXX FBO temporary fix-ups! */
+          /* XXX FBO temporary fix-ups!  These are released in 
+           * intelDextroyContext(), above.  Changes here should be
+           * reflected there.
+           */
           /* if the renderbuffers don't have regions, init them from the context */
          struct intel_renderbuffer *irbDepth
             = intel_get_renderbuffer(&intel_fb->Base, BUFFER_DEPTH);
@@ -917,7 +1001,6 @@ intelContendedLock(struct intel_context *intel, GLuint flags)
    int me = intel->hHWContext;
 
    drmGetLock(intel->driFd, intel->hHWContext, flags);
-   intel->locked = 1;
 
    if (INTEL_DEBUG & DEBUG_LOCK)
       _mesa_printf("%s - got contended lock\n", __progname);
@@ -974,9 +1057,12 @@ void LOCK_HARDWARE( struct intel_context *intel )
     struct intel_framebuffer *intel_fb = NULL;
     struct intel_renderbuffer *intel_rb = NULL;
 
-    _glthread_LOCK_MUTEX(lockMutex);
-    assert(!intel->locked);
-    intel->locked = 1;
+    intel->locked++;
+    if (intel->locked >= 2)
+       return;
+
+    if (!sPriv->dri2.enabled)
+       _glthread_LOCK_MUTEX(lockMutex);
 
     if (intel->driDrawable) {
        intel_fb = intel->driDrawable->driverPrivate;
@@ -1023,13 +1109,16 @@ void UNLOCK_HARDWARE( struct intel_context *intel )
 {
     __DRIscreen *sPriv = intel->driScreen;
 
-   intel->vtbl.note_unlock( intel );
-   intel->locked = 0;
+   intel->locked--;
+   if (intel->locked > 0)
+      return;
 
-   if (!sPriv->dri2.enabled)
-      DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
+   assert(intel->locked == 0);
 
-   _glthread_UNLOCK_MUTEX(lockMutex);
+   if (!sPriv->dri2.enabled) {
+      DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
+      _glthread_UNLOCK_MUTEX(lockMutex);
+   }
 
    if (INTEL_DEBUG & DEBUG_LOCK)
       _mesa_printf("%s - unlocked\n", __progname);
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index b5cf7e6648e..08bea88c958 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -33,6 +33,7 @@
 #include "main/mtypes.h"
 #include "main/mm.h"
 #include "texmem.h"
+#include "dri_metaops.h"
 #include "drm.h"
 #include "intel_bufmgr.h"
 
@@ -91,7 +92,6 @@ struct intel_context
       void (*new_batch) (struct intel_context * intel);
       void (*emit_invarient_state) (struct intel_context * intel);
       void (*note_fence) (struct intel_context *intel, GLuint fence);
-      void (*note_unlock) (struct intel_context *intel);
       void (*update_texture_state) (struct intel_context * intel);
 
       void (*render_start) (struct intel_context * intel);
@@ -158,19 +158,7 @@ struct intel_context
       void (*debug_batch)(struct intel_context *intel);
    } vtbl;
 
-   struct {
-      struct gl_fragment_program *bitmap_fp;
-      struct gl_vertex_program *passthrough_vp;
-
-      struct gl_fragment_program *saved_fp;
-      GLboolean saved_fp_enable;
-      struct gl_vertex_program *saved_vp;
-      GLboolean saved_vp_enable;
-
-      GLint saved_vp_x, saved_vp_y;
-      GLsizei saved_vp_width, saved_vp_height;
-      GLenum saved_matrix_mode;
-   } meta;
+   struct dri_metaops meta;
 
    GLint refcount;
    GLuint Fallback;
@@ -182,7 +170,6 @@ struct intel_context
    struct intel_region *front_region;
    struct intel_region *back_region;
    struct intel_region *depth_region;
-   GLboolean internal_viewport_call;
 
    /**
     * This value indicates that the kernel memory manager is being used
@@ -215,13 +202,6 @@ struct intel_context
    GLuint ClearColor565;
    GLuint ClearColor8888;
 
-   /* info for intel_clear_tris() */
-   struct
-   {
-      struct gl_array_object *arrayObj;
-      GLfloat vertices[4][3];
-      GLfloat color[4][4];
-   } clear;
 
    /* Offsets of fields within the current vertex:
     */
@@ -294,6 +274,17 @@ struct intel_context
     * easily.
     */
    GLboolean is_front_buffer_rendering;
+   /**
+    * Track whether front-buffer is the current read target.
+    *
+    * This is closely associated with is_front_buffer_rendering, but may
+    * be set separately.  The DRI2 fake front buffer must be referenced
+    * either way.
+    */
+   GLboolean is_front_buffer_reading;
+
+   GLboolean use_texture_tiling;
+   GLboolean use_early_z;
 
    drm_clip_rect_t fboRect;     /**< cliprect for FBO rendering */
 
@@ -549,6 +540,9 @@ void intel_viewport(GLcontext * ctx, GLint x, GLint y,
 void intel_update_renderbuffers(__DRIcontext *context,
 				__DRIdrawable *drawable);
 
+void i915_set_buf_info_for_region(uint32_t *state, struct intel_region *region,
+				  uint32_t buffer_id);
+
 /*======================================================================
  * Inline conversion functions.  
  * These are better-typed than the macros used previously:
diff --git a/src/mesa/drivers/dri/intel/intel_extensions.c b/src/mesa/drivers/dri/intel/intel_extensions.c
index 9ec1b4ec2f4..7742609d242 100644
--- a/src/mesa/drivers/dri/intel/intel_extensions.c
+++ b/src/mesa/drivers/dri/intel/intel_extensions.c
@@ -34,6 +34,7 @@
 #define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_shader_objects
+#define need_GL_ARB_vertex_array_object
 #define need_GL_ARB_vertex_program
 #define need_GL_ARB_vertex_shader
 #define need_GL_ARB_window_pos
@@ -45,9 +46,11 @@
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_framebuffer_blit
+#define need_GL_EXT_gpu_program_parameters
 #define need_GL_EXT_point_parameters
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
+#define need_GL_APPLE_vertex_array_object
 #define need_GL_ATI_separate_stencil
 #define need_GL_ATI_envmap_bumpmap
 #define need_GL_NV_point_sprite
@@ -65,6 +68,7 @@
  * i965_dri.
  */
 static const struct dri_extension card_extensions[] = {
+   { "GL_ARB_half_float_pixel",           NULL },
    { "GL_ARB_multitexture",               NULL },
    { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
    { "GL_ARB_texture_border_clamp",       NULL },
@@ -75,6 +79,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_ARB_texture_env_dot3",           NULL },
    { "GL_ARB_texture_mirrored_repeat",    NULL },
    { "GL_ARB_texture_rectangle",          NULL },
+   { "GL_ARB_vertex_array_object",        GL_ARB_vertex_array_object_functions},
    { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
    { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
    { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
@@ -85,6 +90,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_blend_subtract",             NULL },
    { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
    { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+   { "GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions },
    { "GL_EXT_packed_depth_stencil",       NULL },
    { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
    { "GL_EXT_stencil_wrap",               NULL },
@@ -95,6 +101,7 @@ static const struct dri_extension card_extensions[] = {
    { "GL_EXT_texture_lod_bias",           NULL },
    { "GL_3DFX_texture_compression_FXT1",  NULL },
    { "GL_APPLE_client_storage",           NULL },
+   { "GL_APPLE_vertex_array_object",      GL_APPLE_vertex_array_object_functions},
    { "GL_MESA_pack_invert",               NULL },
    { "GL_MESA_ycbcr_texture",             NULL },
    { "GL_NV_blend_square",                NULL },
diff --git a/src/mesa/drivers/dri/intel/intel_fbo.c b/src/mesa/drivers/dri/intel/intel_fbo.c
index 30f58b1f44a..666893596e1 100644
--- a/src/mesa/drivers/dri/intel/intel_fbo.c
+++ b/src/mesa/drivers/dri/intel/intel_fbo.c
@@ -217,7 +217,8 @@ intel_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
       DBG("Allocating %d x %d Intel RBO (pitch %d)\n", width,
 	  height, pitch);
 
-      irb->region = intel_region_alloc(intel, cpp, width, height, pitch,
+      irb->region = intel_region_alloc(intel, I915_TILING_NONE,
+				       cpp, width, height, pitch,
 				       GL_TRUE);
       if (!irb->region)
          return GL_FALSE;       /* out of memory? */
@@ -575,9 +576,10 @@ intel_render_texture(GLcontext * ctx,
 
    ASSERT(newImage);
 
-   if (newImage->Border != 0) {
-      /* Fallback on drawing to a texture with a border, which won't have a
-       * miptree.
+   intel_image = intel_texture_image(newImage);
+   if (!intel_image->mt) {
+      /* Fallback on drawing to a texture that doesn't have a miptree
+       * (has a border, width/height 0, etc.)
        */
       _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
       _mesa_render_texture(ctx, fb, att);
@@ -608,7 +610,6 @@ intel_render_texture(GLcontext * ctx,
        irb->Base.RefCount);
 
    /* point the renderbufer's region to the texture image region */
-   intel_image = intel_texture_image(newImage);
    if (irb->region != intel_image->mt->region) {
       if (irb->region)
 	 intel_region_release(&irb->region);
@@ -680,6 +681,11 @@ intel_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
       if (rb == NULL)
 	 continue;
 
+      if (irb == NULL) {
+	 fb->_Status = GL_FRAMEBUFFER_UNSUPPORTED_EXT;
+	 continue;
+      }
+
       switch (irb->texformat->MesaFormat) {
       case MESA_FORMAT_ARGB8888:
       case MESA_FORMAT_RGB565:
diff --git a/src/mesa/drivers/dri/intel/intel_generatemipmap.c b/src/mesa/drivers/dri/intel/intel_generatemipmap.c
new file mode 100644
index 00000000000..fe986092db6
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_generatemipmap.c
@@ -0,0 +1,283 @@
+/*
+ * Copyright (C) 1999-2007  Brian Paul   All Rights Reserved.
+ * Copyright © 2009 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <[email protected]>
+ *
+ */
+
+#include "main/glheader.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+#include "main/bufferobj.h"
+#include "main/teximage.h"
+#include "main/texenv.h"
+#include "main/texobj.h"
+#include "main/texstate.h"
+#include "main/texparam.h"
+#include "main/varray.h"
+#include "main/attrib.h"
+#include "main/enable.h"
+#include "main/buffers.h"
+#include "main/fbobject.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/depth.h"
+#include "main/hash.h"
+#include "main/mipmap.h"
+#include "main/blend.h"
+#include "glapi/dispatch.h"
+#include "swrast/swrast.h"
+
+#include "intel_screen.h"
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_pixel.h"
+#include "intel_tex.h"
+#include "intel_mipmap_tree.h"
+
+static const char *intel_fp_tex2d =
+      "!!ARBfp1.0\n"
+      "TEX result.color, fragment.texcoord[0], texture[0], 2D;\n"
+      "END\n";
+
+static GLboolean
+intel_generate_mipmap_level(GLcontext *ctx, GLuint tex_name,
+			    int level, int width, int height)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLfloat vertices[4][2];
+   GLint status;
+
+   /* Set to source from the previous level */
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_BASE_LEVEL, level - 1);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAX_LEVEL, level - 1);
+
+   /* Set to draw into the current level */
+   _mesa_FramebufferTexture2DEXT(GL_FRAMEBUFFER_EXT,
+				 GL_COLOR_ATTACHMENT0_EXT,
+				 GL_TEXTURE_2D,
+				 tex_name,
+				 level);
+   /* Choose to render to the color attachment. */
+   _mesa_DrawBuffer(GL_COLOR_ATTACHMENT0_EXT);
+
+   status = _mesa_CheckFramebufferStatusEXT (GL_FRAMEBUFFER_EXT);
+   if (status != GL_FRAMEBUFFER_COMPLETE_EXT)
+      return GL_FALSE;
+
+   meta_set_passthrough_transform(&intel->meta);
+
+   /* XXX: Doing it right would involve setting up the transformation to do
+    * 0-1 mapping or something, and not changing the vertex data.
+    */
+   vertices[0][0] = 0;
+   vertices[0][1] = 0;
+   vertices[1][0] = width;
+   vertices[1][1] = 0;
+   vertices[2][0] = width;
+   vertices[2][1] = height;
+   vertices[3][0] = 0;
+   vertices[3][1] = height;
+
+   _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
+   _mesa_Enable(GL_VERTEX_ARRAY);
+   meta_set_default_texrect(&intel->meta);
+
+   _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
+
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
+
+   return GL_TRUE;
+}
+
+static GLboolean
+intel_generate_mipmap_2d(GLcontext *ctx,
+			 GLenum target,
+			 struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   GLint old_active_texture;
+   int level, max_levels, start_level, end_level;
+   GLuint fb_name;
+   GLboolean success = GL_FALSE;
+   struct gl_framebuffer *saved_fbo = NULL;
+
+   _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
+		    GL_CURRENT_BIT | GL_COLOR_BUFFER_BIT |
+		    GL_DEPTH_BUFFER_BIT);
+   _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
+   old_active_texture = ctx->Texture.CurrentUnit;
+   _mesa_reference_framebuffer(&saved_fbo, ctx->DrawBuffer);
+
+   _mesa_Disable(GL_POLYGON_STIPPLE);
+   _mesa_Disable(GL_DEPTH_TEST);
+   _mesa_Disable(GL_STENCIL_TEST);
+   _mesa_ColorMask(GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE);
+   _mesa_DepthMask(GL_FALSE);
+
+   /* Bind the given texture to GL_TEXTURE_2D with linear filtering for our
+    * minification.
+    */
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB);
+   _mesa_Enable(GL_TEXTURE_2D);
+   _mesa_BindTexture(GL_TEXTURE_2D, texObj->Name);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER,
+		       GL_LINEAR_MIPMAP_NEAREST);
+   _mesa_TexParameterf(GL_TEXTURE_2D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
+
+   /* Bind the new renderbuffer to the color attachment point. */
+   _mesa_GenFramebuffersEXT(1, &fb_name);
+   _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, fb_name);
+
+   meta_set_fragment_program(&intel->meta, &intel->meta.tex2d_fp,
+			     intel_fp_tex2d);
+   meta_set_passthrough_vertex_program(&intel->meta);
+
+   max_levels = _mesa_max_texture_levels(ctx, texObj->Target);
+   start_level = texObj->BaseLevel;
+   end_level = texObj->MaxLevel;
+
+   /* Loop generating level+1 from level. */
+   for (level = start_level; level < end_level && level < max_levels - 1; level++) {
+      const struct gl_texture_image *srcImage;
+      int width, height;
+
+      srcImage = _mesa_select_tex_image(ctx, texObj, target, level);
+      if (srcImage->Border != 0)
+	 goto fail;
+
+      width = srcImage->Width / 2;
+      if (width < 1)
+	 width = 1;
+      height = srcImage->Height / 2;
+      if (height < 1)
+	 height = 1;
+
+      if (width == srcImage->Width &&
+	  height == srcImage->Height) {
+	 /* Neither _mesa_max_texture_levels nor texObj->MaxLevel are the
+	  * maximum texture level for the object, so break out when we've gone
+	  * over the edge.
+	  */
+	 break;
+      }
+
+      /* Make sure that there's space allocated for the target level.
+       * We could skip this if there's already space allocated and save some
+       * time.
+       */
+      _mesa_TexImage2D(GL_TEXTURE_2D, level + 1, srcImage->InternalFormat,
+		       width, height, 0,
+		       GL_RGBA, GL_UNSIGNED_INT, NULL);
+
+      if (!intel_generate_mipmap_level(ctx, texObj->Name, level + 1,
+				       width, height))
+	 goto fail;
+   }
+
+   success = GL_TRUE;
+
+fail:
+   meta_restore_fragment_program(&intel->meta);
+   meta_restore_vertex_program(&intel->meta);
+
+   _mesa_DeleteFramebuffersEXT(1, &fb_name);
+   _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
+   if (saved_fbo)
+      _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, saved_fbo->Name);
+   _mesa_reference_framebuffer(&saved_fbo, NULL);
+   _mesa_PopClientAttrib();
+   _mesa_PopAttrib();
+
+   return success;
+}
+
+
+/**
+ * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
+ * level).
+ *
+ * The texture object's miptree must be mapped.
+ *
+ * It would be really nice if this was just called by Mesa whenever mipmaps
+ * needed to be regenerated, rather than us having to remember to do so in
+ * each texture image modification path.
+ *
+ * This function should also include an accelerated path.
+ */
+void
+intel_generate_mipmap(GLcontext *ctx, GLenum target,
+                      struct gl_texture_object *texObj)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_texture_object *intelObj = intel_texture_object(texObj);
+   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+   int face, i;
+
+   /* HW path */
+   if (target == GL_TEXTURE_2D &&
+       ctx->Extensions.EXT_framebuffer_object &&
+       ctx->Extensions.ARB_fragment_program &&
+       ctx->Extensions.ARB_vertex_program) {
+      GLboolean success;
+
+      /* We'll be accessing this texture using GL entrypoints, which should
+       * be resilient against other access to this texture.
+       */
+      _mesa_unlock_texture(ctx, texObj);
+      success = intel_generate_mipmap_2d(ctx, target, texObj);
+      _mesa_lock_texture(ctx, texObj);
+
+      if (success)
+	 return;
+   }
+
+   /* SW path */
+   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
+   _mesa_generate_mipmap(ctx, target, texObj);
+   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
+
+   /* Update the level information in our private data in the new images, since
+    * it didn't get set as part of a normal TexImage path.
+    */
+   for (face = 0; face < nr_faces; face++) {
+      for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
+         struct intel_texture_image *intelImage;
+
+	 intelImage = intel_texture_image(texObj->Image[face][i]);
+	 if (intelImage == NULL)
+	    break;
+
+	 intelImage->level = i;
+	 intelImage->face = face;
+	 /* Unreference the miptree to signal that the new Data is a bare
+	  * pointer from mesa.
+	  */
+	 intel_miptree_release(intel, &intelImage->mt);
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
index 6e1e034e53d..c985da5aa25 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.c
@@ -57,14 +57,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 			      GLuint last_level,
 			      GLuint width0,
 			      GLuint height0,
-			      GLuint depth0, GLuint cpp, GLuint compress_byte)
+			      GLuint depth0, GLuint cpp, GLuint compress_byte,
+			      uint32_t tiling)
 {
    GLboolean ok;
    struct intel_mipmap_tree *mt = calloc(sizeof(*mt), 1);
 
-   DBG("%s target %s format %s level %d..%d\n", __FUNCTION__,
+   DBG("%s target %s format %s level %d..%d <-- %p\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target),
-       _mesa_lookup_enum_by_nr(internal_format), first_level, last_level);
+       _mesa_lookup_enum_by_nr(internal_format), 
+       first_level, last_level, mt);
 
    mt->target = target_to_target(target);
    mt->internal_format = internal_format;
@@ -80,15 +82,16 @@ intel_miptree_create_internal(struct intel_context *intel,
 
 #ifdef I915
    if (IS_945(intel->intelScreen->deviceID))
-      ok = i945_miptree_layout(intel, mt);
+      ok = i945_miptree_layout(intel, mt, tiling);
    else
-      ok = i915_miptree_layout(intel, mt);
+      ok = i915_miptree_layout(intel, mt, tiling);
 #else
-   ok = brw_miptree_layout(intel, mt);
+   ok = brw_miptree_layout(intel, mt, tiling);
 #endif
 
    if (!ok) {
       free(mt);
+      DBG("%s not okay - returning NULL\n", __FUNCTION__);
       return NULL;
    }
 
@@ -98,6 +101,7 @@ intel_miptree_create_internal(struct intel_context *intel,
 struct intel_mipmap_tree *
 intel_miptree_create(struct intel_context *intel,
 		     GLenum target,
+		     GLenum base_format,
 		     GLenum internal_format,
 		     GLuint first_level,
 		     GLuint last_level,
@@ -107,10 +111,23 @@ intel_miptree_create(struct intel_context *intel,
 		     GLboolean expect_accelerated_upload)
 {
    struct intel_mipmap_tree *mt;
+   uint32_t tiling;
+
+   if (intel->use_texture_tiling && compress_byte == 0 &&
+       intel->intelScreen->kernel_exec_fencing) {
+      if (IS_965(intel->intelScreen->deviceID) &&
+	  (base_format == GL_DEPTH_COMPONENT ||
+	   base_format == GL_DEPTH_STENCIL_EXT))
+	 tiling = I915_TILING_Y;
+      else
+	 tiling = I915_TILING_X;
+   } else
+      tiling = I915_TILING_NONE;
 
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level, width0,
-				      height0, depth0, cpp, compress_byte);
+				      height0, depth0, cpp, compress_byte,
+				      tiling);
    /*
     * pitch == 0 || height == 0  indicates the null texture
     */
@@ -118,6 +135,7 @@ intel_miptree_create(struct intel_context *intel,
       return NULL;
 
    mt->region = intel_region_alloc(intel,
+				   tiling,
 				   mt->cpp,
 				   mt->pitch,
 				   mt->total_height,
@@ -147,7 +165,8 @@ intel_miptree_create_for_region(struct intel_context *intel,
    mt = intel_miptree_create_internal(intel, target, internal_format,
 				      first_level, last_level,
 				      region->width, region->height, 1,
-				      region->cpp, compress_byte);
+				      region->cpp, compress_byte,
+				      I915_TILING_NONE);
    if (!mt)
       return mt;
 #if 0
@@ -185,6 +204,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch)
 {
 #ifdef I915
@@ -205,6 +225,11 @@ int intel_miptree_pitch_align (struct intel_context *intel,
 	 pitch_align = 4;
       }
 
+      if (tiling == I915_TILING_X)
+	 pitch_align = 512;
+      else if (tiling == I915_TILING_Y)
+	 pitch_align = 128;
+
       pitch = ALIGN(pitch * mt->cpp, pitch_align);
 
 #ifdef I915
@@ -328,23 +353,31 @@ intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
 }
 
 
-
 void
-intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
-			       GLuint level, GLuint img,
-			       GLuint x, GLuint y)
+intel_miptree_set_image_offset_ex(struct intel_mipmap_tree *mt,
+                                  GLuint level, GLuint img,
+                                  GLuint x, GLuint y, 
+                                  GLuint offset)
 {
    if (img == 0 && level == 0)
       assert(x == 0 && y == 0);
 
    assert(img < mt->level[level].nr_images);
 
-   mt->level[level].image_offset[img] = (x + y * mt->pitch) * mt->cpp;
+   mt->level[level].image_offset[img] = (x + y * mt->pitch) * mt->cpp + offset;
 
    DBG("%s level %d img %d pos %d,%d image_offset %x\n",
        __FUNCTION__, level, img, x, y, mt->level[level].image_offset[img]);
 }
 
+void
+intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
+			       GLuint level, GLuint img,
+			       GLuint x, GLuint y)
+{
+    intel_miptree_set_image_offset_ex(mt, level, img, x, y, 0);
+}
+
 
 /* Although we use the image_offset[] array to store relative offsets
  * to cube faces, Mesa doesn't know anything about this and expects
@@ -454,11 +487,11 @@ intel_miptree_image_data(struct intel_context *intel,
 			0, 0,                             /* source x, y */
 			dst->level[level].width, height); /* width, height */
 
-      src += src_image_pitch * dst->cpp;
+      src = (char *)src + src_image_pitch * dst->cpp;
    }
 }
 
-extern GLuint intel_compressed_alignment(GLenum);
+extern void intel_get_texture_alignment_unit(GLenum, GLuint *, GLuint *);
 /* Copy mipmap image between trees
  */
 void
@@ -475,20 +508,37 @@ intel_miptree_image_copy(struct intel_context *intel,
    const GLuint *dst_depth_offset = intel_miptree_depth_offsets(dst, level);
    const GLuint *src_depth_offset = intel_miptree_depth_offsets(src, level);
    GLuint i;
+   GLboolean success;
 
    if (dst->compressed) {
-       GLuint alignment = intel_compressed_alignment(dst->internal_format);
+       GLuint align_w, align_h;
+
+       intel_get_texture_alignment_unit(dst->internal_format, &align_w, &align_h);
        height = (height + 3) / 4;
-       width = ((width + alignment - 1) & ~(alignment - 1));
+       width = ALIGN(width, align_w);
    }
 
    for (i = 0; i < depth; i++) {
-      intel_region_copy(intel,
-                        dst->region, dst_offset + dst_depth_offset[i],
-                        0,
-                        0,
-                        src->region, src_offset + src_depth_offset[i],
-                        0, 0, width, height);
+      success = intel_region_copy(intel,
+				  dst->region, dst_offset + dst_depth_offset[i],
+				  0, 0,
+				  src->region, src_offset + src_depth_offset[i],
+				  0, 0, width, height, GL_COPY);
+      if (!success) {
+	 GLubyte *src_ptr, *dst_ptr;
+
+	 src_ptr = intel_region_map(intel, src->region);
+	 dst_ptr = intel_region_map(intel, dst->region);
+
+	 _mesa_copy_rect(dst_ptr + dst_offset + dst_depth_offset[i],
+			 dst->cpp,
+			 dst->pitch,
+			 0, 0, width, height,
+			 src_ptr + src_offset + src_depth_offset[i],
+			 src->pitch,
+			 0, 0);
+	 intel_region_unmap(intel, src->region);
+	 intel_region_unmap(intel, dst->region);
+      }
    }
-
 }
diff --git a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
index 4060b9df78f..c890b2a0d04 100644
--- a/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/intel/intel_mipmap_tree.h
@@ -126,6 +126,7 @@ struct intel_mipmap_tree
 
 struct intel_mipmap_tree *intel_miptree_create(struct intel_context *intel,
                                                GLenum target,
+                                               GLenum base_format,
                                                GLenum internal_format,
                                                GLuint first_level,
                                                GLuint last_level,
@@ -148,6 +149,7 @@ intel_miptree_create_for_region(struct intel_context *intel,
 
 int intel_miptree_pitch_align (struct intel_context *intel,
 			       struct intel_mipmap_tree *mt,
+			       uint32_t tiling,
 			       int pitch);
 
 void intel_miptree_reference(struct intel_mipmap_tree **dst,
@@ -194,6 +196,11 @@ void intel_miptree_set_level_info(struct intel_mipmap_tree *mt,
                                   GLuint x, GLuint y,
                                   GLuint w, GLuint h, GLuint d);
 
+void intel_miptree_set_image_offset_ex(struct intel_mipmap_tree *mt,
+                                       GLuint level,
+                                       GLuint img, GLuint x, GLuint y,
+                                       GLuint offset);
+
 void intel_miptree_set_image_offset(struct intel_mipmap_tree *mt,
                                     GLuint level,
                                     GLuint img, GLuint x, GLuint y);
@@ -218,10 +225,13 @@ void intel_miptree_image_copy(struct intel_context *intel,
 /* i915_mipmap_tree.c:
  */
 GLboolean i915_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean i945_miptree_layout(struct intel_context *intel,
-			      struct intel_mipmap_tree *mt);
+			      struct intel_mipmap_tree *mt,
+			      uint32_t tiling);
 GLboolean brw_miptree_layout(struct intel_context *intel,
-			     struct intel_mipmap_tree *mt);
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling);
 
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.c b/src/mesa/drivers/dri/intel/intel_pixel.c
index fc0ac0b79c0..a3001416559 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel.c
@@ -27,9 +27,12 @@
 
 #include "main/enums.h"
 #include "main/state.h"
+#include "main/bufferobj.h"
 #include "main/context.h"
 #include "main/enable.h"
 #include "main/matrix.h"
+#include "main/texstate.h"
+#include "main/varray.h"
 #include "main/viewport.h"
 #include "swrast/swrast.h"
 #include "shader/arbprogram.h"
@@ -174,167 +177,6 @@ intel_check_blit_format(struct intel_region * region,
 }
 
 void
-intel_meta_set_passthrough_transform(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   intel->meta.saved_vp_x = ctx->Viewport.X;
-   intel->meta.saved_vp_y = ctx->Viewport.Y;
-   intel->meta.saved_vp_width = ctx->Viewport.Width;
-   intel->meta.saved_vp_height = ctx->Viewport.Height;
-   intel->meta.saved_matrix_mode = ctx->Transform.MatrixMode;
-
-   intel->internal_viewport_call = GL_TRUE;
-   _mesa_Viewport(0, 0, ctx->DrawBuffer->Width, ctx->DrawBuffer->Height);
-   intel->internal_viewport_call = GL_FALSE;
-
-   _mesa_MatrixMode(GL_PROJECTION);
-   _mesa_PushMatrix();
-   _mesa_LoadIdentity();
-   _mesa_Ortho(0, ctx->DrawBuffer->Width, 0, ctx->DrawBuffer->Height, 1, -1);
-
-   _mesa_MatrixMode(GL_MODELVIEW);
-   _mesa_PushMatrix();
-   _mesa_LoadIdentity();
-}
-
-void
-intel_meta_restore_transform(struct intel_context *intel)
-{
-   _mesa_MatrixMode(GL_PROJECTION);
-   _mesa_PopMatrix();
-   _mesa_MatrixMode(GL_MODELVIEW);
-   _mesa_PopMatrix();
-
-   _mesa_MatrixMode(intel->meta.saved_matrix_mode);
-
-   intel->internal_viewport_call = GL_TRUE;
-   _mesa_Viewport(intel->meta.saved_vp_x, intel->meta.saved_vp_y,
-		  intel->meta.saved_vp_width, intel->meta.saved_vp_height);
-   intel->internal_viewport_call = GL_FALSE;
-}
-
-/**
- * Set up a vertex program to pass through the position and first texcoord
- * for pixel path.
- */
-void
-intel_meta_set_passthrough_vertex_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-   static const char *vp =
-      "!!ARBvp1.0\n"
-      "TEMP vertexClip;\n"
-      "DP4 vertexClip.x, state.matrix.mvp.row[0], vertex.position;\n"
-      "DP4 vertexClip.y, state.matrix.mvp.row[1], vertex.position;\n"
-      "DP4 vertexClip.z, state.matrix.mvp.row[2], vertex.position;\n"
-      "DP4 vertexClip.w, state.matrix.mvp.row[3], vertex.position;\n"
-      "MOV result.position, vertexClip;\n"
-      "MOV result.texcoord[0], vertex.texcoord[0];\n"
-      "MOV result.color, vertex.color;\n"
-      "END\n";
-
-   assert(intel->meta.saved_vp == NULL);
-
-   _mesa_reference_vertprog(ctx, &intel->meta.saved_vp,
-			    ctx->VertexProgram.Current);
-   if (intel->meta.passthrough_vp == NULL) {
-      GLuint prog_name;
-      _mesa_GenPrograms(1, &prog_name);
-      _mesa_BindProgram(GL_VERTEX_PROGRAM_ARB, prog_name);
-      _mesa_ProgramStringARB(GL_VERTEX_PROGRAM_ARB,
-			     GL_PROGRAM_FORMAT_ASCII_ARB,
-			     strlen(vp), (const GLubyte *)vp);
-      _mesa_reference_vertprog(ctx, &intel->meta.passthrough_vp,
-			       ctx->VertexProgram.Current);
-      _mesa_DeletePrograms(1, &prog_name);
-   }
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
-			    intel->meta.passthrough_vp);
-   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
-			   &intel->meta.passthrough_vp->Base);
-
-   intel->meta.saved_vp_enable = ctx->VertexProgram.Enabled;
-   _mesa_Enable(GL_VERTEX_PROGRAM_ARB);
-}
-
-/**
- * Restores the previous vertex program after
- * intel_meta_set_passthrough_vertex_program()
- */
-void
-intel_meta_restore_vertex_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_vertprog(ctx, &ctx->VertexProgram.Current,
-			    intel->meta.saved_vp);
-   _mesa_reference_vertprog(ctx, &intel->meta.saved_vp, NULL);
-   ctx->Driver.BindProgram(ctx, GL_VERTEX_PROGRAM_ARB,
-			   &ctx->VertexProgram.Current->Base);
-
-   if (!intel->meta.saved_vp_enable)
-      _mesa_Disable(GL_VERTEX_PROGRAM_ARB);
-}
-
-/**
- * Binds the given program string to GL_FRAGMENT_PROGRAM_ARB, caching the
- * program object.
- */
-void
-intel_meta_set_fragment_program(struct intel_context *intel,
-				struct gl_fragment_program **prog,
-				const char *prog_string)
-{
-   GLcontext *ctx = &intel->ctx;
-   assert(intel->meta.saved_fp == NULL);
-
-   _mesa_reference_fragprog(ctx, &intel->meta.saved_fp,
-			    ctx->FragmentProgram.Current);
-   if (*prog == NULL) {
-      GLuint prog_name;
-      _mesa_GenPrograms(1, &prog_name);
-      _mesa_BindProgram(GL_FRAGMENT_PROGRAM_ARB, prog_name);
-      _mesa_ProgramStringARB(GL_FRAGMENT_PROGRAM_ARB,
-			     GL_PROGRAM_FORMAT_ASCII_ARB,
-			     strlen(prog_string), (const GLubyte *)prog_string);
-      _mesa_reference_fragprog(ctx, prog, ctx->FragmentProgram.Current);
-      /* Note that DeletePrograms unbinds the program on us */
-      _mesa_DeletePrograms(1, &prog_name);
-   }
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current, *prog);
-   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB, &((*prog)->Base));
-
-   intel->meta.saved_fp_enable = ctx->FragmentProgram.Enabled;
-   _mesa_Enable(GL_FRAGMENT_PROGRAM_ARB);
-}
-
-/**
- * Restores the previous fragment program after
- * intel_meta_set_fragment_program()
- */
-void
-intel_meta_restore_fragment_program(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   FLUSH_VERTICES(ctx, _NEW_PROGRAM);
-   _mesa_reference_fragprog(ctx, &ctx->FragmentProgram.Current,
-			    intel->meta.saved_fp);
-   _mesa_reference_fragprog(ctx, &intel->meta.saved_fp, NULL);
-   ctx->Driver.BindProgram(ctx, GL_FRAGMENT_PROGRAM_ARB,
-			   &ctx->FragmentProgram.Current->Base);
-
-   if (!intel->meta.saved_fp_enable)
-      _mesa_Disable(GL_FRAGMENT_PROGRAM_ARB);
-}
-
-void
 intelInitPixelFuncs(struct dd_function_table *functions)
 {
    functions->Accum = _swrast_Accum;
@@ -342,18 +184,7 @@ intelInitPixelFuncs(struct dd_function_table *functions)
       functions->Bitmap = intelBitmap;
       functions->CopyPixels = intelCopyPixels;
       functions->DrawPixels = intelDrawPixels;
-#ifdef I915
-      functions->ReadPixels = intelReadPixels;
-#endif
    }
-}
-
-void
-intel_free_pixel_state(struct intel_context *intel)
-{
-   GLcontext *ctx = &intel->ctx;
-
-   _mesa_reference_vertprog(ctx, &intel->meta.passthrough_vp, NULL);
-   _mesa_reference_fragprog(ctx, &intel->meta.bitmap_fp, NULL);
+   functions->ReadPixels = intelReadPixels;
 }
 
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.h b/src/mesa/drivers/dri/intel/intel_pixel.h
index cb41fa182cb..96a6dd17b25 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.h
+++ b/src/mesa/drivers/dri/intel/intel_pixel.h
@@ -31,16 +31,6 @@
 #include "main/mtypes.h"
 
 void intelInitPixelFuncs(struct dd_function_table *functions);
-void intel_meta_set_passthrough_transform(struct intel_context *intel);
-void intel_meta_restore_transform(struct intel_context *intel);
-void intel_meta_set_passthrough_vertex_program(struct intel_context *intel);
-void intel_meta_restore_vertex_program(struct intel_context *intel);
-void intel_meta_set_fragment_program(struct intel_context *intel,
-				     struct gl_fragment_program **prog,
-				     const char *prog_string);
-void intel_meta_restore_fragment_program(struct intel_context *intel);
-void intel_free_pixel_state(struct intel_context *intel);
-
 GLboolean intel_check_blit_fragment_ops(GLcontext * ctx,
 					GLboolean src_alpha_is_one);
 
@@ -76,6 +66,4 @@ void intelBitmap(GLcontext * ctx,
 		 const struct gl_pixelstore_attrib *unpack,
 		 const GLubyte * pixels);
 
-void intel_clear_tris(GLcontext *ctx, GLbitfield mask);
-
 #endif
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
index a2ccae1b7d6..540e7620a9d 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_bitmap.c
@@ -42,6 +42,7 @@
 #include "main/varray.h"
 #include "main/attrib.h"
 #include "main/enable.h"
+#include "main/viewport.h"
 #include "shader/arbprogram.h"
 #include "glapi/dispatch.h"
 #include "swrast/swrast.h"
@@ -194,7 +195,7 @@ do_blit_bitmap( GLcontext *ctx,
    struct gl_framebuffer *fb = ctx->DrawBuffer;
    GLfloat tmpColor[4];
    GLubyte ubcolor[4];
-   GLuint color8888, color565;
+   GLuint color;
    unsigned int num_cliprects;
    drm_clip_rect_t *cliprects;
    int x_off, y_off;
@@ -232,8 +233,11 @@ do_blit_bitmap( GLcontext *ctx,
    UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[2], tmpColor[2]);
    UNCLAMPED_FLOAT_TO_UBYTE(ubcolor[3], tmpColor[3]);
 
-   color8888 = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1], ubcolor[2], ubcolor[3]);
-   color565 = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+   if (dst->cpp == 2)
+      color = INTEL_PACKCOLOR565(ubcolor[0], ubcolor[1], ubcolor[2]);
+   else
+      color = INTEL_PACKCOLOR8888(ubcolor[0], ubcolor[1],
+				  ubcolor[2], ubcolor[3]);
 
    if (!intel_check_blit_fragment_ops(ctx, tmpColor[3] == 1.0F))
       return GL_FALSE;
@@ -307,21 +311,21 @@ do_blit_bitmap( GLcontext *ctx,
 				   fb->Name == 0 ? GL_TRUE : GL_FALSE) == 0)
 		  continue;
 
-	       /* 
-		*/
-	       intelEmitImmediateColorExpandBlit( intel,
-						  dst->cpp,
-						  (GLubyte *)stipple, 
-						  sz,
-						  (dst->cpp == 2) ? color565 : color8888,
-						  dst->pitch,
-						  dst->buffer,
-						  0,
-						  dst->tiling,
-						  box_x + px,
-						  box_y + py,
-						  w, h,
-						  logic_op);
+	       if (!intelEmitImmediateColorExpandBlit(intel,
+						      dst->cpp,
+						      (GLubyte *)stipple,
+						      sz,
+						      color,
+						      dst->pitch,
+						      dst->buffer,
+						      0,
+						      dst->tiling,
+						      box_x + px,
+						      box_y + py,
+						      w, h,
+						      logic_op)) {
+		  return GL_FALSE;
+	       }
 	    } 
 	 } 
       }
@@ -360,7 +364,6 @@ intel_texture_bitmap(GLcontext * ctx,
       "END\n";
    GLuint texname;
    GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
    GLint old_active_texture;
    GLubyte *unpacked_bitmap;
    GLubyte *a8_bitmap;
@@ -409,6 +412,12 @@ intel_texture_bitmap(GLcontext * ctx,
       return GL_FALSE;
    }
 
+   if (ctx->Fog.Enabled) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "glBitmap() fallback: fog\n");
+      return GL_FALSE;
+   }
+
    /* Check that we can load in a texture this big. */
    if (width > (1 << (ctx->Const.MaxTextureLevels - 1)) ||
        height > (1 << (ctx->Const.MaxTextureLevels - 1))) {
@@ -467,15 +476,18 @@ intel_texture_bitmap(GLcontext * ctx,
 		    GL_ALPHA, GL_UNSIGNED_BYTE, a8_bitmap);
    _mesa_free(a8_bitmap);
 
-   intel_meta_set_fragment_program(intel, &intel->meta.bitmap_fp, fp);
+   meta_set_fragment_program(&intel->meta, &intel->meta.bitmap_fp, fp);
    _mesa_ProgramLocalParameter4fvARB(GL_FRAGMENT_PROGRAM_ARB, 0,
 				     ctx->Current.RasterColor);
-   intel_meta_set_passthrough_vertex_program(intel);
-   intel_meta_set_passthrough_transform(intel);
+   meta_set_passthrough_vertex_program(&intel->meta);
+   meta_set_passthrough_transform(&intel->meta);
 
    /* convert rasterpos Z from [0,1] to NDC coord in [-1,1] */
    dst_z = -1.0 + 2.0 * ctx->Current.RasterPos[2];
 
+   /* RasterPos[2] already takes into account the DepthRange mapping. */
+   _mesa_DepthRange(0.0, 1.0);
+
    vertices[0][0] = dst_x;
    vertices[0][1] = dst_y;
    vertices[0][2] = dst_z;
@@ -493,25 +505,15 @@ intel_texture_bitmap(GLcontext * ctx,
    vertices[3][2] = dst_z;
    vertices[3][3] = 1.0;
 
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
    _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   meta_set_default_texrect(&intel->meta);
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-   intel_meta_restore_transform(intel);
-   intel_meta_restore_fragment_program(intel);
-   intel_meta_restore_vertex_program(intel);
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
+   meta_restore_fragment_program(&intel->meta);
+   meta_restore_vertex_program(&intel->meta);
 
    _mesa_PopClientAttrib();
    _mesa_Disable(GL_TEXTURE_2D); /* asserted that it was disabled at entry */
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_copy.c b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
index d50dd68092d..5d52335dee3 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_copy.c
@@ -272,6 +272,12 @@ do_blit_copypixels(GLcontext * ctx,
    drm_clip_rect_t *cliprects;
    int x_off, y_off;
 
+   if (type == GL_DEPTH || type == GL_STENCIL) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "glCopyPixels() fallback: GL_DEPTH || GL_STENCIL\n");
+      return GL_FALSE;
+   }
+
    /* Update draw buffer bounds */
    _mesa_update_state(ctx);
 
@@ -362,14 +368,16 @@ do_blit_copypixels(GLcontext * ctx,
 				   &clip_x, &clip_y, &clip_w, &clip_h))
             continue;
 
-         intelEmitCopyBlit(intel, dst->cpp,
-			   src->pitch, src->buffer, 0, src->tiling,
-			   dst->pitch, dst->buffer, 0, dst->tiling,
-			   clip_x + delta_x, clip_y + delta_y, /* srcx, srcy */
-			   clip_x, clip_y, /* dstx, dsty */
-			   clip_w, clip_h,
-			   ctx->Color.ColorLogicOpEnabled ?
-			   ctx->Color.LogicOp : GL_COPY);
+	 if (!intel_region_copy(intel,
+				dst, 0, clip_x, clip_y,
+				src, 0, clip_x + delta_x, clip_y + delta_y,
+				clip_w, clip_h,
+				ctx->Color.ColorLogicOpEnabled ?
+				ctx->Color.LogicOp : GL_COPY)) {
+	    DBG("%s: blit failure\n", __FUNCTION__);
+	    UNLOCK_HARDWARE(intel);
+	    return GL_FALSE;
+	 }
       }
    }
 out:
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_draw.c b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
index d80069dd586..a6b6824164a 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
@@ -45,6 +45,7 @@
 #include "main/depth.h"
 #include "main/hash.h"
 #include "main/blend.h"
+#include "main/viewport.h"
 #include "glapi/dispatch.h"
 #include "swrast/swrast.h"
 
@@ -70,7 +71,6 @@ intel_texture_drawpixels(GLcontext * ctx,
    struct intel_context *intel = intel_context(ctx);
    GLuint texname;
    GLfloat vertices[4][4];
-   GLfloat texcoords[4][2];
    GLfloat z;
    GLint old_active_texture;
    GLenum internalFormat;
@@ -129,7 +129,7 @@ intel_texture_drawpixels(GLcontext * ctx,
    }
 
    _mesa_PushAttrib(GL_ENABLE_BIT | GL_TEXTURE_BIT |
-		    GL_CURRENT_BIT);
+		    GL_CURRENT_BIT | GL_VIEWPORT_BIT);
    _mesa_PushClientAttrib(GL_CLIENT_VERTEX_ARRAY_BIT);
 
    /* XXX: pixel store stuff */
@@ -150,11 +150,14 @@ intel_texture_drawpixels(GLcontext * ctx,
    _mesa_TexImage2D(GL_TEXTURE_2D, 0, internalFormat, width, height, 0, format,
 		    type, pixels);
 
-   intel_meta_set_passthrough_transform(intel);
+   meta_set_passthrough_transform(&intel->meta);
 
    /* convert rasterpos Z from [0,1] to NDC coord in [-1,1] */
    z = -1.0 + 2.0 * ctx->Current.RasterPos[2];
 
+   /* RasterPos[2] already takes into account the DepthRange mapping. */
+   _mesa_DepthRange(0.0, 1.0);
+
    /* Create the vertex buffer based on the current raster pos.  The x and y
     * we're handed are ctx->Current.RasterPos[0,1] rounded to integers.
     * We also apply the depth.  However, the W component is already multiplied
@@ -177,23 +180,14 @@ intel_texture_drawpixels(GLcontext * ctx,
    vertices[3][2] = z;
    vertices[3][3] = 1.0;
 
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
-
    _mesa_VertexPointer(4, GL_FLOAT, 4 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   meta_set_default_texrect(&intel->meta);
+
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-   intel_meta_restore_transform(intel);
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
 
    _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
    _mesa_PopClientAttrib();
@@ -216,14 +210,14 @@ intel_stencil_drawpixels(GLcontext * ctx,
    struct intel_context *intel = intel_context(ctx);
    GLuint texname, rb_name, fb_name, old_fb_name;
    GLfloat vertices[4][2];
-   GLfloat texcoords[4][2];
    struct intel_renderbuffer *irb;
    struct intel_renderbuffer *depth_irb;
    struct gl_renderbuffer *rb;
    struct gl_pixelstore_attrib old_unpack;
    GLstencil *stencil_pixels;
-   int row;
+   int row, y1, y2;
    GLint old_active_texture;
+   GLboolean rendering_to_fbo = ctx->DrawBuffer->Name != 0;
 
    if (format != GL_STENCIL_INDEX)
       return GL_FALSE;
@@ -358,34 +352,35 @@ intel_stencil_drawpixels(GLcontext * ctx,
    ctx->Unpack = old_unpack;
    _mesa_free(stencil_pixels);
 
-   intel_meta_set_passthrough_transform(intel);
+   meta_set_passthrough_transform(&intel->meta);
 
+   /* Since we're rendering to the framebuffer as if it was an FBO,
+    * if it's the window system we have to flip the coordinates.
+    */
+   if (rendering_to_fbo) {
+      y1 = y;
+      y2 = y + height * ctx->Pixel.ZoomY;
+   } else {
+      y1 = irb->Base.Height - (y + height * ctx->Pixel.ZoomY);
+      y2 = irb->Base.Height - y;
+   }
    vertices[0][0] = x;
-   vertices[0][1] = y;
+   vertices[0][1] = y1;
    vertices[1][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[1][1] = y;
+   vertices[1][1] = y1;
    vertices[2][0] = x + width * ctx->Pixel.ZoomX;
-   vertices[2][1] = y + height * ctx->Pixel.ZoomY;
+   vertices[2][1] = y2;
    vertices[3][0] = x;
-   vertices[3][1] = y + height * ctx->Pixel.ZoomY;
-
-   texcoords[0][0] = 0.0;
-   texcoords[0][1] = 0.0;
-   texcoords[1][0] = 1.0;
-   texcoords[1][1] = 0.0;
-   texcoords[2][0] = 1.0;
-   texcoords[2][1] = 1.0;
-   texcoords[3][0] = 0.0;
-   texcoords[3][1] = 1.0;
+   vertices[3][1] = y2;
 
    _mesa_VertexPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &vertices);
-   _mesa_ClientActiveTextureARB(GL_TEXTURE0);
-   _mesa_TexCoordPointer(2, GL_FLOAT, 2 * sizeof(GLfloat), &texcoords);
    _mesa_Enable(GL_VERTEX_ARRAY);
-   _mesa_Enable(GL_TEXTURE_COORD_ARRAY);
+   meta_set_default_texrect(&intel->meta);
+
    _mesa_DrawArrays(GL_TRIANGLE_FAN, 0, 4);
 
-   intel_meta_restore_transform(intel);
+   meta_restore_texcoords(&intel->meta);
+   meta_restore_transform(&intel->meta);
 
    _mesa_ActiveTextureARB(GL_TEXTURE0_ARB + old_active_texture);
    _mesa_BindFramebufferEXT(GL_FRAMEBUFFER_EXT, old_fb_name);
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_read.c b/src/mesa/drivers/dri/intel/intel_pixel_read.c
new file mode 100644
index 00000000000..8713463ace2
--- /dev/null
+++ b/src/mesa/drivers/dri/intel/intel_pixel_read.c
@@ -0,0 +1,324 @@
+/**************************************************************************
+ *
+ * Copyright 2003 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "main/glheader.h"
+#include "main/enums.h"
+#include "main/mtypes.h"
+#include "main/macros.h"
+#include "main/image.h"
+#include "main/bufferobj.h"
+#include "main/state.h"
+#include "swrast/swrast.h"
+
+#include "intel_screen.h"
+#include "intel_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_blit.h"
+#include "intel_buffers.h"
+#include "intel_regions.h"
+#include "intel_pixel.h"
+#include "intel_buffer_objects.h"
+
+/* For many applications, the new ability to pull the source buffers
+ * back out of the GTT and then do the packing/conversion operations
+ * in software will be as much of an improvement as trying to get the
+ * blitter and/or texture engine to do the work.
+ *
+ * This step is gated on private backbuffers.
+ *
+ * Obviously the frontbuffer can't be pulled back, so that is either
+ * an argument for blit/texture readpixels, or for blitting to a
+ * temporary and then pulling that back.
+ *
+ * When the destination is a pbo, however, it's not clear if it is
+ * ever going to be pulled to main memory (though the access param
+ * will be a good hint).  So it sounds like we do want to be able to
+ * choose between blit/texture implementation on the gpu and pullback
+ * and cpu-based copying.
+ *
+ * Unless you can magically turn client memory into a PBO for the
+ * duration of this call, there will be a cpu-based copying step in
+ * any case.
+ */
+
+
+static GLboolean
+do_texture_readpixels(GLcontext * ctx,
+                      GLint x, GLint y, GLsizei width, GLsizei height,
+                      GLenum format, GLenum type,
+                      const struct gl_pixelstore_attrib *pack,
+                      struct intel_region *dest_region)
+{
+#if 0
+   struct intel_context *intel = intel_context(ctx);
+   intelScreenPrivate *screen = intel->intelScreen;
+   GLint pitch = pack->RowLength ? pack->RowLength : width;
+   __DRIdrawablePrivate *dPriv = intel->driDrawable;
+   int textureFormat;
+   GLenum glTextureFormat;
+   int destFormat, depthFormat, destPitch;
+   drm_clip_rect_t tmp;
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+
+   if (ctx->_ImageTransferState ||
+       pack->SwapBytes || pack->LsbFirst || !pack->Invert) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         fprintf(stderr, "%s: check_color failed\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   intel->vtbl.meta_texrect_source(intel, intel_readbuf_region(intel));
+
+   if (!intel->vtbl.meta_render_dest(intel, dest_region, type, format)) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         fprintf(stderr, "%s: couldn't set dest %s/%s\n",
+                 __FUNCTION__,
+                 _mesa_lookup_enum_by_nr(type),
+                 _mesa_lookup_enum_by_nr(format));
+      return GL_FALSE;
+   }
+
+   LOCK_HARDWARE(intel);
+
+   if (intel->driDrawable->numClipRects) {
+      intel->vtbl.install_meta_state(intel);
+      intel->vtbl.meta_no_depth_write(intel);
+      intel->vtbl.meta_no_stencil_write(intel);
+
+      if (!driClipRectToFramebuffer(ctx->ReadBuffer, &x, &y, &width, &height)) {
+         UNLOCK_HARDWARE(intel);
+         SET_STATE(i830, state);
+         if (INTEL_DEBUG & DEBUG_PIXEL)
+            fprintf(stderr, "%s: cliprect failed\n", __FUNCTION__);
+         return GL_TRUE;
+      }
+
+      y = dPriv->h - y - height;
+      x += dPriv->x;
+      y += dPriv->y;
+
+
+      /* Set the frontbuffer up as a large rectangular texture.
+       */
+      intel->vtbl.meta_tex_rect_source(intel, src_region, textureFormat);
+
+
+      intel->vtbl.meta_texture_blend_replace(i830, glTextureFormat);
+
+
+      /* Set the 3d engine to draw into the destination region:
+       */
+
+      intel->vtbl.meta_draw_region(intel, dest_region);
+      intel->vtbl.meta_draw_format(intel, destFormat, depthFormat);     /* ?? */
+
+
+      /* Draw a single quad, no cliprects:
+       */
+      intel->vtbl.meta_disable_cliprects(intel);
+
+      intel->vtbl.draw_quad(intel,
+                            0, width, 0, height,
+                            0x00ff00ff, x, x + width, y, y + height);
+
+      intel->vtbl.leave_meta_state(intel);
+   }
+   UNLOCK_HARDWARE(intel);
+
+   intel_region_wait_fence(ctx, dest_region);   /* required by GL */
+   return GL_TRUE;
+#endif
+
+   return GL_FALSE;
+}
+
+
+
+
+static GLboolean
+do_blit_readpixels(GLcontext * ctx,
+                   GLint x, GLint y, GLsizei width, GLsizei height,
+                   GLenum format, GLenum type,
+                   const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
+{
+   struct intel_context *intel = intel_context(ctx);
+   struct intel_region *src = intel_readbuf_region(intel);
+   struct intel_buffer_object *dst = intel_buffer_object(pack->BufferObj);
+   GLuint dst_offset;
+   GLuint rowLength;
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      _mesa_printf("%s\n", __FUNCTION__);
+
+   if (!src)
+      return GL_FALSE;
+
+   if (pack->BufferObj->Name) {
+      /* XXX This validation should be done by core mesa:
+       */
+      if (!_mesa_validate_pbo_access(2, pack, width, height, 1,
+                                     format, type, pixels)) {
+         _mesa_error(ctx, GL_INVALID_OPERATION, "glDrawPixels");
+         return GL_TRUE;
+      }
+   }
+   else {
+      /* PBO only for now:
+       */
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s - not PBO\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+
+   if (ctx->_ImageTransferState ||
+       !intel_check_blit_format(src, format, type)) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s - bad format for blit\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   if (pack->Alignment != 1 || pack->SwapBytes || pack->LsbFirst) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s: bad packing params\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+
+   if (pack->RowLength > 0)
+      rowLength = pack->RowLength;
+   else
+      rowLength = width;
+
+   if (pack->Invert) {
+      if (INTEL_DEBUG & DEBUG_PIXEL)
+         _mesa_printf("%s: MESA_PACK_INVERT not done yet\n", __FUNCTION__);
+      return GL_FALSE;
+   }
+   else {
+      rowLength = -rowLength;
+   }
+
+   /* XXX 64-bit cast? */
+   dst_offset = (GLuint) _mesa_image_address(2, pack, pixels, width, height,
+                                             format, type, 0, 0, 0);
+
+
+   /* Although the blits go on the command buffer, need to do this and
+    * fire with lock held to guarentee cliprects are correct.
+    */
+   intelFlush(&intel->ctx);
+   LOCK_HARDWARE(intel);
+
+   if (intel->driDrawable->numClipRects) {
+      GLboolean all = (width * height * src->cpp == dst->Base.Size &&
+                       x == 0 && dst_offset == 0);
+
+      dri_bo *dst_buffer = intel_bufferobj_buffer(intel, dst,
+						  all ? INTEL_WRITE_FULL :
+						  INTEL_WRITE_PART);
+      __DRIdrawablePrivate *dPriv = intel->driDrawable;
+      int nbox = dPriv->numClipRects;
+      drm_clip_rect_t *box = dPriv->pClipRects;
+      drm_clip_rect_t rect;
+      drm_clip_rect_t src_rect;
+      int i;
+
+      src_rect.x1 = dPriv->x + x;
+      src_rect.y1 = dPriv->y + dPriv->h - (y + height);
+      src_rect.x2 = src_rect.x1 + width;
+      src_rect.y2 = src_rect.y1 + height;
+
+
+
+      for (i = 0; i < nbox; i++) {
+         if (!intel_intersect_cliprects(&rect, &src_rect, &box[i]))
+            continue;
+
+         if (!intelEmitCopyBlit(intel,
+				src->cpp,
+				src->pitch, src->buffer, 0, src->tiling,
+				rowLength, dst_buffer, dst_offset, GL_FALSE,
+				rect.x1,
+				rect.y1,
+				rect.x1 - src_rect.x1,
+				rect.y2 - src_rect.y2,
+				rect.x2 - rect.x1, rect.y2 - rect.y1,
+				GL_COPY)) {
+	    UNLOCK_HARDWARE(intel);
+	    return GL_FALSE;
+	 }
+      }
+   }
+   UNLOCK_HARDWARE(intel);
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      _mesa_printf("%s - DONE\n", __FUNCTION__);
+
+   return GL_TRUE;
+}
+
+void
+intelReadPixels(GLcontext * ctx,
+                GLint x, GLint y, GLsizei width, GLsizei height,
+                GLenum format, GLenum type,
+                const struct gl_pixelstore_attrib *pack, GLvoid * pixels)
+{
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      fprintf(stderr, "%s\n", __FUNCTION__);
+
+   intelFlush(ctx);
+
+#ifdef I915
+   if (do_blit_readpixels
+       (ctx, x, y, width, height, format, type, pack, pixels))
+      return;
+
+   if (do_texture_readpixels
+       (ctx, x, y, width, height, format, type, pack, pixels))
+      return;
+#else
+   (void)do_blit_readpixels;
+   (void)do_texture_readpixels;
+#endif
+
+   if (INTEL_DEBUG & DEBUG_PIXEL)
+      _mesa_printf("%s: fallback to swrast\n", __FUNCTION__);
+
+   /* Update Mesa state before calling down into _swrast_ReadPixels, as
+    * the spans code requires the computed buffer states to be up to date,
+    * but _swrast_ReadPixels only updates Mesa state after setting up
+    * the spans code.
+    */
+
+   if (ctx->NewState)
+      _mesa_update_state(ctx);
+
+   _swrast_ReadPixels(ctx, x, y, width, height, format, type, pack, pixels);
+}
diff --git a/src/mesa/drivers/dri/intel/intel_reg.h b/src/mesa/drivers/dri/intel/intel_reg.h
index 57ac8f0cc14..d19f1bae34c 100644
--- a/src/mesa/drivers/dri/intel/intel_reg.h
+++ b/src/mesa/drivers/dri/intel/intel_reg.h
@@ -189,6 +189,19 @@
 
 #define S7_DEPTH_OFFSET_CONST_MASK     ~0
 
+/* p143 */
+#define _3DSTATE_BUF_INFO_CMD	(CMD_3D | (0x1d<<24) | (0x8e<<16) | 1)
+/* Dword 1 */
+#define BUF_3D_ID_COLOR_BACK	(0x3<<24)
+#define BUF_3D_ID_DEPTH 	(0x7<<24)
+#define BUF_3D_USE_FENCE	(1<<23)
+#define BUF_3D_TILED_SURFACE	(1<<22)
+#define BUF_3D_TILE_WALK_X	0
+#define BUF_3D_TILE_WALK_Y	(1<<21)
+#define BUF_3D_PITCH(x)         (((x)/4)<<2)
+/* Dword 2 */
+#define BUF_3D_ADDR(x)		((x) & ~0x3)
+
 /* Primitive dispatch on 830-945 */
 #define _3DPRIMITIVE			(CMD_3D | (0x1f << 24))
 #define PRIM_INDIRECT            (1<<23)
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index 0aa5b8c02c9..7525cd9c4d7 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -52,17 +52,77 @@
 
 #define FILE_DEBUG_FLAG DEBUG_REGION
 
+/* This should be set to the maximum backtrace size desired.
+ * Set it to 0 to disable backtrace debugging.
+ */
+#define DEBUG_BACKTRACE_SIZE 0
+
+#if DEBUG_BACKTRACE_SIZE == 0
+/* Use the standard debug output */
+#define _DBG(...) DBG(__VA_ARGS__)
+#else
+/* Use backtracing debug output */
+#define _DBG(...) {debug_backtrace(); DBG(__VA_ARGS__);}
+
+/* Backtracing debug support */
+#include <execinfo.h>
+
+static void
+debug_backtrace(void)
+{
+   void *trace[DEBUG_BACKTRACE_SIZE];
+   char **strings = NULL;
+   int traceSize;
+   register int i;
+
+   traceSize = backtrace(trace, DEBUG_BACKTRACE_SIZE);
+   strings = backtrace_symbols(trace, traceSize);
+   if (strings == NULL) {
+      DBG("no backtrace:");
+      return;
+   }
+
+   /* Spit out all the strings with a colon separator.  Ignore
+    * the first, since we don't really care about the call
+    * to debug_backtrace() itself.  Skip until the final "/" in
+    * the trace to avoid really long lines.
+    */
+   for (i = 1; i < traceSize; i++) {
+      char *p = strings[i], *slash = strings[i];
+      while (*p) {
+         if (*p++ == '/') {
+            slash = p;
+         }
+      }
+
+      DBG("%s:", slash);
+   }
+
+   /* Free up the memory, and we're done */
+   free(strings);
+}
+
+#endif
+
+
+
 /* XXX: Thread safety?
  */
 GLubyte *
 intel_region_map(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   intelFlush(&intel->ctx);
+
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!region->map_refcount++) {
       if (region->pbo)
          intel_region_cow(intel, region);
 
-      dri_bo_map(region->buffer, GL_TRUE);
+      if (region->tiling != I915_TILING_NONE &&
+	  intel->intelScreen->kernel_exec_fencing)
+	 drm_intel_gem_bo_map_gtt(region->buffer);
+      else
+	 dri_bo_map(region->buffer, GL_TRUE);
       region->map = region->buffer->virtual;
    }
 
@@ -72,9 +132,13 @@ intel_region_map(struct intel_context *intel, struct intel_region *region)
 void
 intel_region_unmap(struct intel_context *intel, struct intel_region *region)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s %p\n", __FUNCTION__, region);
    if (!--region->map_refcount) {
-      dri_bo_unmap(region->buffer);
+      if (region->tiling != I915_TILING_NONE &&
+	  intel->intelScreen->kernel_exec_fencing)
+	 drm_intel_gem_bo_unmap_gtt(region->buffer);
+      else
+	 dri_bo_unmap(region->buffer);
       region->map = NULL;
    }
 }
@@ -87,10 +151,10 @@ intel_region_alloc_internal(struct intel_context *intel,
 {
    struct intel_region *region;
 
-   DBG("%s\n", __FUNCTION__);
-
-   if (buffer == NULL)
+   if (buffer == NULL) {
+      _DBG("%s <-- NULL\n", __FUNCTION__);
       return NULL;
+   }
 
    region = calloc(sizeof(*region), 1);
    region->cpp = cpp;
@@ -104,15 +168,18 @@ intel_region_alloc_internal(struct intel_context *intel,
    region->tiling = I915_TILING_NONE;
    region->bit_6_swizzle = I915_BIT_6_SWIZZLE_NONE;
 
+   _DBG("%s <-- %p\n", __FUNCTION__, region);
    return region;
 }
 
 struct intel_region *
 intel_region_alloc(struct intel_context *intel,
+		   uint32_t tiling,
                    GLuint cpp, GLuint width, GLuint height, GLuint pitch,
 		   GLboolean expect_accelerated_upload)
 {
    dri_bo *buffer;
+   struct intel_region *region;
 
    if (expect_accelerated_upload) {
       buffer = drm_intel_bo_alloc_for_render(intel->bufmgr, "region",
@@ -122,7 +189,16 @@ intel_region_alloc(struct intel_context *intel,
 				  pitch * cpp * height, 64);
    }
 
-   return intel_region_alloc_internal(intel, cpp, width, height, pitch, buffer);
+   region = intel_region_alloc_internal(intel, cpp, width, height,
+					pitch, buffer);
+
+   if (tiling != I915_TILING_NONE) {
+      assert(((pitch * cpp) & 127) == 0);
+      drm_intel_bo_set_tiling(buffer, &tiling, pitch * cpp);
+      drm_intel_bo_get_tiling(buffer, &region->tiling, &region->bit_6_swizzle);
+   }
+
+   return region;
 }
 
 struct intel_region *
@@ -158,7 +234,7 @@ void
 intel_region_reference(struct intel_region **dst, struct intel_region *src)
 {
    if (src)
-      DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
+      _DBG("%s %p %d\n", __FUNCTION__, src, src->refcount);
 
    assert(*dst == NULL);
    if (src) {
@@ -172,10 +248,12 @@ intel_region_release(struct intel_region **region_handle)
 {
    struct intel_region *region = *region_handle;
 
-   if (region == NULL)
+   if (region == NULL) {
+      _DBG("%s NULL\n", __FUNCTION__);
       return;
+   }
 
-   DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
+   _DBG("%s %p %d\n", __FUNCTION__, region, region->refcount - 1);
 
    ASSERT(region->refcount > 0);
    region->refcount--;
@@ -249,9 +327,7 @@ intel_region_data(struct intel_context *intel,
                   const void *src, GLuint src_pitch,
                   GLuint srcx, GLuint srcy, GLuint width, GLuint height)
 {
-   GLboolean locked = GL_FALSE;
-
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
       return;
@@ -264,39 +340,33 @@ intel_region_data(struct intel_context *intel,
          intel_region_cow(intel, dst);
    }
 
-   if (!intel->locked) {
-      LOCK_HARDWARE(intel);
-      locked = GL_TRUE;
-   }
-
+   LOCK_HARDWARE(intel);
    _mesa_copy_rect(intel_region_map(intel, dst) + dst_offset,
                    dst->cpp,
                    dst->pitch,
                    dstx, dsty, width, height, src, src_pitch, srcx, srcy);
 
    intel_region_unmap(intel, dst);
-
-   if (locked)
-      UNLOCK_HARDWARE(intel);
-
+   UNLOCK_HARDWARE(intel);
 }
 
 /* Copy rectangular sub-regions. Need better logic about when to
  * push buffers into AGP - will currently do so whenever possible.
  */
-void
+GLboolean
 intel_region_copy(struct intel_context *intel,
                   struct intel_region *dst,
                   GLuint dst_offset,
                   GLuint dstx, GLuint dsty,
                   struct intel_region *src,
                   GLuint src_offset,
-                  GLuint srcx, GLuint srcy, GLuint width, GLuint height)
+                  GLuint srcx, GLuint srcy, GLuint width, GLuint height,
+		  GLenum logicop)
 {
-   DBG("%s\n", __FUNCTION__);
+   _DBG("%s\n", __FUNCTION__);
 
    if (intel == NULL)
-      return;
+      return GL_FALSE;
 
    if (dst->pbo) {
       if (dstx == 0 &&
@@ -308,41 +378,12 @@ intel_region_copy(struct intel_context *intel,
 
    assert(src->cpp == dst->cpp);
 
-   intelEmitCopyBlit(intel,
-                     dst->cpp,
-                     src->pitch, src->buffer, src_offset, src->tiling,
-                     dst->pitch, dst->buffer, dst_offset, dst->tiling,
-                     srcx, srcy, dstx, dsty, width, height,
-		     GL_COPY);
-}
-
-/* Fill a rectangular sub-region.  Need better logic about when to
- * push buffers into AGP - will currently do so whenever possible.
- */
-void
-intel_region_fill(struct intel_context *intel,
-                  struct intel_region *dst,
-                  GLuint dst_offset,
-                  GLuint dstx, GLuint dsty,
-                  GLuint width, GLuint height, GLuint color)
-{
-   DBG("%s\n", __FUNCTION__);
-
-   if (intel == NULL)
-      return;   
-
-   if (dst->pbo) {
-      if (dstx == 0 &&
-          dsty == 0 && width == dst->pitch && height == dst->height)
-         intel_region_release_pbo(intel, dst);
-      else
-         intel_region_cow(intel, dst);
-   }
-
-   intelEmitFillBlit(intel,
-                     dst->cpp,
-                     dst->pitch, dst->buffer, dst_offset, dst->tiling,
-                     dstx, dsty, width, height, color);
+   return intelEmitCopyBlit(intel,
+			    dst->cpp,
+			    src->pitch, src->buffer, src_offset, src->tiling,
+			    dst->pitch, dst->buffer, dst_offset, dst->tiling,
+			    srcx, srcy, dstx, dsty, width, height,
+			    logicop);
 }
 
 /* Attach to a pbo, discarding our data.  Effectively zero-copy upload
@@ -353,9 +394,13 @@ intel_region_attach_pbo(struct intel_context *intel,
                         struct intel_region *region,
                         struct intel_buffer_object *pbo)
 {
+   dri_bo *buffer;
+
    if (region->pbo == pbo)
       return;
 
+   _DBG("%s %p %p\n", __FUNCTION__, region, pbo);
+
    /* If there is already a pbo attached, break the cow tie now.
     * Don't call intel_region_release_pbo() as that would
     * unnecessarily allocate a new buffer we would have to immediately
@@ -371,10 +416,13 @@ intel_region_attach_pbo(struct intel_context *intel,
       region->buffer = NULL;
    }
 
+   /* make sure pbo has a buffer of its own */
+   buffer = intel_bufferobj_buffer(intel, pbo, INTEL_WRITE_FULL);
+
    region->pbo = pbo;
    region->pbo->region = region;
-   dri_bo_reference(pbo->buffer);
-   region->buffer = pbo->buffer;
+   dri_bo_reference(buffer);
+   region->buffer = buffer;
 }
 
 
@@ -385,6 +433,7 @@ void
 intel_region_release_pbo(struct intel_context *intel,
                          struct intel_region *region)
 {
+   _DBG("%s %p\n", __FUNCTION__, region);
    assert(region->buffer == region->pbo->buffer);
    region->pbo->region = NULL;
    region->pbo = NULL;
@@ -403,34 +452,25 @@ void
 intel_region_cow(struct intel_context *intel, struct intel_region *region)
 {
    struct intel_buffer_object *pbo = region->pbo;
-   GLboolean was_locked = intel->locked;
-
-   if (intel == NULL)
-      return;
 
    intel_region_release_pbo(intel, region);
 
    assert(region->cpp * region->pitch * region->height == pbo->Base.Size);
 
-   DBG("%s (%d bytes)\n", __FUNCTION__, pbo->Base.Size);
+   _DBG("%s %p (%d bytes)\n", __FUNCTION__, region, pbo->Base.Size);
 
    /* Now blit from the texture buffer to the new buffer: 
     */
 
-   was_locked = intel->locked;
-   if (!was_locked)
-      LOCK_HARDWARE(intel);
-
-   intelEmitCopyBlit(intel,
-		     region->cpp,
-		     region->pitch, region->buffer, 0, region->tiling,
-		     region->pitch, pbo->buffer, 0, region->tiling,
-		     0, 0, 0, 0,
-		     region->pitch, region->height,
-		     GL_COPY);
-
-   if (!was_locked)
-      UNLOCK_HARDWARE(intel);
+   LOCK_HARDWARE(intel);
+   assert(intelEmitCopyBlit(intel,
+			    region->cpp,
+			    region->pitch, pbo->buffer, 0, region->tiling,
+			    region->pitch, region->buffer, 0, region->tiling,
+			    0, 0, 0, 0,
+			    region->pitch, region->height,
+			    GL_COPY));
+   UNLOCK_HARDWARE(intel);
 }
 
 dri_bo *
@@ -459,6 +499,10 @@ intel_recreate_static(struct intel_context *intel,
    if (region == NULL) {
       region = calloc(sizeof(*region), 1);
       region->refcount = 1;
+      _DBG("%s creating new region %p\n", __FUNCTION__, region);
+   }
+   else {
+      _DBG("%s %p\n", __FUNCTION__, region);
    }
 
    if (intel->ctx.Visual.rgbBits == 24)
diff --git a/src/mesa/drivers/dri/intel/intel_regions.h b/src/mesa/drivers/dri/intel/intel_regions.h
index 45e2bf4e77a..0d379bdc6e2 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.h
+++ b/src/mesa/drivers/dri/intel/intel_regions.h
@@ -73,7 +73,8 @@ struct intel_region
  * copied by calling intel_reference_region().
  */
 struct intel_region *intel_region_alloc(struct intel_context *intel,
-                                        GLuint cpp, GLuint width,
+                                        uint32_t tiling,
+					GLuint cpp, GLuint width,
                                         GLuint height, GLuint pitch,
 					GLboolean expect_accelerated_upload);
 
@@ -109,21 +110,15 @@ void intel_region_data(struct intel_context *intel,
 
 /* Copy rectangular sub-regions
  */
-void intel_region_copy(struct intel_context *intel,
-                       struct intel_region *dest,
-                       GLuint dest_offset,
-                       GLuint destx, GLuint desty,
-                       struct intel_region *src,
-                       GLuint src_offset,
-                       GLuint srcx, GLuint srcy, GLuint width, GLuint height);
-
-/* Fill a rectangular sub-region
- */
-void intel_region_fill(struct intel_context *intel,
-                       struct intel_region *dest,
-                       GLuint dest_offset,
-                       GLuint destx, GLuint desty,
-                       GLuint width, GLuint height, GLuint color);
+GLboolean
+intel_region_copy(struct intel_context *intel,
+		  struct intel_region *dest,
+		  GLuint dest_offset,
+		  GLuint destx, GLuint desty,
+		  struct intel_region *src,
+		  GLuint src_offset,
+		  GLuint srcx, GLuint srcy, GLuint width, GLuint height,
+		  GLenum logicop);
 
 /* Helpers for zerocopy uploads, particularly texture image uploads:
  */
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 0f278b3acdd..6bbc995c1e6 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -49,6 +49,10 @@
 #include "i915_drm.h"
 #include "i830_dri.h"
 
+#define DRI_CONF_TEXTURE_TILING(def) \
+	DRI_CONF_OPT_BEGIN(texture_tiling, bool, def)		\
+		DRI_CONF_DESC(en, "Enable texture tiling")	\
+	DRI_CONF_OPT_END					\
 
 PUBLIC const char __driConfigOptions[] =
    DRI_CONF_BEGIN
@@ -64,6 +68,13 @@ PUBLIC const char __driConfigOptions[] =
 	    DRI_CONF_ENUM(1, "Enable reuse of all sizes of buffer objects")
 	 DRI_CONF_DESC_END
       DRI_CONF_OPT_END
+
+      DRI_CONF_TEXTURE_TILING(false)
+
+      DRI_CONF_OPT_BEGIN(early_z, bool, false)
+	 DRI_CONF_DESC(en, "Enable early Z in classic mode (unstable, 945-only).")
+      DRI_CONF_OPT_END
+
    DRI_CONF_SECTION_END
    DRI_CONF_SECTION_QUALITY
       DRI_CONF_FORCE_S3TC_ENABLE(false)
@@ -76,7 +87,7 @@ PUBLIC const char __driConfigOptions[] =
    DRI_CONF_SECTION_END
 DRI_CONF_END;
 
-const GLuint __driNConfigOptions = 8;
+const GLuint __driNConfigOptions = 10;
 
 #ifdef USE_NEW_INTERFACE
 static PFNGLXCREATECONTEXTMODES create_context_modes = NULL;
@@ -305,7 +316,7 @@ intelDestroyScreen(__DRIscreenPrivate * sPriv)
 
    dri_bufmgr_destroy(intelScreen->bufmgr);
    intelUnmapScreenRegions(intelScreen);
-   driDestroyOptionCache(&intelScreen->optionCache);
+   driDestroyOptionInfo(&intelScreen->optionCache);
 
    FREE(intelScreen);
    sPriv->private = NULL;
@@ -587,7 +598,7 @@ intel_init_bufmgr(intelScreenPrivate *intelScreen)
    GLboolean gem_supported;
    struct drm_i915_getparam gp;
    __DRIscreenPrivate *spriv = intelScreen->driScrnPriv;
-   int num_fences;
+   int num_fences = 0;
 
    intelScreen->no_hw = getenv("INTEL_NO_HW") != NULL;
 
diff --git a/src/mesa/drivers/dri/intel/intel_tex.c b/src/mesa/drivers/dri/intel/intel_tex.c
index ae0994b183a..df63f29a42c 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.c
+++ b/src/mesa/drivers/dri/intel/intel_tex.c
@@ -158,81 +158,11 @@ timed_memcpy(void *dest, const void *src, size_t n)
 }
 #endif /* DO_DEBUG */
 
-/**
- * Generate new mipmap data from BASE+1 to BASE+p (the minimally-sized mipmap
- * level).
- *
- * The texture object's miptree must be mapped.
- *
- * It would be really nice if this was just called by Mesa whenever mipmaps
- * needed to be regenerated, rather than us having to remember to do so in
- * each texture image modification path.
- *
- * This function should also include an accelerated path.
- */
-void
-intel_generate_mipmap(GLcontext *ctx, GLenum target,
-                      struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-   GLuint nr_faces = (intelObj->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
-   int face, i;
-
-   _mesa_generate_mipmap(ctx, target, texObj);
-
-   /* Update the level information in our private data in the new images, since
-    * it didn't get set as part of a normal TexImage path.
-    */
-   for (face = 0; face < nr_faces; face++) {
-      for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
-         struct intel_texture_image *intelImage;
-
-	 intelImage = intel_texture_image(texObj->Image[face][i]);
-	 if (intelImage == NULL)
-	    break;
-
-	 intelImage->level = i;
-	 intelImage->face = face;
-	 /* Unreference the miptree to signal that the new Data is a bare
-	  * pointer from mesa.
-	  */
-	 intel_miptree_release(intel, &intelImage->mt);
-      }
-   }
-}
-
-static void intelGenerateMipmap(GLcontext *ctx, GLenum target, struct gl_texture_object *texObj)
-{
-   struct intel_context *intel = intel_context(ctx);
-   struct intel_texture_object *intelObj = intel_texture_object(texObj);
-
-   intel_tex_map_level_images(intel, intelObj, texObj->BaseLevel);
-   intel_generate_mipmap(ctx, target, texObj);
-   intel_tex_unmap_level_images(intel, intelObj, texObj->BaseLevel);
-}
-
 void
 intelInitTextureFuncs(struct dd_function_table *functions)
 {
    functions->ChooseTextureFormat = intelChooseTextureFormat;
-   functions->TexImage1D = intelTexImage1D;
-   functions->TexImage2D = intelTexImage2D;
-   functions->TexImage3D = intelTexImage3D;
-   functions->TexSubImage1D = intelTexSubImage1D;
-   functions->TexSubImage2D = intelTexSubImage2D;
-   functions->TexSubImage3D = intelTexSubImage3D;
-   functions->CopyTexImage1D = intelCopyTexImage1D;
-   functions->CopyTexImage2D = intelCopyTexImage2D;
-   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
-   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
-   functions->GetTexImage = intelGetTexImage;
-   functions->GenerateMipmap = intelGenerateMipmap;
-
-   /* compressed texture functions */
-   functions->CompressedTexImage2D = intelCompressedTexImage2D;
-   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
-   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+   functions->GenerateMipmap = intel_generate_mipmap;
 
    functions->NewTextureObject = intelNewTextureObject;
    functions->NewTextureImage = intelNewTextureImage;
diff --git a/src/mesa/drivers/dri/intel/intel_tex.h b/src/mesa/drivers/dri/intel/intel_tex.h
index f5372d82fb2..471aa2a240b 100644
--- a/src/mesa/drivers/dri/intel/intel_tex.h
+++ b/src/mesa/drivers/dri/intel/intel_tex.h
@@ -35,116 +35,17 @@
 
 void intelInitTextureFuncs(struct dd_function_table *functions);
 
+void intelInitTextureImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureSubImageFuncs(struct dd_function_table *functions);
+
+void intelInitTextureCopyImageFuncs(struct dd_function_table *functions);
+
 const struct gl_texture_format *intelChooseTextureFormat(GLcontext * ctx,
                                                          GLint internalFormat,
                                                          GLenum format,
                                                          GLenum type);
 
-
-void intelTexImage3D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint depth,
-                     GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage3D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset, GLint zoffset,
-                        GLsizei width, GLsizei height, GLsizei depth,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage2D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint height, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage2D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset, GLint yoffset,
-                        GLsizei width, GLsizei height,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelTexImage1D(GLcontext * ctx,
-                     GLenum target, GLint level,
-                     GLint internalFormat,
-                     GLint width, GLint border,
-                     GLenum format, GLenum type, const void *pixels,
-                     const struct gl_pixelstore_attrib *packing,
-                     struct gl_texture_object *texObj,
-                     struct gl_texture_image *texImage);
-
-void intelTexSubImage1D(GLcontext * ctx,
-                        GLenum target,
-                        GLint level,
-                        GLint xoffset,
-                        GLsizei width,
-                        GLenum format, GLenum type,
-                        const GLvoid * pixels,
-                        const struct gl_pixelstore_attrib *packing,
-                        struct gl_texture_object *texObj,
-                        struct gl_texture_image *texImage);
-
-void intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLint border);
-
-void intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
-                         GLenum internalFormat,
-                         GLint x, GLint y, GLsizei width, GLsizei height,
-                         GLint border);
-
-void intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint x, GLint y, GLsizei width);
-
-void intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-                            GLint xoffset, GLint yoffset,
-                            GLint x, GLint y, GLsizei width, GLsizei height);
-
-void intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
-                      GLenum format, GLenum type, GLvoid * pixels,
-                      struct gl_texture_object *texObj,
-                      struct gl_texture_image *texImage);
-
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage );
-
-void intelCompressedTexSubImage2D(GLcontext * ctx,
-				  GLenum target,
-				  GLint level,
-				  GLint xoffset, GLint yoffset,
-				  GLsizei width, GLsizei height,
-				  GLenum format, GLsizei imageSize,
-				  const GLvoid * pixels,
-				  struct gl_texture_object *texObj,
-				  struct gl_texture_image *texImage);
-
-void intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
-				GLvoid *pixels,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage);
-
 void intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		       unsigned long long offset, GLint depth, GLuint pitch);
 void intelSetTexBuffer(__DRIcontext *pDRICtx,
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index 90bbb8c6bbf..028b49c14d3 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -73,11 +73,8 @@ get_teximage_source(struct intel_context *intel, GLenum internalFormat)
       return NULL;
    case GL_RGBA:
    case GL_RGBA8:
-      return intel_readbuf_region(intel);
    case GL_RGB:
-      if (intel->ctx.Visual.rgbBits == 16)
-         return intel_readbuf_region(intel);
-      return NULL;
+      return intel_readbuf_region(intel);
    default:
       return NULL;
    }
@@ -99,14 +96,24 @@ do_copy_texsubimage(struct intel_context *intel,
 
    if (!intelImage->mt || !src) {
       if (INTEL_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s fail %p %p\n",
-		 __FUNCTION__, intelImage->mt, src);
+	 fprintf(stderr, "%s fail %p %p (0x%08x)\n",
+		 __FUNCTION__, intelImage->mt, src, internalFormat);
+      return GL_FALSE;
+   }
+
+   if (intelImage->mt->cpp != src->cpp) {
+      if (INTEL_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "%s fail %d vs %d cpp\n",
+		 __FUNCTION__, intelImage->mt->cpp, src->cpp);
       return GL_FALSE;
    }
 
    intelFlush(ctx);
    LOCK_HARDWARE(intel);
    {
+      drm_intel_bo *dst_bo = intel_region_buffer(intel,
+						 intelImage->mt->region,
+						 INTEL_WRITE_PART);
       GLuint image_offset = intel_miptree_image_offset(intelImage->mt,
                                                        intelImage->face,
                                                        intelImage->level);
@@ -118,8 +125,12 @@ do_copy_texsubimage(struct intel_context *intel,
       dstx += x - orig_x;
       dsty += y - orig_y;
 
-      /* image_offset may be non-page-aligned, but that's illegal for tiling. */
-      assert(intelImage->mt->region->tiling == I915_TILING_NONE);
+      /* Can't blit to tiled buffers with non-tile-aligned offset. */
+      if (intelImage->mt->region->tiling != I915_TILING_NONE &&
+	  (image_offset & 4095) != 0) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
 
       if (ctx->ReadBuffer->Name == 0) {
 	 /* reading from a window, adjust x, y */
@@ -140,35 +151,35 @@ do_copy_texsubimage(struct intel_context *intel,
 	 src_pitch = src->pitch;
       }
 
-      intelEmitCopyBlit(intel,
-			intelImage->mt->cpp,
-			src_pitch,
-			src->buffer,
-			0,
-			src->tiling,
-			intelImage->mt->pitch,
-			intelImage->mt->region->buffer,
-			image_offset,
-			intelImage->mt->region->tiling,
-			x, y, dstx, dsty, width, height,
-			GL_COPY);
+      if (!intelEmitCopyBlit(intel,
+			     intelImage->mt->cpp,
+			     src_pitch,
+			     src->buffer,
+			     0,
+			     src->tiling,
+			     intelImage->mt->pitch,
+			     dst_bo,
+			     image_offset,
+			     intelImage->mt->region->tiling,
+			     x, y, dstx, dsty, width, height,
+			     GL_COPY)) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
    }
 
    UNLOCK_HARDWARE(intel);
 
    /* GL_SGIS_generate_mipmap */
    if (intelImage->level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      ctx->Driver.GenerateMipmap(ctx, target, texObj);
+      intel_generate_mipmap(ctx, target, texObj);
    }
 
    return GL_TRUE;
 }
 
 
-
-
-
-void
+static void
 intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLint border)
@@ -214,7 +225,8 @@ intelCopyTexImage1D(GLcontext * ctx, GLenum target, GLint level,
                            width, border);
 }
 
-void
+
+static void
 intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
                     GLenum internalFormat,
                     GLint x, GLint y, GLsizei width, GLsizei height,
@@ -262,7 +274,7 @@ intelCopyTexImage2D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-void
+static void
 intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint x, GLint y, GLsizei width)
 {
@@ -287,8 +299,7 @@ intelCopyTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
 }
 
 
-
-void
+static void
 intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                        GLint xoffset, GLint yoffset,
                        GLint x, GLint y, GLsizei width, GLsizei height)
@@ -301,7 +312,6 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
       _mesa_select_tex_image(ctx, texObj, target, level);
    GLenum internalFormat = texImage->InternalFormat;
 
-
    /* Need to check texture is compatible with source format. 
     */
 
@@ -316,3 +326,13 @@ intelCopyTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
                                  xoffset, yoffset, x, y, width, height);
    }
 }
+
+
+void
+intelInitTextureCopyImageFuncs(struct dd_function_table *functions)
+{
+   functions->CopyTexImage1D = intelCopyTexImage1D;
+   functions->CopyTexImage2D = intelCopyTexImage2D;
+   functions->CopyTexSubImage1D = intelCopyTexSubImage1D;
+   functions->CopyTexSubImage2D = intelCopyTexSubImage2D;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_image.c b/src/mesa/drivers/dri/intel/intel_tex_image.c
index 5e61e9e95ec..c5f52208376 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_image.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_image.c
@@ -131,6 +131,7 @@ guess_and_alloc_mipmap_tree(struct intel_context *intel,
       comp_byte = intel_compressed_num_bytes(intelImage->base.TexFormat->MesaFormat);
    intelObj->mt = intel_miptree_create(intel,
                                        intelObj->base.Target,
+                                       intelImage->base._BaseFormat,
                                        intelImage->base.InternalFormat,
                                        firstLevel,
                                        lastLevel,
@@ -205,7 +206,7 @@ try_pbo_upload(struct intel_context *intel,
    GLuint src_offset, src_stride;
    GLuint dst_offset, dst_stride;
 
-   if (!pbo ||
+   if (unpack->BufferObj->Name == 0 ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
       DBG("%s: failure 1\n", __FUNCTION__);
@@ -235,12 +236,15 @@ try_pbo_upload(struct intel_context *intel,
 					       INTEL_WRITE_FULL);
 
 
-      intelEmitCopyBlit(intel,
-                        intelImage->mt->cpp,
-                        src_stride, src_buffer, src_offset, GL_FALSE,
-                        dst_stride, dst_buffer, dst_offset, GL_FALSE,
-                        0, 0, 0, 0, width, height,
-			GL_COPY);
+      if (!intelEmitCopyBlit(intel,
+			     intelImage->mt->cpp,
+			     src_stride, src_buffer, src_offset, GL_FALSE,
+			     dst_stride, dst_buffer, dst_offset, GL_FALSE,
+			     0, 0, 0, 0, width, height,
+			     GL_COPY)) {
+	 UNLOCK_HARDWARE(intel);
+	 return GL_FALSE;
+      }
    }
    UNLOCK_HARDWARE(intel);
 
@@ -248,7 +252,6 @@ try_pbo_upload(struct intel_context *intel,
 }
 
 
-
 static GLboolean
 try_pbo_zcopy(struct intel_context *intel,
               struct intel_texture_image *intelImage,
@@ -261,7 +264,7 @@ try_pbo_zcopy(struct intel_context *intel,
    GLuint src_offset, src_stride;
    GLuint dst_offset, dst_stride;
 
-   if (!pbo ||
+   if (unpack->BufferObj->Name == 0 ||
        intel->ctx._ImageTransferState ||
        unpack->SkipPixels || unpack->SkipRows) {
       DBG("%s: failure 1\n", __FUNCTION__);
@@ -293,10 +296,6 @@ try_pbo_zcopy(struct intel_context *intel,
 }
 
 
-
-
-
-
 static void
 intelTexImage(GLcontext * ctx,
               GLint dims,
@@ -307,7 +306,8 @@ intelTexImage(GLcontext * ctx,
               GLenum format, GLenum type, const void *pixels,
               const struct gl_pixelstore_attrib *unpack,
               struct gl_texture_object *texObj,
-              struct gl_texture_image *texImage, GLsizei imageSize, int compressed)
+              struct gl_texture_image *texImage, GLsizei imageSize,
+              GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_object *intelObj = intel_texture_object(texObj);
@@ -316,7 +316,6 @@ intelTexImage(GLcontext * ctx,
    GLint postConvHeight = height;
    GLint texelBytes, sizeInBytes;
    GLuint dstRowStride = 0, srcRowStride = texImage->RowStride;
-   GLboolean needs_map;
 
    DBG("%s target %s level %d %dx%dx%d border %d\n", __FUNCTION__,
        _mesa_lookup_enum_by_nr(target), level, width, height, depth, border);
@@ -414,7 +413,9 @@ intelTexImage(GLcontext * ctx,
        * a miptree, so create one just for our level and store it in the image.
        * It'll get moved into the object miptree at validate time.
        */
-      intelImage->mt = intel_miptree_create(intel, target, internalFormat,
+      intelImage->mt = intel_miptree_create(intel, target,
+					    intelImage->base.TexFormat->BaseFormat,
+					    internalFormat,
 					    level, level,
 					    width, height, depth,
 					    intelImage->base.TexFormat->TexelBytes,
@@ -426,7 +427,7 @@ intelTexImage(GLcontext * ctx,
     */
    if (dims <= 2 &&
        intelImage->mt &&
-       intel_buffer_object(unpack->BufferObj) &&
+       unpack->BufferObj->Name != 0 &&
        check_pbo_format(internalFormat, format,
                         type, intelImage->base.TexFormat)) {
 
@@ -464,8 +465,6 @@ intelTexImage(GLcontext * ctx,
       DBG("pbo upload failed\n");
    }
 
-
-
    /* intelCopyTexImage calls this function with pixels == NULL, with
     * the expectation that the mipmap tree will be set up but nothing
     * more will be done.  This is where those calls return:
@@ -482,15 +481,8 @@ intelTexImage(GLcontext * ctx,
 
    LOCK_HARDWARE(intel);
 
-   /* Two cases where we need a mapping of the miptree: when the user supplied
-    * data is mapped as well (non-PBO, memcpy upload) or when we're going to do
-    * (software) mipmap generation.
-    */
-   needs_map = (pixels != NULL) || (level == texObj->BaseLevel &&
-				  texObj->GenerateMipmap);
-
    if (intelImage->mt) {
-      if (needs_map)
+      if (pixels != NULL)
          texImage->Data = intel_miptree_image_map(intel,
                                                   intelImage->mt,
                                                   intelImage->face,
@@ -547,25 +539,26 @@ intelTexImage(GLcontext * ctx,
 						   format, type, pixels, unpack)) {
 	   _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
        }
-
-       /* GL_SGIS_generate_mipmap */
-       if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-	  intel_generate_mipmap(ctx, target, texObj);
-       }
    }
 
    _mesa_unmap_teximage_pbo(ctx, unpack);
 
    if (intelImage->mt) {
-      if (needs_map)
+      if (pixels != NULL)
          intel_miptree_image_unmap(intel, intelImage->mt);
       texImage->Data = NULL;
    }
 
    UNLOCK_HARDWARE(intel);
+
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
 }
 
-void
+
+static void
 intelTexImage3D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -578,11 +571,11 @@ intelTexImage3D(GLcontext * ctx,
 {
    intelTexImage(ctx, 3, target, level,
                  internalFormat, width, height, depth, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
 
-void
+static void
 intelTexImage2D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -594,10 +587,11 @@ intelTexImage2D(GLcontext * ctx,
 {
    intelTexImage(ctx, 2, target, level,
                  internalFormat, width, height, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void
+
+static void
 intelTexImage1D(GLcontext * ctx,
                 GLenum target, GLint level,
                 GLint internalFormat,
@@ -609,21 +603,24 @@ intelTexImage1D(GLcontext * ctx,
 {
    intelTexImage(ctx, 1, target, level,
                  internalFormat, width, 1, 1, border,
-                 format, type, pixels, unpack, texObj, texImage, 0, 0);
+                 format, type, pixels, unpack, texObj, texImage, 0, GL_FALSE);
 }
 
-void intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-				GLint internalFormat,
-				GLint width, GLint height, GLint border,
-				GLsizei imageSize, const GLvoid *data,
-				struct gl_texture_object *texObj,
-				struct gl_texture_image *texImage )
+
+static void
+intelCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
+                           GLint internalFormat,
+                           GLint width, GLint height, GLint border,
+                           GLsizei imageSize, const GLvoid *data,
+                           struct gl_texture_object *texObj,
+                           struct gl_texture_image *texImage )
 {
    intelTexImage(ctx, 2, target, level,
 		 internalFormat, width, height, 1, border,
-		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, 1);
+		 0, 0, data, &ctx->Unpack, texObj, texImage, imageSize, GL_TRUE);
 }
 
+
 /**
  * Need to map texture image into memory before copying image data,
  * then unmap it.
@@ -632,7 +629,7 @@ static void
 intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
 		    GLenum format, GLenum type, GLvoid * pixels,
 		    struct gl_texture_object *texObj,
-		    struct gl_texture_image *texImage, int compressed)
+		    struct gl_texture_image *texImage, GLboolean compressed)
 {
    struct intel_context *intel = intel_context(ctx);
    struct intel_texture_image *intelImage = intel_texture_image(texImage);
@@ -686,28 +683,29 @@ intel_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
    }
 }
 
-void
+
+static void
 intelGetTexImage(GLcontext * ctx, GLenum target, GLint level,
                  GLenum format, GLenum type, GLvoid * pixels,
                  struct gl_texture_object *texObj,
                  struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, format, type, pixels,
-		       texObj, texImage, 0);
-
-
+		       texObj, texImage, GL_FALSE);
 }
 
-void
+
+static void
 intelGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
 			   GLvoid *pixels,
 			   struct gl_texture_object *texObj,
 			   struct gl_texture_image *texImage)
 {
    intel_get_tex_image(ctx, target, level, 0, 0, pixels,
-		       texObj, texImage, 1);
+		       texObj, texImage, GL_TRUE);
 }
 
+
 void
 intelSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 		  unsigned long long offset, GLint depth, GLuint pitch)
@@ -816,3 +814,16 @@ intelSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
     */
    intelSetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
 }
+
+
+void
+intelInitTextureImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexImage1D = intelTexImage1D;
+   functions->TexImage2D = intelTexImage2D;
+   functions->TexImage3D = intelTexImage3D;
+   functions->GetTexImage = intelGetTexImage;
+
+   functions->CompressedTexImage2D = intelCompressedTexImage2D;
+   functions->GetCompressedTexImage = intelGetCompressedTexImage;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_layout.c b/src/mesa/drivers/dri/intel/intel_tex_layout.c
index e6f9a417790..7d69ea4484a 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_layout.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_layout.c
@@ -35,26 +35,39 @@
 #include "intel_context.h"
 #include "main/macros.h"
 
-GLuint intel_compressed_alignment(GLenum internalFormat)
+void intel_get_texture_alignment_unit(GLenum internalFormat, GLuint *w, GLuint *h)
 {
-    GLuint alignment = 4;
-
     switch (internalFormat) {
     case GL_COMPRESSED_RGB_FXT1_3DFX:
     case GL_COMPRESSED_RGBA_FXT1_3DFX:
-        alignment = 8;
+        *w = 8;
+        *h = 4;
+        break;
+
+    case GL_RGB_S3TC:
+    case GL_RGB4_S3TC:
+    case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+    case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+    case GL_RGBA_S3TC:
+    case GL_RGBA4_S3TC:
+    case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+    case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+        *w = 4;
+        *h = 4;
         break;
 
     default:
+        *w = 4;
+        *h = 2;
         break;
     }
-
-    return alignment;
 }
 
-void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt )
+void i945_miptree_layout_2d( struct intel_context *intel,
+			     struct intel_mipmap_tree *mt,
+			     uint32_t tiling )
 {
-   GLint align_h = 2, align_w = 4;
+   GLuint align_h = 2, align_w = 4;
    GLuint level;
    GLuint x = 0;
    GLuint y = 0;
@@ -62,9 +75,9 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
    GLuint height = mt->height0;
 
    mt->pitch = mt->width0;
+   intel_get_texture_alignment_unit(mt->internal_format, &align_w, &align_h);
 
    if (mt->compressed) {
-       align_w = intel_compressed_alignment(mt->internal_format);
        mt->pitch = ALIGN(mt->width0, align_w);
    }
 
@@ -92,7 +105,7 @@ void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tr
    /* Pitch must be a whole number of dwords, even though we
     * express it in texels.
     */
-   mt->pitch = intel_miptree_pitch_align (intel, mt, mt->pitch);
+   mt->pitch = intel_miptree_pitch_align (intel, mt, tiling, mt->pitch);
    mt->total_height = 0;
 
    for ( level = mt->first_level ; level <= mt->last_level ; level++ ) {
diff --git a/src/mesa/drivers/dri/intel/intel_tex_layout.h b/src/mesa/drivers/dri/intel/intel_tex_layout.h
index dbc90e6f9b7..c9de9b56784 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_layout.h
+++ b/src/mesa/drivers/dri/intel/intel_tex_layout.h
@@ -38,5 +38,7 @@ static GLuint minify( GLuint d )
    return MAX2(1, d>>1);
 }
 
-extern void i945_miptree_layout_2d( struct intel_context *intel, struct intel_mipmap_tree *mt );
-extern GLuint intel_compressed_alignment(GLenum);
+extern void i945_miptree_layout_2d(struct intel_context *intel,
+				   struct intel_mipmap_tree *mt,
+				   uint32_t tiling);
+extern void intel_get_texture_alignment_unit(GLenum, GLuint *, GLuint *);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_subimage.c b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
index f86de568976..1f27131dac0 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_subimage.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_subimage.c
@@ -101,11 +101,6 @@ intelTexSubimage(GLcontext * ctx,
       _mesa_error(ctx, GL_OUT_OF_MEMORY, "intelTexSubImage");
    }
 
-   /* GL_SGIS_generate_mipmap */
-   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
-      intel_generate_mipmap(ctx, target, texObj);
-   }
-
    _mesa_unmap_teximage_pbo(ctx, packing);
 
    if (intelImage->mt) {
@@ -114,13 +109,15 @@ intelTexSubimage(GLcontext * ctx,
    }
 
    UNLOCK_HARDWARE(intel);
-}
-
-
 
+   /* GL_SGIS_generate_mipmap */
+   if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+      intel_generate_mipmap(ctx, target, texObj);
+   }
+}
 
 
-void
+static void
 intelTexSubImage3D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -132,18 +129,15 @@ intelTexSubImage3D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 3,
                     target, level,
                     xoffset, yoffset, zoffset,
                     width, height, depth,
                     format, type, pixels, packing, texObj, texImage);
-
 }
 
 
-
-void
+static void
 intelTexSubImage2D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -155,17 +149,15 @@ intelTexSubImage2D(GLcontext * ctx,
                    struct gl_texture_object *texObj,
                    struct gl_texture_image *texImage)
 {
-
    intelTexSubimage(ctx, 2,
                     target, level,
                     xoffset, yoffset, 0,
                     width, height, 1,
                     format, type, pixels, packing, texObj, texImage);
-
 }
 
 
-void
+static void
 intelTexSubImage1D(GLcontext * ctx,
                    GLenum target,
                    GLint level,
@@ -182,10 +174,9 @@ intelTexSubImage1D(GLcontext * ctx,
                     xoffset, 0, 0,
                     width, 1, 1,
                     format, type, pixels, packing, texObj, texImage);
-
 }
 
-void
+static void
 intelCompressedTexSubImage2D(GLcontext * ctx,
 			     GLenum target,
 			     GLint level,
@@ -199,3 +190,14 @@ intelCompressedTexSubImage2D(GLcontext * ctx,
    fprintf(stderr, "stubbed CompressedTexSubImage2D: %dx%d@%dx%d\n",
 	   width, height, xoffset, yoffset);
 }
+
+
+
+void
+intelInitTextureSubImageFuncs(struct dd_function_table *functions)
+{
+   functions->TexSubImage1D = intelTexSubImage1D;
+   functions->TexSubImage2D = intelTexSubImage2D;
+   functions->TexSubImage3D = intelTexSubImage3D;
+   functions->CompressedTexSubImage2D = intelCompressedTexSubImage2D;
+}
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 05a375e1f3b..a284d5475f4 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -199,6 +199,7 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
    if (!intelObj->mt) {
       intelObj->mt = intel_miptree_create(intel,
                                           intelObj->base.Target,
+                                          firstImage->base._BaseFormat,
                                           firstImage->base.InternalFormat,
                                           intelObj->firstLevel,
                                           intelObj->lastLevel,
@@ -241,7 +242,7 @@ intel_tex_map_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intelImage->base.Data =
 	    intel_miptree_image_map(intel,
 				    intelImage->mt,
@@ -268,7 +269,7 @@ intel_tex_unmap_level_images(struct intel_context *intel,
       struct intel_texture_image *intelImage =
 	 intel_texture_image(intelObj->base.Image[face][level]);
 
-      if (intelImage->mt) {
+      if (intelImage && intelImage->mt) {
 	 intel_miptree_image_unmap(intel, intelImage->mt);
 	 intelImage->base.Data = NULL;
       }
diff --git a/src/mesa/drivers/dri/r200/.gitignore b/src/mesa/drivers/dri/r200/.gitignore
index 3773d8ea73f..2f9cd1a987b 100644
--- a/src/mesa/drivers/dri/r200/.gitignore
+++ b/src/mesa/drivers/dri/r200/.gitignore
@@ -1,3 +1,15 @@
+radeon_bocs_wrapper.h
+radeon_bo_legacy.[ch]
 radeon_chipset.h
-radeon_screen.*
+radeon_cmdbuf.h
+radeon_common.[ch]
+radeon_common_context.[ch]
+radeon_cs_legacy.[ch]
+radeon_dma.[ch]
+radeon_fbo.c
+radeon_lock.[ch]
+radeon_mipmap_tree.[ch]
+radeon_screen.[ch]
+radeon_span.[ch]
+radeon_texture.[ch]
 server
diff --git a/src/mesa/drivers/dri/r200/Makefile b/src/mesa/drivers/dri/r200/Makefile
index e9144ac75ce..4686241957b 100644
--- a/src/mesa/drivers/dri/r200/Makefile
+++ b/src/mesa/drivers/dri/r200/Makefile
@@ -3,6 +3,8 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
+CFLAGS += $(RADEON_CFLAGS)
+
 LIBNAME = r200_dri.so
 
 MINIGLX_SOURCES = server/radeon_dri.c 
@@ -11,25 +13,41 @@ ifeq ($(USING_EGL), 1)
 EGL_SOURCES = server/radeon_egl.c
 endif
 
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
+RADEON_COMMON_SOURCES = \
+	radeon_texture.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_dma.c \
+	radeon_lock.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_mipmap_tree.c \
+	radeon_span.c \
+	radeon_fbo.c
+
+
 DRIVER_SOURCES = r200_context.c \
 		 r200_ioctl.c \
-		 r200_lock.c \
 		 r200_state.c \
 		 r200_state_init.c \
 		 r200_cmdbuf.c \
 		 r200_pixel.c \
 		 r200_tex.c \
-		 r200_texmem.c \
 		 r200_texstate.c \
 		 r200_tcl.c \
 		 r200_swtcl.c \
-		 r200_span.c \
 		 r200_maos.c \
 		 r200_sanity.c \
 		 r200_fragshader.c \
 		 r200_vertprog.c \
 		 radeon_screen.c \
-		 $(EGL_SOURCES)
+		 $(EGL_SOURCES) \
+		 $(RADEON_COMMON_SOURCES) \
+		 $(CS_SOURCES)
 
 C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
 
@@ -48,7 +66,31 @@ SYMLINKS = \
 COMMON_SYMLINKS = \
 	radeon_chipset.h \
 	radeon_screen.c \
-	radeon_screen.h
+	radeon_screen.h \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_bo_legacy.h \
+	radeon_cs_legacy.h \
+	radeon_bocs_wrapper.h \
+	radeon_span.h \
+	radeon_span.c \
+	radeon_lock.c \
+	radeon_lock.h \
+	radeon_common.c \
+	radeon_common_context.c \
+	radeon_common_context.h \
+	radeon_common.h \
+	radeon_cmdbuf.h \
+	radeon_mipmap_tree.c \
+	radeon_mipmap_tree.h \
+	radeon_texture.c \
+	radeon_texture.h \
+	radeon_dma.c \
+	radeon_dma.h \
+	radeon_fbo.c \
+	$(CS_SOURCES)
+
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 ##### TARGETS #####
 
diff --git a/src/mesa/drivers/dri/r200/r200_cmdbuf.c b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
index e1633772a18..d49f4fabe72 100644
--- a/src/mesa/drivers/dri/r200/r200_cmdbuf.c
+++ b/src/mesa/drivers/dri/r200/r200_cmdbuf.c
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "main/simple_list.h"
 
+#include "radeon_common.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -45,18 +46,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_sanity.h"
 #include "radeon_reg.h"
 
-static void print_state_atom( struct r200_state_atom *state )
-{
-   int i;
-
-   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
-
-   if (0 & R200_DEBUG & DEBUG_VERBOSE) 
-      for (i = 0 ; i < state->cmd_size ; i++) 
-	 fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
-
-}
-
 /* The state atoms will be emitted in the order they appear in the atom list,
  * so this step is important.
  */
@@ -64,141 +53,85 @@ void r200SetUpAtomList( r200ContextPtr rmesa )
 {
    int i, mtu;
 
-   mtu = rmesa->glCtx->Const.MaxTextureUnits;
-
-   make_empty_list(&rmesa->hw.atomlist);
-   rmesa->hw.atomlist.name = "atom-list";
-
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ctx );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.set );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lin );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msk );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpt );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vtx );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vap );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vte );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msc );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cst );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.zbs );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcl );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.msl );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tcg );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.grd );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.fog );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tam );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tf );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.atf );
+   mtu = rmesa->radeon.glCtx->Const.MaxTextureUnits;
+
+   make_empty_list(&rmesa->radeon.hw.atomlist);
+   rmesa->radeon.hw.atomlist.name = "atom-list";
+
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.ctx );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.set );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.lin );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.msk );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpt );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vtx );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vap );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vte );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.msc );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.cst );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.zbs );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tcl );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.msl );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tcg );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.grd );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.fog );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tam );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tf );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.atf );
    for (i = 0; i < mtu; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.tex[i] );
+       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.tex[i] );
    for (i = 0; i < mtu; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.cube[i] );
+       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.cube[i] );
    for (i = 0; i < 6; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pix[i] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[0] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.afs[1] );
+       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.pix[i] );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.afs[0] );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.afs[1] );
    for (i = 0; i < 8; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.lit[i] );
+       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.lit[i] );
    for (i = 0; i < 3 + mtu; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mat[i] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.eye );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.glt );
+       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.mat[i] );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.eye );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.glt );
    for (i = 0; i < 2; ++i)
-      insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.mtl[i] );
+      insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.mtl[i] );
    for (i = 0; i < 6; ++i)
-       insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ucp[i] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.spr );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.ptp );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.prf );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.pvs );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[0] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpp[1] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[0] );
-   insert_at_tail( &rmesa->hw.atomlist, &rmesa->hw.vpi[1] );
-}
-
-static void r200SaveHwState( r200ContextPtr rmesa )
-{
-   struct r200_state_atom *atom;
-   char * dest = rmesa->backup_store.cmd_buf;
-
-   if (R200_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   rmesa->backup_store.cmd_used = 0;
-
-   foreach( atom, &rmesa->hw.atomlist ) {
-      if ( atom->check( rmesa->glCtx, atom->idx ) ) {
-	 int size = atom->cmd_size * 4;
-	 memcpy( dest, atom->cmd, size);
-	 dest += size;
-	 rmesa->backup_store.cmd_used += size;
-	 if (R200_DEBUG & DEBUG_STATE)
-	    print_state_atom( atom );
-      }
-   }
-
-   assert( rmesa->backup_store.cmd_used <= R200_CMD_BUF_SZ );
-   if (R200_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Returning to r200EmitState\n");
+       insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.ucp[i] );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.spr );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.ptp );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.prf );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.pvs );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpp[0] );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpp[1] );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpi[0] );
+   insert_at_tail( &rmesa->radeon.hw.atomlist, &rmesa->hw.vpi[1] );
 }
 
-void r200EmitState( r200ContextPtr rmesa )
+void r200EmitScissor(r200ContextPtr rmesa)
 {
-   char *dest;
-   int mtu;
-   struct r200_state_atom *atom;
-
-   if (R200_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   if (rmesa->save_on_next_emit) {
-      r200SaveHwState(rmesa);
-      rmesa->save_on_next_emit = GL_FALSE;
-   }
-
-   if (!rmesa->hw.is_dirty && !rmesa->hw.all_dirty)
-      return;
-
-   mtu = rmesa->glCtx->Const.MaxTextureUnits;
-
-   /* To avoid going across the entire set of states multiple times, just check
-    * for enough space for the case of emitting all state, and inline the
-    * r200AllocCmdBuf code here without all the checks.
-    */
-   r200EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size );
-
-   /* we need to calculate dest after EnsureCmdBufSpace
-      as we may flush the buffer - airlied */
-   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-   if (R200_DEBUG & DEBUG_STATE) {
-      foreach( atom, &rmesa->hw.atomlist ) {
-	 if ( atom->dirty || rmesa->hw.all_dirty ) {
-	    if ( atom->check( rmesa->glCtx, atom->idx ) )
-	       print_state_atom( atom );
-	    else
-	       fprintf(stderr, "skip state %s\n", atom->name);
-	 }
-      }
-   }
-
-   foreach( atom, &rmesa->hw.atomlist ) {
-      if ( rmesa->hw.all_dirty )
-	 atom->dirty = GL_TRUE;
-      if ( atom->dirty ) {
-	 if ( atom->check( rmesa->glCtx, atom->idx ) ) {
-	    int size = atom->cmd_size * 4;
-	    memcpy( dest, atom->cmd, size);
-	    dest += size;
-	    rmesa->store.cmd_used += size;
-	    atom->dirty = GL_FALSE;
-	 }
-      }
-   }
-
-   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
-
-   rmesa->hw.is_dirty = GL_FALSE;
-   rmesa->hw.all_dirty = GL_FALSE;
+    BATCH_LOCALS(&rmesa->radeon);
+    if (!rmesa->radeon.radeonScreen->kernel_mm) {
+       return;
+    }
+    if (rmesa->radeon.state.scissor.enabled) {
+        BEGIN_BATCH(8);
+        OUT_BATCH(CP_PACKET0(R200_RE_CNTL, 0));
+        OUT_BATCH(R200_SCISSOR_ENABLE | rmesa->hw.set.cmd[SET_RE_CNTL]);
+        OUT_BATCH(CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0));
+        OUT_BATCH(R200_SCISSOR_ENABLE_0);
+        OUT_BATCH(CP_PACKET0(R200_RE_SCISSOR_TL_0, 0));
+        OUT_BATCH((rmesa->radeon.state.scissor.rect.y1 << 16) |
+                  rmesa->radeon.state.scissor.rect.x1);
+        OUT_BATCH(CP_PACKET0(R200_RE_SCISSOR_BR_0, 0));
+        OUT_BATCH(((rmesa->radeon.state.scissor.rect.y2 - 1) << 16) |
+                  (rmesa->radeon.state.scissor.rect.x2 - 1));
+        END_BATCH();
+    } else {
+        BEGIN_BATCH(4);
+        OUT_BATCH(CP_PACKET0(R200_RE_CNTL, 0));
+        OUT_BATCH(rmesa->hw.set.cmd[SET_RE_CNTL] & ~R200_SCISSOR_ENABLE);
+        OUT_BATCH(CP_PACKET0(R200_RE_AUX_SCISSOR_CNTL, 0));
+        OUT_BATCH(0);
+        END_BATCH();
+    }
 }
 
 /* Fire a section of the retained (indexed_verts) buffer as a regular
@@ -208,51 +141,81 @@ void r200EmitVbufPrim( r200ContextPtr rmesa,
                        GLuint primitive,
                        GLuint vertex_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
+   BATCH_LOCALS(&rmesa->radeon);
 
    assert(!(primitive & R200_VF_PRIM_WALK_IND));
    
-   r200EmitState( rmesa );
+   radeonEmitState(&rmesa->radeon);
    
    if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
       fprintf(stderr, "%s cmd_used/4: %d prim %x nr %d\n", __FUNCTION__,
 	      rmesa->store.cmd_used/4, primitive, vertex_nr);
-   
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VBUF_BUFSZ,
-						  __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = R200_CP_CMD_3D_DRAW_VBUF_2;
-   cmd[2].i = (primitive | 
-	       R200_VF_PRIM_WALK_LIST |
-	       R200_VF_COLOR_ORDER_RGBA |
-	       (vertex_nr << R200_VF_VERTEX_NUMBER_SHIFT));
+   r200EmitScissor(rmesa);
+ 
+   BEGIN_BATCH(3);
+   OUT_BATCH_PACKET3_CLIP(R200_CP_CMD_3D_DRAW_VBUF_2, 0);
+   OUT_BATCH(primitive | R200_VF_PRIM_WALK_LIST | R200_VF_COLOR_ORDER_RGBA |
+	     (vertex_nr << R200_VF_VERTEX_NUMBER_SHIFT));
+   END_BATCH();
 }
 
+static void r200FireEB(r200ContextPtr rmesa, int vertex_count, int type)
+{
+	BATCH_LOCALS(&rmesa->radeon);
+
+	if (vertex_count > 0) {
+        r200EmitScissor(rmesa);
+		BEGIN_BATCH(8+2);
+		OUT_BATCH_PACKET3_CLIP(R200_CP_CMD_3D_DRAW_INDX_2, 0);
+		OUT_BATCH(R200_VF_PRIM_WALK_IND |
+			  R200_VF_COLOR_ORDER_RGBA | 
+			  ((vertex_count + 0) << 16) |
+			  type);
+		
+		if (!rmesa->radeon.radeonScreen->kernel_mm) {
+			OUT_BATCH_PACKET3(R200_CP_CMD_INDX_BUFFER, 2);
+			OUT_BATCH((0x80 << 24) | (0 << 16) | 0x810);
+			OUT_BATCH_RELOC(rmesa->radeon.tcl.elt_dma_offset,
+					rmesa->radeon.tcl.elt_dma_bo,
+					rmesa->radeon.tcl.elt_dma_offset,
+					RADEON_GEM_DOMAIN_GTT, 0, 0);
+			OUT_BATCH((vertex_count + 1)/2);
+		} else {
+			OUT_BATCH_PACKET3(R200_CP_CMD_INDX_BUFFER, 2);
+			OUT_BATCH((0x80 << 24) | (0 << 16) | 0x810);
+			OUT_BATCH(rmesa->radeon.tcl.elt_dma_offset);
+			OUT_BATCH((vertex_count + 1)/2);
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.elt_dma_bo,
+					      RADEON_GEM_DOMAIN_GTT, 0, 0);
+		}
+		END_BATCH();
+	}
+}
 
-void r200FlushElts( r200ContextPtr rmesa )
+void r200FlushElts(GLcontext *ctx)
 {
-   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
-   int dwords;
-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 12)) / 2;
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   int nr, elt_used = rmesa->tcl.elt_used;
 
    if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
+     fprintf(stderr, "%s %x %d\n", __FUNCTION__, rmesa->tcl.hw_primitive, elt_used);
+
+   assert( rmesa->radeon.dma.flush == r200FlushElts );
+   rmesa->radeon.dma.flush = NULL;
 
-   assert( rmesa->dma.flush == r200FlushElts );
-   rmesa->dma.flush = NULL;
+   nr = elt_used / 2;
 
-   /* Cope with odd number of elts:
-    */
-   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
-   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
+   radeon_bo_unmap(rmesa->radeon.tcl.elt_dma_bo);
 
-   cmd[1] |= (dwords - 3) << 16;
-   cmd[2] |= nr << R200_VF_VERTEX_NUMBER_SHIFT;
+   r200FireEB(rmesa, nr, rmesa->tcl.hw_primitive);
+
+   radeon_bo_unref(rmesa->radeon.tcl.elt_dma_bo);
+   rmesa->radeon.tcl.elt_dma_bo = NULL;
 
    if (R200_DEBUG & DEBUG_SYNC) {
       fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-      r200Finish( rmesa->glCtx );
+      radeonFinish( rmesa->radeon.glCtx );
    }
 }
 
@@ -261,38 +224,45 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
 				    GLuint primitive,
 				    GLuint min_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
    GLushort *retval;
+   int ret;
 
    if (R200_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
 
    assert((primitive & R200_VF_PRIM_WALK_IND));
    
-   r200EmitState( rmesa );
-   
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, ELTS_BUFSZ(min_nr),
-						__FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = R200_CP_CMD_3D_DRAW_INDX_2;
-   cmd[2].i = (primitive | 
-	       R200_VF_PRIM_WALK_IND |
-	       R200_VF_COLOR_ORDER_RGBA);
+   radeonEmitState(&rmesa->radeon);
+
+#ifdef RADEON_DEBUG_BO
+   rmesa->radeon.tcl.elt_dma_bo = radeon_bo_open(rmesa->radeon.radeonScreen->bom,
+					  0, R200_ELT_BUF_SZ, 4,
+						 RADEON_GEM_DOMAIN_GTT, 0, "ELT");
+#else
+   rmesa->radeon.tcl.elt_dma_bo = radeon_bo_open(rmesa->radeon.radeonScreen->bom,
+					  0, R200_ELT_BUF_SZ, 4,
+					  RADEON_GEM_DOMAIN_GTT, 0);
+#endif
+   rmesa->radeon.tcl.elt_dma_offset = 0;
+   rmesa->tcl.elt_used = min_nr * 2;
+
+   ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, rmesa->radeon.tcl.elt_dma_bo,
+				 RADEON_GEM_DOMAIN_GTT, 0);
+   if (ret) {
+      fprintf(stderr,"failure to revalidate BOs - badness\n");
+   }
 
+   radeon_bo_map(rmesa->radeon.tcl.elt_dma_bo, 1);
+   retval = rmesa->radeon.tcl.elt_dma_bo->ptr + rmesa->radeon.tcl.elt_dma_offset;
    
-   retval = (GLushort *)(cmd+3);
 
    if (R200_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x prim %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, primitive);
-
-   assert(!rmesa->dma.flush);
-   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-   rmesa->dma.flush = r200FlushElts;
+      fprintf(stderr, "%s: header prim %x \n",
+	      __FUNCTION__, primitive);
 
-   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
+   assert(!rmesa->radeon.dma.flush);
+   rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+   rmesa->radeon.dma.flush = r200FlushElts;
 
    return retval;
 }
@@ -300,129 +270,119 @@ GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
 
 
 void r200EmitVertexAOS( r200ContextPtr rmesa,
-			  GLuint vertex_size,
-			  GLuint offset )
+			GLuint vertex_size,
+ 			struct radeon_bo *bo,
+			GLuint offset )
 {
-   drm_radeon_cmd_header_t *cmd;
+   BATCH_LOCALS(&rmesa->radeon);
 
    if (R200_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
       fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
 	      __FUNCTION__, vertex_size, offset);
 
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
-						  __FUNCTION__ );
 
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (2 << 16);
-   cmd[2].i = 1;
-   cmd[3].i = vertex_size | (vertex_size << 8);
-   cmd[4].i = offset;
+   BEGIN_BATCH(7);
+   OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, 2);
+   OUT_BATCH(1);
+   OUT_BATCH(vertex_size | (vertex_size << 8));
+   OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   END_BATCH();
 }
-		       
 
-void r200EmitAOS( r200ContextPtr rmesa,
-		    struct r200_dma_region **component,
-		    GLuint nr,
-		    GLuint offset )
+void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset)
 {
-   drm_radeon_cmd_header_t *cmd;
-   int sz = AOS_BUFSZ(nr);
+   BATCH_LOCALS(&rmesa->radeon);
+   uint32_t voffset;
+   int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
    int i;
-   int *tmp;
-
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s nr arrays: %d\n", __FUNCTION__, nr);
-
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sz, __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = R200_CP_CMD_3D_LOAD_VBPNTR | (((sz / sizeof(int)) - 3) << 16);
-   cmd[2].i = nr;
-   tmp = &cmd[0].i;
-   cmd += 3;
-
-   for (i = 0 ; i < nr ; i++) {
-      if (i & 1) {
-	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
-		      (component[i]->aos_size << 16));
-	 cmd[2].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
-	 cmd += 3;
+   
+   if (RADEON_DEBUG & DEBUG_VERTS)
+      fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
+	      offset);
+
+   BEGIN_BATCH(sz+2+ (nr*2));
+   OUT_BATCH_PACKET3(R200_CP_CMD_3D_LOAD_VBPNTR, sz - 1);
+   OUT_BATCH(nr);
+
+    
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+			
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i+1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
       }
-      else {
-	 cmd[0].i = ((component[i]->aos_stride << 8) | 
-		     (component[i]->aos_size << 0));
-	 cmd[1].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
+      
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[nr - 1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
+      }
+   } else {
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+	 
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH(voffset);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH(voffset);
+      }
+      
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH(voffset);
+      }
+      for (i = 0; i + 1 < nr; i += 2) {
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+0].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
+      }
+      if (nr & 1) {
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[nr-1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
       }
    }
-
-   if (R200_DEBUG & DEBUG_VERTS) {
-      fprintf(stderr, "%s:\n", __FUNCTION__);
-      for (i = 0 ; i < sz ; i++)
-	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
-   }
-}
-
-void r200EmitBlit( r200ContextPtr rmesa,
-		   GLuint color_fmt,
-		   GLuint src_pitch,
-		   GLuint src_offset,
-		   GLuint dst_pitch,
-		   GLuint dst_offset,
-		   GLint srcx, GLint srcy,
-		   GLint dstx, GLint dsty,
-		   GLuint w, GLuint h )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-	      __FUNCTION__, 
-	      src_pitch, src_offset, srcx, srcy,
-	      dst_pitch, dst_offset, dstx, dsty,
-	      w, h);
-
-   assert( (src_pitch & 63) == 0 );
-   assert( (dst_pitch & 63) == 0 );
-   assert( (src_offset & 1023) == 0 );
-   assert( (dst_offset & 1023) == 0 );
-   assert( w < (1<<16) );
-   assert( h < (1<<16) );
-
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 8 * sizeof(int),
-						  __FUNCTION__ );
-
-
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = R200_CP_CMD_BITBLT_MULTI | (5 << 16);
-   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_BRUSH_NONE |
-	       (color_fmt << 8) |
-	       RADEON_GMC_SRC_DATATYPE_COLOR |
-	       RADEON_ROP3_S |
-	       RADEON_DP_SRC_SOURCE_MEMORY |
-	       RADEON_GMC_CLR_CMP_CNTL_DIS |
-	       RADEON_GMC_WR_MSK_DIS );
-
-   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);
-   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);
-   cmd[5].i = (srcx << 16) | srcy;
-   cmd[6].i = (dstx << 16) | dsty; /* dst */
-   cmd[7].i = (w << 16) | h;
-}
-
-
-void r200EmitWait( r200ContextPtr rmesa, GLuint flags )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
-
-   cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, 1 * sizeof(int),
-					   __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].wait.cmd_type = RADEON_CMD_WAIT;
-   cmd[0].wait.flags = flags;
+   END_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r200/r200_context.c b/src/mesa/drivers/dri/r200/r200_context.c
index 058296eab23..9a92a320797 100644
--- a/src/mesa/drivers/dri/r200/r200_context.c
+++ b/src/mesa/drivers/dri/r200/r200_context.c
@@ -54,7 +54,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_state.h"
-#include "r200_span.h"
 #include "r200_pixel.h"
 #include "r200_tex.h"
 #include "r200_swtcl.h"
@@ -62,6 +61,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_maos.h"
 #include "r200_vertprog.h"
 
+#include "radeon_span.h"
+
 #define need_GL_ARB_vertex_program
 #define need_GL_ATI_fragment_shader
 #define need_GL_EXT_blend_minmax
@@ -71,6 +72,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_EXT_blend_func_separate
 #define need_GL_NV_vertex_program
 #define need_GL_ARB_point_parameters
+#define need_GL_EXT_framebuffer_object
 #include "extension_helper.h"
 
 #define DRIVER_DATE	"20060602"
@@ -78,9 +80,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h" /* for symbolic values of enum-type options */
-#ifndef R200_DEBUG
-int R200_DEBUG = (0);
-#endif
 
 /* Return various strings for glGetString().
  */
@@ -89,8 +88,8 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    static char buffer[128];
    unsigned   offset;
-   GLuint agp_mode = (rmesa->r200Screen->card_type == RADEON_CARD_PCI)? 0 :
-      rmesa->r200Screen->AGPMode;
+   GLuint agp_mode = (rmesa->radeon.radeonScreen->card_type == RADEON_CARD_PCI)? 0 :
+      rmesa->radeon.radeonScreen->AGPMode;
 
    switch ( name ) {
    case GL_VENDOR:
@@ -101,7 +100,7 @@ static const GLubyte *r200GetString( GLcontext *ctx, GLenum name )
 				     agp_mode );
 
       sprintf( & buffer[ offset ], " %sTCL",
-	       !(rmesa->TclFallback & R200_TCL_FALLBACK_TCL_DISABLE)
+	       !(rmesa->radeon.TclFallback & R200_TCL_FALLBACK_TCL_DISABLE)
 	       ? "" : "NO-" );
 
       return (GLubyte *)buffer;
@@ -126,6 +125,7 @@ const struct dri_extension card_extensions[] =
     { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
     { "GL_EXT_blend_subtract",             NULL },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+    { "GL_EXT_packed_depth_stencil",	   NULL},
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_edge_clamp",         NULL },
@@ -167,6 +167,11 @@ const struct dri_extension point_extensions[] = {
     { NULL,                                NULL }
 };
 
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
 extern const struct tnl_pipeline_stage _r200_render_stage;
 extern const struct tnl_pipeline_stage _r200_tcl_stage;
 
@@ -234,6 +239,39 @@ static const struct dri_debug_control debug_control[] =
     { NULL,    0 }
 };
 
+static void r200_get_lock(radeonContextPtr radeon)
+{
+   r200ContextPtr rmesa = (r200ContextPtr)radeon;
+   drm_radeon_sarea_t *sarea = radeon->sarea;
+
+   R200_STATECHANGE( rmesa, ctx );
+   if (rmesa->radeon.sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+   }
+   else rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &= ~R200_COLOR_TILE_ENABLE;
+
+   if ( sarea->ctx_owner != rmesa->radeon.dri.hwContext ) {
+      sarea->ctx_owner = rmesa->radeon.dri.hwContext;
+      if (!radeon->radeonScreen->kernel_mm)
+         radeon_bo_legacy_texture_age(radeon->radeonScreen->bom);
+   }
+
+}
+
+static void r200_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+}
+
+
+static void r200_init_vtbl(radeonContextPtr radeon)
+{
+   radeon->vtbl.get_lock = r200_get_lock;
+   radeon->vtbl.update_viewport_offset = r200UpdateViewportOffset;
+   radeon->vtbl.emit_cs_header = r200_vtbl_emit_cs_header;
+   radeon->vtbl.swtcl_flush = r200_swtcl_flush;
+   radeon->vtbl.fallback = r200Fallback;
+}
+
 
 /* Create the device specific rendering context.
  */
@@ -245,9 +283,9 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
    struct dd_function_table functions;
    r200ContextPtr rmesa;
-   GLcontext *ctx, *shareCtx;
+   GLcontext *ctx;
    int i;
-   int tcl_mode, fthrottle_mode;
+   int tcl_mode;
 
    assert(glVisual);
    assert(driContextPriv);
@@ -257,7 +295,8 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    rmesa = (r200ContextPtr) CALLOC( sizeof(*rmesa) );
    if ( !rmesa )
       return GL_FALSE;
-      
+
+   r200_init_vtbl(&rmesa->radeon);
    /* init exp fog table data */
    r200InitStaticFogData();
 
@@ -265,12 +304,12 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
     * Do this here so that initialMaxAnisotropy is set before we create
     * the default textures.
     */
-   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
+   driParseConfigFiles (&rmesa->radeon.optionCache, &screen->optionCache,
 			screen->driScreen->myNum, "r200");
-   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
-                                                 "def_max_anisotropy");
+   rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
+							"def_max_anisotropy");
 
-   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
+   if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
       if ( sPriv->drm_version.minor < 13 )
 	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
 			  "disabling.\n", sPriv->drm_version.minor );
@@ -291,59 +330,15 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    r200InitTextureFuncs(&functions);
    r200InitShaderFuncs(&functions); 
 
-   /* Allocate and initialize the Mesa context */
-   if (sharedContextPrivate)
-      shareCtx = ((r200ContextPtr) sharedContextPrivate)->glCtx;
-   else
-      shareCtx = NULL;
-   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
-                                       &functions, (void *) rmesa);
-   if (!rmesa->glCtx) {
-      FREE(rmesa);
-      return GL_FALSE;
-   }
-   driContextPriv->driverPrivate = rmesa;
-
-   /* Init r200 context data */
-   rmesa->dri.context = driContextPriv;
-   rmesa->dri.screen = sPriv;
-   rmesa->dri.drawable = NULL; /* Set by XMesaMakeCurrent */
-   rmesa->dri.hwContext = driContextPriv->hHWContext;
-   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
-   rmesa->dri.fd = sPriv->fd;
-   rmesa->dri.drmMinor = sPriv->drm_version.minor;
-
-   rmesa->r200Screen = screen;
-   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
-				       screen->sarea_priv_offset);
-
-
-   rmesa->dma.buf0_address = rmesa->r200Screen->buffers->list[0].address;
-
-   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
-   make_empty_list( & rmesa->swapped );
-
-   rmesa->nr_heaps = 1 /* screen->numTexHeaps */ ;
-   assert(rmesa->nr_heaps < RADEON_NR_TEX_HEAPS);
-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
-	    screen->texSize[i],
-	    12,
-	    RADEON_NR_TEX_REGIONS,
-	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
-	    & rmesa->sarea->tex_age[i],
-	    & rmesa->swapped,
-	    sizeof( r200TexObj ),
-	    (destroy_texture_object_t *) r200DestroyTexObj );
+   if (!radeonInitContext(&rmesa->radeon, &functions,
+			  glVisual, driContextPriv,
+			  sharedContextPrivate)) {
+     FREE(rmesa);
+     return GL_FALSE;
    }
-   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
-					   "texture_depth");
-   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
-      rmesa->texture_depth = ( screen->cpp == 4 ) ?
-	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
 
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->hw.all_dirty = 1;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.hw.all_dirty = 1;
 
    /* Set the maximum texture size small enough that we can guarentee that
     * all texture units can bind a maximal texture and have all of them in
@@ -351,29 +346,20 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
     * setting allow larger textures.
     */
 
-   ctx = rmesa->glCtx;
-   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
+   ctx = rmesa->radeon.glCtx;
+   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->radeon.optionCache,
 						 "texture_units");
    ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
    ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
 
-   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
-
-   driCalculateMaxTextureLevels( rmesa->texture_heaps,
-				 rmesa->nr_heaps,
-				 & ctx->Const,
-				 4,
-				 11, /* max 2D texture size is 2048x2048 */
-#if ENABLE_HW_3D_TEXTURE
-				 8,  /* max 3D texture size is 256^3 */
-#else
-				 0,  /* 3D textures unsupported */
-#endif
-				 11, /* max cube texture size is 2048x2048 */
-				 11, /* max texture rectangle size is 2048x2048 */
-				 12,
-				 GL_FALSE,
-				 i );
+   i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
+
+   /* FIXME: When no memory manager is available we should set this 
+    * to some reasonable value based on texture memory pool size */
+   ctx->Const.MaxTextureLevels = 12;
+   ctx->Const.Max3DTextureLevels = 9;
+   ctx->Const.MaxCubeTextureLevels = 12;
+   ctx->Const.MaxTextureRectSize = 2048;
 
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
@@ -383,7 +369,7 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    ctx->Const.MinPointSizeAA = 1.0;
    ctx->Const.MaxPointSizeAA = 1.0;
    ctx->Const.PointSizeGranularity = 0.0625;
-   if (rmesa->r200Screen->drmSupportsPointSprites)
+   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites)
       ctx->Const.MaxPointSize = 2047.0;
    else
       ctx->Const.MaxPointSize = 1.0;
@@ -441,32 +427,35 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
    _math_matrix_set_identity( &rmesa->tmpmat );
 
    driInitExtensions( ctx, card_extensions, GL_TRUE );
-   if (!(rmesa->r200Screen->chip_flags & R200_CHIPSET_YCBCR_BROKEN)) {
+
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     driInitExtensions(ctx, mm_extensions, GL_FALSE);
+   if (!(rmesa->radeon.radeonScreen->chip_flags & R200_CHIPSET_YCBCR_BROKEN)) {
      /* yuv textures don't work with some chips - R200 / rv280 okay so far
 	others get the bit ordering right but don't actually do YUV-RGB conversion */
       _mesa_enable_extension( ctx, "GL_MESA_ycbcr_texture" );
    }
-   if (rmesa->glCtx->Mesa_DXTn) {
+   if (rmesa->radeon.glCtx->Mesa_DXTn) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
       _mesa_enable_extension( ctx, "GL_S3_s3tc" );
    }
-   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
+   else if (driQueryOptionb (&rmesa->radeon.optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
    }
 
-   if (rmesa->r200Screen->drmSupportsCubeMapsR200)
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR200)
       _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
-   if (rmesa->r200Screen->drmSupportsBlendColor) {
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
        driInitExtensions( ctx, blend_extensions, GL_FALSE );
    }
-   if(rmesa->r200Screen->drmSupportsVertexProgram)
+   if(rmesa->radeon.radeonScreen->drmSupportsVertexProgram)
       driInitSingleExtension( ctx, ARB_vp_extension );
-   if(driQueryOptionb(&rmesa->optionCache, "nv_vertex_program"))
+   if(driQueryOptionb(&rmesa->radeon.optionCache, "nv_vertex_program"))
       driInitSingleExtension( ctx, NV_vp_extension );
 
-   if ((ctx->Const.MaxTextureUnits == 6) && rmesa->r200Screen->drmSupportsFragShader)
+   if ((ctx->Const.MaxTextureUnits == 6) && rmesa->radeon.radeonScreen->drmSupportsFragShader)
       driInitSingleExtension( ctx, ATI_fs_extension );
-   if (rmesa->r200Screen->drmSupportsPointSprites)
+   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites)
       driInitExtensions( ctx, point_extensions, GL_FALSE );
 #if 0
    r200InitDriverFuncs( ctx );
@@ -476,33 +465,16 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 #endif
    /* plug in a few more device driver functions */
    /* XXX these should really go right after _mesa_init_driver_functions() */
+   radeon_fbo_init(&rmesa->radeon);
+   radeonInitSpanFuncs( ctx );
    r200InitPixelFuncs( ctx );
-   r200InitSpanFuncs( ctx );
    r200InitTnlFuncs( ctx );
    r200InitState( rmesa );
    r200InitSwtcl( ctx );
 
-   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
-   rmesa->iw.irq_seq = -1;
-   rmesa->irqsEmitted = 0;
-   rmesa->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
-		     rmesa->r200Screen->irq);
-
-   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
-
-   if (!rmesa->do_irqs)
-      fprintf(stderr,
-	      "IRQ's not enabled, falling back to %s: %d %d\n",
-	      rmesa->do_usleeps ? "usleeps" : "busy waits",
-	      fthrottle_mode,
-	      rmesa->r200Screen->irq);
-
    rmesa->prefer_gart_client_texturing = 
       (getenv("R200_GART_CLIENT_TEXTURES") != 0);
 
-   (*sPriv->systemTime->getUST)( & rmesa->swap_ust );
-
-
 #if DO_DEBUG
    R200_DEBUG  = driParseDebugString( getenv( "R200_DEBUG" ),
 				      debug_control );
@@ -510,202 +482,21 @@ GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 				      debug_control );
 #endif
 
-   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
-   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
+   tcl_mode = driQueryOptioni(&rmesa->radeon.optionCache, "tcl_mode");
+   if (driQueryOptionb(&rmesa->radeon.optionCache, "no_rast")) {
       fprintf(stderr, "disabling 3D acceleration\n");
       FALLBACK(rmesa, R200_FALLBACK_DISABLE, 1);
    }
    else if (tcl_mode == DRI_CONF_TCL_SW || getenv("R200_NO_TCL") ||
-	    !(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
-      if (rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL) {
-	 rmesa->r200Screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	    !(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	 rmesa->radeon.radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
 	 fprintf(stderr, "Disabling HW TCL support\n");
       }
-      TCL_FALLBACK(rmesa->glCtx, R200_TCL_FALLBACK_TCL_DISABLE, 1);
+      TCL_FALLBACK(rmesa->radeon.glCtx, R200_TCL_FALLBACK_TCL_DISABLE, 1);
    }
 
    return GL_TRUE;
 }
 
 
-/* Destroy the device specific context.
- */
-/* Destroy the Mesa and driver specific context data.
- */
-void r200DestroyContext( __DRIcontextPrivate *driContextPriv )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   r200ContextPtr rmesa = (r200ContextPtr) driContextPriv->driverPrivate;
-   r200ContextPtr current = ctx ? R200_CONTEXT(ctx) : NULL;
-
-   /* check if we're deleting the currently bound context */
-   if (rmesa == current) {
-      R200_FIREVERTICES( rmesa );
-      _mesa_make_current(NULL, NULL, NULL);
-   }
-
-   /* Free r200 context resources */
-   assert(rmesa); /* should never be null */
-   if ( rmesa ) {
-      GLboolean   release_texture_heaps;
-
-
-      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
-      _swsetup_DestroyContext( rmesa->glCtx );
-      _tnl_DestroyContext( rmesa->glCtx );
-      _vbo_DestroyContext( rmesa->glCtx );
-      _swrast_DestroyContext( rmesa->glCtx );
-
-      r200DestroySwtcl( rmesa->glCtx );
-      r200ReleaseArrays( rmesa->glCtx, ~0 );
-
-      if (rmesa->dma.current.buf) {
-	 r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-	 r200FlushCmdBuf( rmesa, __FUNCTION__ );
-      }
-
-      if (rmesa->state.scissor.pClipRects) {
-	 FREE(rmesa->state.scissor.pClipRects);
-	 rmesa->state.scissor.pClipRects = NULL;
-      }
-
-      if ( release_texture_heaps ) {
-         /* This share group is about to go away, free our private
-          * texture object data.
-          */
-         int i;
-
-         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
-	    rmesa->texture_heaps[ i ] = NULL;
-         }
-
-	 assert( is_empty_list( & rmesa->swapped ) );
-      }
-
-      /* free the Mesa context */
-      rmesa->glCtx->DriverCtx = NULL;
-      _mesa_destroy_context( rmesa->glCtx );
-
-      /* free the option cache */
-      driDestroyOptionCache (&rmesa->optionCache);
-
-      FREE( rmesa );
-   }
-}
-
-
-
-
-void
-r200SwapBuffers( __DRIdrawablePrivate *dPriv )
-{
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      r200ContextPtr rmesa;
-      GLcontext *ctx;
-      rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-      ctx = rmesa->glCtx;
-      if (ctx->Visual.doubleBufferMode) {
-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
-         if ( rmesa->doPageFlip ) {
-            r200PageFlip( dPriv );
-         }
-         else {
-	     r200CopyBuffer( dPriv, NULL );
-         }
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
-   }
-}
-
-void
-r200CopySubBuffer( __DRIdrawablePrivate *dPriv,
-		   int x, int y, int w, int h )
-{
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      r200ContextPtr rmesa;
-      GLcontext *ctx;
-      rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-      ctx = rmesa->glCtx;
-      if (ctx->Visual.doubleBufferMode) {
-	 drm_clip_rect_t rect;
-	 rect.x1 = x + dPriv->x;
-	 rect.y1 = (dPriv->h - y - h) + dPriv->y;
-	 rect.x2 = rect.x1 + w;
-	 rect.y2 = rect.y1 + h;
-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
-	 r200CopyBuffer( dPriv, &rect );
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
-   }
-}
-
-/* Force the context `c' to be the current context and associate with it
- * buffer `b'.
- */
-GLboolean
-r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
-                   __DRIdrawablePrivate *driDrawPriv,
-                   __DRIdrawablePrivate *driReadPriv )
-{
-   if ( driContextPriv ) {
-      r200ContextPtr newCtx = 
-	 (r200ContextPtr) driContextPriv->driverPrivate;
-
-      if (R200_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)newCtx->glCtx);
-
-      newCtx->dri.readable = driReadPriv;
-
-      if ( newCtx->dri.drawable != driDrawPriv ||
-           newCtx->lastStamp != driDrawPriv->lastStamp ) {
-	 if (driDrawPriv->swap_interval == (unsigned)-1) {
-	    driDrawPriv->vblFlags = (newCtx->r200Screen->irq != 0)
-	       ? driGetDefaultVBlankFlags(&newCtx->optionCache)
-	       : VBLANK_FLAG_NO_IRQ;
-
-	    driDrawableInitVBlank( driDrawPriv );
-	 }
-
-	 newCtx->dri.drawable = driDrawPriv;
-
-	 r200SetCliprects(newCtx);
-	 r200UpdateViewportOffset( newCtx->glCtx );
-      }
-
-      _mesa_make_current( newCtx->glCtx,
-			  (GLframebuffer *) driDrawPriv->driverPrivate,
-			  (GLframebuffer *) driReadPriv->driverPrivate );
-
-      _mesa_update_state( newCtx->glCtx );
-      r200ValidateState( newCtx->glCtx );
-
-   } else {
-      if (R200_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-      _mesa_make_current( NULL, NULL, NULL );
-   }
-
-   if (R200_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "End %s\n", __FUNCTION__);
-   return GL_TRUE;
-}
-
-/* Force the context `c' to be unbound from its buffer.
- */
-GLboolean
-r200UnbindContext( __DRIcontextPrivate *driContextPriv )
-{
-   r200ContextPtr rmesa = (r200ContextPtr) driContextPriv->driverPrivate;
-
-   if (R200_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *)rmesa->glCtx);
-
-   return GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/r200/r200_context.h b/src/mesa/drivers/dri/r200/r200_context.h
index 14a1dda46ac..6267293817d 100644
--- a/src/mesa/drivers/dri/r200/r200_context.h
+++ b/src/mesa/drivers/dri/r200/r200_context.h
@@ -53,51 +53,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #error This driver requires a newer libdrm to compile
 #endif
 
+#include "radeon_screen.h"
+#include "radeon_common.h"
+
+#include "radeon_lock.h"
+
 struct r200_context;
 typedef struct r200_context r200ContextRec;
 typedef struct r200_context *r200ContextPtr;
 
-/* This union is used to avoid warnings/miscompilation
-   with float to uint32_t casts due to strict-aliasing */
-typedef union { GLfloat f; uint32_t ui32; } float_ui32_type;
-
-#include "r200_lock.h"
-#include "radeon_screen.h"
 #include "main/mm.h"
 
-/* Flags for software fallback cases */
-/* See correponding strings in r200_swtcl.c */
-#define R200_FALLBACK_TEXTURE           0x01
-#define R200_FALLBACK_DRAW_BUFFER       0x02
-#define R200_FALLBACK_STENCIL           0x04
-#define R200_FALLBACK_RENDER_MODE       0x08
-#define R200_FALLBACK_DISABLE           0x10
-#define R200_FALLBACK_BORDER_MODE       0x20
-
-/* The blit width for texture uploads
- */
-#define BLIT_WIDTH_BYTES 1024
-
-/* Use the templated vertex format:
- */
-#define COLOR_IS_RGBA
-#define TAG(x) r200##x
-#include "tnl_dd/t_dd_vertex.h"
-#undef TAG
-
-typedef void (*r200_tri_func)( r200ContextPtr,
-				 r200Vertex *,
-				 r200Vertex *,
-				 r200Vertex * );
-
-typedef void (*r200_line_func)( r200ContextPtr,
-				  r200Vertex *,
-				  r200Vertex * );
-
-typedef void (*r200_point_func)( r200ContextPtr,
-				   r200Vertex * );
-
-
 struct r200_vertex_program {
         struct gl_vertex_program mesa_program; /* Must be first */
         int translated;
@@ -112,93 +78,11 @@ struct r200_vertex_program {
         int fogmode;
 };
 
-struct r200_colorbuffer_state {
-   GLuint clear;
-#if 000
-   GLint drawOffset, drawPitch;
-#endif
-   int roundEnable;
-};
-
-
-struct r200_depthbuffer_state {
-   GLuint clear;
-   GLfloat scale;
-};
-
-#if 000
-struct r200_pixel_state {
-   GLint readOffset, readPitch;
-};
-#endif
-
-struct r200_scissor_state {
-   drm_clip_rect_t rect;
-   GLboolean enabled;
-
-   GLuint numClipRects;			/* Cliprects active */
-   GLuint numAllocedClipRects;		/* Cliprects available */
-   drm_clip_rect_t *pClipRects;
-};
-
-struct r200_stencilbuffer_state {
-   GLboolean hwBuffer;
-   GLuint clear;			/* rb3d_stencilrefmask value */
-};
-
-struct r200_stipple_state {
-   GLuint mask[32];
-};
-
-
-
-#define TEX_0   0x1
-#define TEX_1   0x2
-#define TEX_2	0x4
-#define TEX_3	0x8
-#define TEX_4	0x10
-#define TEX_5	0x20
-#define TEX_ALL 0x3f
-
-typedef struct r200_tex_obj r200TexObj, *r200TexObjPtr;
-
-/* Texture object in locally shared texture space.
- */
-struct r200_tex_obj {
-   driTextureObject   base;
-
-   GLuint bufAddr;			/* Offset to start of locally
-					   shared texture block */
-
-   GLuint dirty_state;		        /* Flags (1 per texunit) for
-					   whether or not this texobj
-					   has dirty hardware state
-					   (pp_*) that needs to be
-					   brought into the
-					   texunit. */
-
-   drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
-					/* Six, for the cube faces */
-   GLboolean image_override;		/* Image overridden by GLX_EXT_tfp */
-
-   GLuint pp_txfilter;		        /* hardware register values */
-   GLuint pp_txformat;
-   GLuint pp_txformat_x;
-   GLuint pp_txoffset;		        /* Image location in texmem.
-					   All cube faces follow. */
-   GLuint pp_txsize;		        /* npot only */
-   GLuint pp_txpitch;		        /* npot only */
-   GLuint pp_border_color;
-   GLuint pp_cubic_faces;	        /* cube face 1,2,3,4 log2 sizes */
-
-   GLboolean  border_fallback;
-
-   GLuint tile_bits;			/* hw texture tile bits used on this texture */
-};
+#define R200_TEX_ALL 0x3f
 
 
 struct r200_texture_env_state {
-   r200TexObjPtr texobj;
+   radeonTexObjPtr texobj;
    GLuint outputreg;
    GLuint unitneeded;
 };
@@ -210,19 +94,6 @@ struct r200_texture_state {
 };
 
 
-struct r200_state_atom {
-   struct r200_state_atom *next, *prev;
-   const char *name;		         /* for debug */
-   int cmd_size;		         /* size in bytes */
-   GLuint idx;
-   int *cmd;			         /* one or more cmd's */
-   int *lastcmd;			 /* one or more cmd's */
-   GLboolean dirty;
-   GLboolean (*check)( GLcontext *, int );    /* is this state active? */
-};
-   
-
-
 /* Trying to keep these relatively short as the variables are becoming
  * extravagently long.  Drop the driver name prefix off the front of
  * everything - I think we know which driver we're in by now, and keep the
@@ -597,181 +468,79 @@ struct r200_state_atom {
 
 
 struct r200_hw_state {
-   /* Head of the linked list of state atoms. */
-   struct r200_state_atom atomlist;
-
    /* Hardware state, stored as cmdbuf commands:  
     *   -- Need to doublebuffer for
     *           - reviving state after loss of context
     *           - eliding noop statechange loops? (except line stipple count)
     */
-   struct r200_state_atom ctx;
-   struct r200_state_atom set;
-   struct r200_state_atom vte;
-   struct r200_state_atom lin;
-   struct r200_state_atom msk;
-   struct r200_state_atom vpt;
-   struct r200_state_atom vap;
-   struct r200_state_atom vtx;
-   struct r200_state_atom tcl;
-   struct r200_state_atom msl;
-   struct r200_state_atom tcg;
-   struct r200_state_atom msc;
-   struct r200_state_atom cst;
-   struct r200_state_atom tam;
-   struct r200_state_atom tf;
-   struct r200_state_atom tex[6];
-   struct r200_state_atom cube[6];
-   struct r200_state_atom zbs;
-   struct r200_state_atom mtl[2];
-   struct r200_state_atom mat[9];
-   struct r200_state_atom lit[8]; /* includes vec, scl commands */
-   struct r200_state_atom ucp[6];
-   struct r200_state_atom pix[6]; /* pixshader stages */
-   struct r200_state_atom eye; /* eye pos */
-   struct r200_state_atom grd; /* guard band clipping */
-   struct r200_state_atom fog;
-   struct r200_state_atom glt;
-   struct r200_state_atom prf;
-   struct r200_state_atom afs[2];
-   struct r200_state_atom pvs;
-   struct r200_state_atom vpi[2];
-   struct r200_state_atom vpp[2];
-   struct r200_state_atom atf;
-   struct r200_state_atom spr;
-   struct r200_state_atom ptp;
-
-   int max_state_size;	/* Number of bytes necessary for a full state emit. */
-   GLboolean is_dirty, all_dirty;
+   struct radeon_state_atom ctx;
+   struct radeon_state_atom set;
+   struct radeon_state_atom vte;
+   struct radeon_state_atom lin;
+   struct radeon_state_atom msk;
+   struct radeon_state_atom vpt;
+   struct radeon_state_atom vap;
+   struct radeon_state_atom vtx;
+   struct radeon_state_atom tcl;
+   struct radeon_state_atom msl;
+   struct radeon_state_atom tcg;
+   struct radeon_state_atom msc;
+   struct radeon_state_atom cst;
+   struct radeon_state_atom tam;
+   struct radeon_state_atom tf;
+   struct radeon_state_atom tex[6];
+   struct radeon_state_atom cube[6];
+   struct radeon_state_atom zbs;
+   struct radeon_state_atom mtl[2];
+   struct radeon_state_atom mat[9];
+   struct radeon_state_atom lit[8]; /* includes vec, scl commands */
+   struct radeon_state_atom ucp[6];
+   struct radeon_state_atom pix[6]; /* pixshader stages */
+   struct radeon_state_atom eye; /* eye pos */
+   struct radeon_state_atom grd; /* guard band clipping */
+   struct radeon_state_atom fog;
+   struct radeon_state_atom glt;
+   struct radeon_state_atom prf;
+   struct radeon_state_atom afs[2];
+   struct radeon_state_atom pvs;
+   struct radeon_state_atom vpi[2];
+   struct radeon_state_atom vpp[2];
+   struct radeon_state_atom atf;
+   struct radeon_state_atom spr;
+   struct radeon_state_atom ptp;
 };
 
 struct r200_state {
    /* Derived state for internal purposes:
     */
-   struct r200_colorbuffer_state color;
-   struct r200_depthbuffer_state depth;
-#if 00
-   struct r200_pixel_state pixel;
-#endif
-   struct r200_scissor_state scissor;
-   struct r200_stencilbuffer_state stencil;
-   struct r200_stipple_state stipple;
+   struct radeon_stipple_state stipple;
    struct r200_texture_state texture;
    GLuint envneeded;
 };
 
-/* Need refcounting on dma buffers:
- */
-struct r200_dma_buffer {
-   int refcount;		/* the number of retained regions in buf */
-   drmBufPtr buf;
-};
-
-#define GET_START(rvb) (rmesa->r200Screen->gart_buffer_offset +		\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-
-/* A retained region, eg vertices for indexed vertices.
- */
-struct r200_dma_region {
-   struct r200_dma_buffer *buf;
-   char *address;		/* == buf->address */
-   int start, end, ptr;		/* offsets from start of buf */
-   int aos_start;
-   int aos_stride;
-   int aos_size;
-};
-
-
-struct r200_dma {
-   /* Active dma region.  Allocations for vertices and retained
-    * regions come from here.  Also used for emitting random vertices,
-    * these may be flushed by calling flush_current();
-    */
-   struct r200_dma_region current;
-   
-   void (*flush)( r200ContextPtr );
-
-   char *buf0_address;		/* start of buf[0], for index calcs */
-   GLuint nr_released_bufs;	/* flush after so many buffers released */
-};
-
-struct r200_dri_mirror {
-   __DRIcontextPrivate	*context;	/* DRI context */
-   __DRIscreenPrivate	*screen;	/* DRI screen */
-   __DRIdrawablePrivate	*drawable;	/* DRI drawable bound to this ctx */
-   __DRIdrawablePrivate	*readable;	/* DRI readable bound to this ctx */
-
-   drm_context_t hwContext;
-   drm_hw_lock_t *hwLock;
-   int fd;
-   int drmMinor;
-};
-
-
 #define R200_CMD_BUF_SZ  (16*1024) 
 
-struct r200_store {
-   GLuint statenr;
-   GLuint primnr;
-   char cmd_buf[R200_CMD_BUF_SZ];
-   int cmd_used;   
-   int elts_start;
-};
-
-
+#define R200_ELT_BUF_SZ  (16*1024) 
 /* r200_tcl.c
  */
 struct r200_tcl_info {
    GLuint hw_primitive;
 
-/* hw can handle 12 components max */
-   struct r200_dma_region *aos_components[12];
-   GLuint nr_aos_components;
-
    GLuint *Elts;
 
-   struct r200_dma_region indexed_verts;
-   struct r200_dma_region vertex_data[15];
+   int elt_used;
+
 };
 
 
 /* r200_swtcl.c
  */
 struct r200_swtcl_info {
-   GLuint RenderIndex;
-   
-   /**
-    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
-    * installed in the Mesa state vector.
-    */
-   GLuint vertex_size;
 
-   /**
-    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
-    * data in the hardware buffer.
-    */
-   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
 
-   /**
-    * Number of elements of \c ::vertex_attrs that are actually used.
-    */
-   GLuint vertex_attr_count;
-
-   /**
-    * Cached pointer to the buffer where Mesa will store vertex data.
-    */
-   GLubyte *verts;
-
-   /* Fallback rasterization functions
-    */
-   r200_point_func draw_point;
-   r200_line_func draw_line;
-   r200_tri_func draw_tri;
-
-   GLuint hw_primitive;
-   GLenum render_primitive;
-   GLuint numverts;
+   radeon_point_func draw_point;
+   radeon_line_func draw_line;
+   radeon_tri_func draw_tri;
 
    /**
     * Offset of the 4UB color data within a hardware (swtcl) vertex.
@@ -787,27 +556,10 @@ struct r200_swtcl_info {
     * Should Mesa project vertex data or will the hardware do it?
     */
    GLboolean needproj;
-
-   struct r200_dma_region indexed_verts;
-};
-
-
-struct r200_ioctl {
-   GLuint vertex_offset;
-   GLuint vertex_size;
 };
 
 
 
-#define R200_MAX_PRIMS 64
-
-
-
-struct r200_prim {
-   GLuint start;
-   GLuint end;
-   GLuint prim;
-};
 
    /* A maximum total of 29 elements per vertex:  3 floats for position, 3
     * floats for normal, 4 floats for color, 4 bytes for secondary color,
@@ -822,9 +574,8 @@ struct r200_prim {
 
 #define R200_MAX_VERTEX_SIZE ((3*6)+11)
 
-
 struct r200_context {
-   GLcontext *glCtx;			/* Mesa context */
+   struct radeon_context radeon;
 
    /* Driver and hardware state management
     */
@@ -832,56 +583,15 @@ struct r200_context {
    struct r200_state state;
    struct r200_vertex_program *curr_vp_hw;
 
-   /* Texture object bookkeeping
-    */
-   unsigned              nr_heaps;
-   driTexHeap          * texture_heaps[ RADEON_NR_TEX_HEAPS ];
-   driTextureObject      swapped;
-   int                   texture_depth;
-   float                 initialMaxAnisotropy;
-
-   /* Rasterization and vertex state:
-    */
-   GLuint TclFallback;
-   GLuint Fallback;
-   GLuint NewGLState;
-   DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
-
    /* Vertex buffers
     */
-   struct r200_ioctl ioctl;
-   struct r200_dma dma;
-   struct r200_store store;
-   /* A full state emit as of the first state emit in the main store, in case
-    * the context is lost.
-    */
-   struct r200_store backup_store;
-
-   /* Page flipping
-    */
-   GLuint doPageFlip;
-
-   /* Busy waiting
-    */
-   GLuint do_usleeps;
-   GLuint do_irqs;
-   GLuint irqsEmitted;
-   drm_radeon_irq_wait_t iw;
+   struct radeon_ioctl ioctl;
+   struct radeon_store store;
 
    /* Clientdata textures;
     */
    GLuint prefer_gart_client_texturing;
 
-   /* Drawable, cliprect and scissor information
-    */
-   GLuint numClipRects;			/* Cliprects for the draw buffer */
-   drm_clip_rect_t *pClipRects;
-   unsigned int lastStamp;
-   GLboolean lost_context;
-   GLboolean save_on_next_emit;
-   radeonScreenPtr r200Screen;	/* Screen private DRI data */
-   drm_radeon_sarea_t *sarea;		/* Private SAREA data */
-
    /* TCL stuff
     */
    GLmatrix TexGenMatrix[R200_MAX_TEXTURE_UNITS];
@@ -893,15 +603,6 @@ struct r200_context {
    GLuint TexGenCompSel;
    GLmatrix tmpmat;
 
-   /* buffer swap
-    */
-   int64_t swap_ust;
-   int64_t swap_missed_ust;
-
-   GLuint swap_count;
-   GLuint swap_missed_count;
-
-
    /* r200_tcl.c
     */
    struct r200_tcl_info tcl;
@@ -910,14 +611,6 @@ struct r200_context {
     */
    struct r200_swtcl_info swtcl;
 
-   /* Mirrors of some DRI state
-    */
-   struct r200_dri_mirror dri;
-
-   /* Configuration cache
-    */
-   driOptionCache optionCache;
-
    GLboolean using_hyperz;
    GLboolean texmicrotile;
 
@@ -927,28 +620,10 @@ struct r200_context {
 #define R200_CONTEXT(ctx)		((r200ContextPtr)(ctx->DriverCtx))
 
 
-static INLINE GLuint r200PackColor( GLuint cpp,
-					GLubyte r, GLubyte g,
-					GLubyte b, GLubyte a )
-{
-   switch ( cpp ) {
-   case 2:
-      return PACK_COLOR_565( r, g, b );
-   case 4:
-      return PACK_COLOR_8888( a, r, g, b );
-   default:
-      return 0;
-   }
-}
-
-
 extern void r200DestroyContext( __DRIcontextPrivate *driContextPriv );
 extern GLboolean r200CreateContext( const __GLcontextModes *glVisual,
 				    __DRIcontextPrivate *driContextPriv,
 				    void *sharedContextPrivate);
-extern void r200SwapBuffers( __DRIdrawablePrivate *dPriv );
-extern void r200CopySubBuffer( __DRIdrawablePrivate * dPriv,
-			       int x, int y, int w, int h );
 extern GLboolean r200MakeCurrent( __DRIcontextPrivate *driContextPriv,
 				  __DRIdrawablePrivate *driDrawPriv,
 				  __DRIdrawablePrivate *driReadPriv );
@@ -957,28 +632,9 @@ extern GLboolean r200UnbindContext( __DRIcontextPrivate *driContextPriv );
 /* ================================================================
  * Debugging:
  */
-#define DO_DEBUG		1
 
-#if DO_DEBUG
-extern int R200_DEBUG;
-#else
-#define R200_DEBUG		0
-#endif
+#define R200_DEBUG RADEON_DEBUG
+
 
-#define DEBUG_TEXTURE	0x001
-#define DEBUG_STATE	0x002
-#define DEBUG_IOCTL	0x004
-#define DEBUG_PRIMS	0x008
-#define DEBUG_VERTS	0x010
-#define DEBUG_FALLBACKS	0x020
-#define DEBUG_VFMT	0x040
-#define DEBUG_CODEGEN	0x080
-#define DEBUG_VERBOSE	0x100
-#define DEBUG_DRI       0x200
-#define DEBUG_DMA       0x400
-#define DEBUG_SANITY    0x800
-#define DEBUG_SYNC      0x1000
-#define DEBUG_PIXEL     0x2000
-#define DEBUG_MEMORY    0x4000
 
 #endif /* __R200_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_fragshader.c b/src/mesa/drivers/dri/r200/r200_fragshader.c
index d514b28219a..85c1b7bdd19 100644
--- a/src/mesa/drivers/dri/r200/r200_fragshader.c
+++ b/src/mesa/drivers/dri/r200/r200_fragshader.c
@@ -522,7 +522,7 @@ static void r200UpdateFSConstants( GLcontext *ctx )
 	 CLAMPED_FLOAT_TO_UBYTE(con_byte[2], ctx->ATIFragmentShader.GlobalConstants[i][2]);
 	 CLAMPED_FLOAT_TO_UBYTE(con_byte[3], ctx->ATIFragmentShader.GlobalConstants[i][3]);
       }
-      rmesa->hw.atf.cmd[ATF_TFACTOR_0 + i] = r200PackColor (
+      rmesa->hw.atf.cmd[ATF_TFACTOR_0 + i] = radeonPackColor (
 	 4, con_byte[0], con_byte[1], con_byte[2], con_byte[3] );
    }
 }
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.c b/src/mesa/drivers/dri/r200/r200_ioctl.c
index 0741e57af71..4dbda39eb95 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.c
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.c
@@ -31,7 +31,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * Authors:
  *   Keith Whitwell <[email protected]>
  */
- 
+
 #include <sched.h>
 #include <errno.h>
 
@@ -41,6 +41,10 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "swrast/swrast.h"
 
+
+
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -54,635 +58,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define R200_TIMEOUT             512
 #define R200_IDLE_RETRY           16
 
-
-static void r200WaitForIdle( r200ContextPtr rmesa );
-
-
-/* At this point we were in FlushCmdBufLocked but we had lost our context, so
- * we need to unwire our current cmdbuf, hook the one with the saved state in
- * it, flush it, and then put the current one back.  This is so commands at the
- * start of a cmdbuf can rely on the state being kept from the previous one.
- */
-static void r200BackUpAndEmitLostStateLocked( r200ContextPtr rmesa )
-{
-   GLuint nr_released_bufs;
-   struct r200_store saved_store;
-
-   if (rmesa->backup_store.cmd_used == 0)
-      return;
-
-   if (R200_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Emitting backup state on lost context\n");
-
-   rmesa->lost_context = GL_FALSE;
-
-   nr_released_bufs = rmesa->dma.nr_released_bufs;
-   saved_store = rmesa->store;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->store = rmesa->backup_store;
-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
-   rmesa->dma.nr_released_bufs = nr_released_bufs;
-   rmesa->store = saved_store;
-}
-
-int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller )
-{
-   int ret, i;
-   drm_radeon_cmd_buffer_t cmd;
-
-   if (rmesa->lost_context)
-      r200BackUpAndEmitLostStateLocked( rmesa );
-
-   if (R200_DEBUG & DEBUG_IOCTL) {
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-
-      if (0 & R200_DEBUG & DEBUG_VERBOSE) 
-	 for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
-	    fprintf(stderr, "%d: %x\n", i/4, 
-		    *(int *)(&rmesa->store.cmd_buf[i]));
-   }
-
-   if (R200_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
-	      rmesa->dma.nr_released_bufs);
-
-
-   if (R200_DEBUG & DEBUG_SANITY) {
-      if (rmesa->state.scissor.enabled) 
-	 ret = r200SanityCmdBuffer( rmesa, 
-				    rmesa->state.scissor.numClipRects,
-				    rmesa->state.scissor.pClipRects);
-      else
-	 ret = r200SanityCmdBuffer( rmesa, 
-				    rmesa->numClipRects,
-				    rmesa->pClipRects);
-      if (ret) {
-	 fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);	 
-	 goto out;
-      }
-   }
-
-
-   if (R200_DEBUG & DEBUG_MEMORY) {
-      if (! driValidateTextureHeaps( rmesa->texture_heaps, rmesa->nr_heaps,
-				     & rmesa->swapped ) ) {
-	 fprintf( stderr, "%s: texture memory is inconsistent - expect "
-		  "mangled textures\n", __FUNCTION__ );
-      }
-   }
-
-
-   cmd.bufsz = rmesa->store.cmd_used;
-   cmd.buf = rmesa->store.cmd_buf;
-
-   if (rmesa->state.scissor.enabled) {
-      cmd.nbox = rmesa->state.scissor.numClipRects;
-      cmd.boxes = (drm_clip_rect_t *)rmesa->state.scissor.pClipRects;
-   } else {
-      cmd.nbox = rmesa->numClipRects;
-      cmd.boxes = (drm_clip_rect_t *)rmesa->pClipRects;
-   }
-
-   ret = drmCommandWrite( rmesa->dri.fd,
-			  DRM_RADEON_CMDBUF,
-			  &cmd, sizeof(cmd) );
-
-   if (ret)
-      fprintf(stderr, "drmCommandWrite: %d\n", ret);
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
-      r200WaitForIdleLocked( rmesa );
-   }
-
-
- out:
-   rmesa->store.primnr = 0;
-   rmesa->store.statenr = 0;
-   rmesa->store.cmd_used = 0;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->save_on_next_emit = 1;
-
-   return ret;
-}
-
-
-/* Note: does not emit any commands to avoid recursion on
- * r200AllocCmdBuf.
- */
-void r200FlushCmdBuf( r200ContextPtr rmesa, const char *caller )
-{
-   int ret;
-
-   LOCK_HARDWARE( rmesa );
-
-   ret = r200FlushCmdBufLocked( rmesa, caller );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if (ret) {
-      fprintf(stderr, "drmRadeonCmdBuffer: %d (exiting)\n", ret);
-      exit(ret);
-   }
-}
-
-
-/* =============================================================
- * Hardware vertex buffer handling
- */
-
-
-void r200RefillCurrentDmaRegion( r200ContextPtr rmesa )
-{
-   struct r200_dma_buffer *dmabuf;
-   int fd = rmesa->dri.fd;
-   int index = 0;
-   int size = 0;
-   drmDMAReq dma;
-   int ret;
-
-   if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-      fprintf(stderr, "%s\n", __FUNCTION__);  
-
-   if (rmesa->dma.flush) {
-      rmesa->dma.flush( rmesa );
-   }
-
-   if (rmesa->dma.current.buf)
-      r200ReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-
-   if (rmesa->dma.nr_released_bufs > 4)
-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
-
-   dma.context = rmesa->dri.hwContext;
-   dma.send_count = 0;
-   dma.send_list = NULL;
-   dma.send_sizes = NULL;
-   dma.flags = 0;
-   dma.request_count = 1;
-   dma.request_size = RADEON_BUFFER_SIZE;
-   dma.request_list = &index;
-   dma.request_sizes = &size;
-   dma.granted_count = 0;
-
-   LOCK_HARDWARE(rmesa);	/* no need to validate */
-
-   while (1) {
-      ret = drmDMA( fd, &dma );
-      if (ret == 0)
-	 break;
-   
-      if (rmesa->dma.nr_released_bufs) {
-	 r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
-      }
-
-      if (rmesa->do_usleeps) {
-	 UNLOCK_HARDWARE( rmesa );
-	 DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa );
-      }
-   }
-
-   UNLOCK_HARDWARE(rmesa);
-
-   if (R200_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "Allocated buffer %d\n", index);
-
-   dmabuf = CALLOC_STRUCT( r200_dma_buffer );
-   dmabuf->buf = &rmesa->r200Screen->buffers->list[index];
-   dmabuf->refcount = 1;
-
-   rmesa->dma.current.buf = dmabuf;
-   rmesa->dma.current.address = dmabuf->buf->address;
-   rmesa->dma.current.end = dmabuf->buf->total;
-   rmesa->dma.current.start = 0;
-   rmesa->dma.current.ptr = 0;
-}
-
-void r200ReleaseDmaRegion( r200ContextPtr rmesa,
-			     struct r200_dma_region *region,
-			     const char *caller )
-{
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-   
-   if (!region->buf)
-      return;
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (--region->buf->refcount == 0) {
-      drm_radeon_cmd_header_t *cmd;
-
-      if (R200_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-	 fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
-		 region->buf->buf->idx);  
-      
-      cmd = (drm_radeon_cmd_header_t *)r200AllocCmdBuf( rmesa, sizeof(*cmd), 
-						     __FUNCTION__ );
-      cmd->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
-      cmd->dma.buf_idx = region->buf->buf->idx;
-      FREE(region->buf);
-      rmesa->dma.nr_released_bufs++;
-   }
-
-   region->buf = NULL;
-   region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r200AllocDmaRegion( r200ContextPtr rmesa, 
-			   struct r200_dma_region *region,
-			   int bytes,
-			   int alignment )
-{
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (region->buf)
-      r200ReleaseDmaRegion( rmesa, region, __FUNCTION__ );
-
-   alignment--;
-   rmesa->dma.current.start = rmesa->dma.current.ptr = 
-      (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      r200RefillCurrentDmaRegion( rmesa );
-
-   region->start = rmesa->dma.current.start;
-   region->ptr = rmesa->dma.current.start;
-   region->end = rmesa->dma.current.start + bytes;
-   region->address = rmesa->dma.current.address;
-   region->buf = rmesa->dma.current.buf;
-   region->buf->refcount++;
-
-   rmesa->dma.current.ptr += bytes; /* bug - if alignment > 7 */
-   rmesa->dma.current.start = 
-      rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;  
-
-   assert( rmesa->dma.current.ptr <= rmesa->dma.current.end );
-}
-
-/* ================================================================
- * SwapBuffers with client-side throttling
- */
-
-static uint32_t r200GetLastFrame(r200ContextPtr rmesa)
-{
-   drm_radeon_getparam_t gp;
-   int ret;
-   uint32_t frame;
-
-   gp.param = RADEON_PARAM_LAST_FRAME;
-   gp.value = (int *)&frame;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
-			      &gp, sizeof(gp) );
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-
-   return frame;
-}
-
-static void r200EmitIrqLocked( r200ContextPtr rmesa )
-{
-   drm_radeon_irq_emit_t ie;
-   int ret;
-
-   ie.irq_seq = &rmesa->iw.irq_seq;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT, 
-			      &ie, sizeof(ie) );
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void r200WaitIrq( r200ContextPtr rmesa )
-{
-   int ret;
-
-   do {
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT,
-			     &rmesa->iw, sizeof(rmesa->iw) );
-   } while (ret && (errno == EINTR || errno == EBUSY));
-
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void r200WaitForFrameCompletion( r200ContextPtr rmesa )
-{
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-
-   if (rmesa->do_irqs) {
-      if (r200GetLastFrame(rmesa) < sarea->last_frame) {
-	 if (!rmesa->irqsEmitted) {
-	    while (r200GetLastFrame (rmesa) < sarea->last_frame)
-	       ;
-	 }
-	 else {
-	    UNLOCK_HARDWARE( rmesa ); 
-	    r200WaitIrq( rmesa );	
-	    LOCK_HARDWARE( rmesa ); 
-	 }
-	 rmesa->irqsEmitted = 10;
-      }
-
-      if (rmesa->irqsEmitted) {
-	 r200EmitIrqLocked( rmesa );
-	 rmesa->irqsEmitted--;
-      }
-   } 
-   else {
-      while (r200GetLastFrame (rmesa) < sarea->last_frame) {
-	 UNLOCK_HARDWARE( rmesa ); 
-	 if (rmesa->do_usleeps) 
-	    DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa ); 
-      }
-   }
-}
-
-
-
-/* Copy the back color buffer to the front color buffer.
- */
-void r200CopyBuffer( __DRIdrawablePrivate *dPriv,
-		      const drm_clip_rect_t	 *rect)
-{
-   r200ContextPtr rmesa;
-   GLint nbox, i, ret;
-   GLboolean   missed_target;
-   int64_t ust;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-
-   if ( R200_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *)rmesa->glCtx );
-   }
-
-   R200_FIREVERTICES( rmesa );
-
-   LOCK_HARDWARE( rmesa );
-
-
-   /* Throttle the frame rate -- only allow one pending swap buffers
-    * request at a time.
-    */
-   r200WaitForFrameCompletion( rmesa );
-   if (!rect)
-   {
-       UNLOCK_HARDWARE( rmesa );
-       driWaitForVBlank( dPriv, & missed_target );
-       LOCK_HARDWARE( rmesa );
-   }
-
-   nbox = dPriv->numClipRects; /* must be in locked region */
-
-   for ( i = 0 ; i < nbox ; ) {
-      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      GLint n = 0;
-
-      for ( ; i < nr ; i++ ) {
-
-	  *b = box[i];
-
-	  if (rect)
-	  {
-	     if (rect->x1 > b->x1)
-		 b->x1 = rect->x1;
-	     if (rect->y1 > b->y1)
-		 b->y1 = rect->y1;
-	     if (rect->x2 < b->x2)
-		 b->x2 = rect->x2;
-	     if (rect->y2 < b->y2)
-		 b->y2 = rect->y2;
-
-	     if (b->x1 >= b->x2 || b->y1 >= b->y2)
-		 continue;
-	  }
-
-	  b++;
-	  n++;
-      }
-      rmesa->sarea->nbox = n;
-
-      if (!n)
-	 continue;
-
-      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
-
-      if ( ret ) {
-	 fprintf( stderr, "DRM_R200_SWAP_BUFFERS: return = %d\n", ret );
-	 UNLOCK_HARDWARE( rmesa );
-	 exit( 1 );
-      }
-   }
-
-   UNLOCK_HARDWARE( rmesa );
-   if (!rect)
-   {
-       rmesa->hw.all_dirty = GL_TRUE;
-
-       rmesa->swap_count++;
-       (*psp->systemTime->getUST)( & ust );
-       if ( missed_target ) {
-	   rmesa->swap_missed_count++;
-	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
-       }
-
-       rmesa->swap_ust = ust;
-
-       sched_yield();
-   }
-}
-
-void r200PageFlip( __DRIdrawablePrivate *dPriv )
-{
-   r200ContextPtr rmesa;
-   GLint ret;
-   GLboolean   missed_target;
-   __DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (r200ContextPtr) dPriv->driContextPriv->driverPrivate;
-
-   if ( R200_DEBUG & DEBUG_IOCTL ) {
-      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
-	      rmesa->sarea->pfCurrentPage);
-   }
-
-   R200_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   if (!dPriv->numClipRects) {
-      UNLOCK_HARDWARE( rmesa );
-      usleep( 10000 );		/* throttle invisible client 10ms */
-      return;
-   }
-
-   /* Need to do this for the perf box placement:
-    */
-   {
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      b[0] = box[0];
-      rmesa->sarea->nbox = 1;
-   }
-
-   /* Throttle the frame rate -- only allow a few pending swap buffers
-    * request at a time.
-    */
-   r200WaitForFrameCompletion( rmesa );
-   UNLOCK_HARDWARE( rmesa );
-   driWaitForVBlank( dPriv, & missed_target );
-   if ( missed_target ) {
-      rmesa->swap_missed_count++;
-      (void) (*psp->systemTime->getUST)( & rmesa->swap_missed_ust );
-   }
-   LOCK_HARDWARE( rmesa );
-
-   ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
-      exit( 1 );
-   }
-
-   rmesa->swap_count++;
-   (void) (*psp->systemTime->getUST)( & rmesa->swap_ust );
-
-#if 000
-   if ( rmesa->sarea->pfCurrentPage == 1 ) {
-	 rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
-	 rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
-   } else {
-	 rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
-	 rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
-   }
-
-   R200_STATECHANGE( rmesa, ctx );
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = rmesa->state.color.drawOffset
-					   + rmesa->r200Screen->fbLocation;
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH]  = rmesa->state.color.drawPitch;
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
-   }
-#else
-   /* Get ready for drawing next frame.  Update the renderbuffers'
-    * flippedOffset/Pitch fields so we draw into the right place.
-    */
-   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-                        rmesa->sarea->pfCurrentPage);
-
-
-   r200UpdateDrawBuffer(rmesa->glCtx);
-#endif
-}
-
-
-/* ================================================================
- * Buffer clear
- */
-static void r200Clear( GLcontext *ctx, GLbitfield mask )
+static void r200KernelClear(GLcontext *ctx, GLuint flags)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   GLuint flags = 0;
-   GLuint color_mask = 0;
-   GLint ret, i;
-   GLint cx, cy, cw, ch;
-
-   if ( R200_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "r200Clear\n");
-   }
-
-   {
-      LOCK_HARDWARE( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      if ( dPriv->numClipRects == 0 ) 
-	 return;
-   }
-
-   r200Flush( ctx );
-
-   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
-      flags |= RADEON_FRONT;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_FRONT_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_BACK_LEFT ) {
-      flags |= RADEON_BACK;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_BACK_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_DEPTH ) {
-      flags |= RADEON_DEPTH;
-      mask &= ~BUFFER_BIT_DEPTH;
-   }
-
-   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
-      flags |= RADEON_STENCIL;
-      mask &= ~BUFFER_BIT_STENCIL;
-   }
-
-   if ( mask ) {
-      if (R200_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
-      _swrast_Clear( ctx, mask );
-   }
-
-   if ( !flags ) 
-      return;
-
-   if (rmesa->using_hyperz) {
-      flags |= RADEON_USE_COMP_ZBUF;
-/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
-	 flags |= RADEON_USE_HIERZ; */
-      if (!(rmesa->state.stencil.hwBuffer) ||
-	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
-	    ((rmesa->state.stencil.clear & R200_STENCIL_WRITE_MASK) == R200_STENCIL_WRITE_MASK))) {
-	  flags |= RADEON_CLEAR_FASTZ;
-      }
-   }
-
-   LOCK_HARDWARE( rmesa );
-
-   /* compute region after locking: */
-   cx = ctx->DrawBuffer->_Xmin;
-   cy = ctx->DrawBuffer->_Ymin;
-   cw = ctx->DrawBuffer->_Xmax - cx;
-   ch = ctx->DrawBuffer->_Ymax - cy;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLint cx, cy, cw, ch, ret;
+   GLuint i;
 
-   /* Flip top to bottom */
-   cx += dPriv->x;
-   cy  = dPriv->y + dPriv->h - cy - ch;
+   LOCK_HARDWARE( &rmesa->radeon );
 
    /* Throttle the number of clear ioctls we do.
     */
@@ -693,7 +76,7 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 
       gp.param = RADEON_PARAM_LAST_CLEAR;
       gp.value = (int *)&clear;
-      ret = drmCommandWriteRead( rmesa->dri.fd,
+      ret = drmCommandWriteRead( rmesa->radeon.dri.fd,
 		      DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
 
       if ( ret ) {
@@ -703,24 +86,34 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 
       /* Clear throttling needs more thought.
        */
-      if ( rmesa->sarea->last_clear - clear <= 25 ) {
+      if ( rmesa->radeon.sarea->last_clear - clear <= 25 ) {
 	 break;
       }
-      
-      if (rmesa->do_usleeps) {
-	 UNLOCK_HARDWARE( rmesa );
+
+      if (rmesa->radeon.do_usleeps) {
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa );
+	 LOCK_HARDWARE( &rmesa->radeon );
       }
    }
 
    /* Send current state to the hardware */
-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+
 
+  /* compute region after locking: */
+   cx = ctx->DrawBuffer->_Xmin;
+   cy = ctx->DrawBuffer->_Ymin;
+   cw = ctx->DrawBuffer->_Xmax - cx;
+   ch = ctx->DrawBuffer->_Ymax - cy;
+
+   /* Flip top to bottom */
+   cx += dPriv->x;
+   cy  = dPriv->y + dPriv->h - cy - ch;
    for ( i = 0 ; i < dPriv->numClipRects ; ) {
       GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
       drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      drm_clip_rect_t *b = rmesa->radeon.sarea->boxes;
       drm_radeon_clear_t clear;
       drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
       GLint n = 0;
@@ -755,17 +148,17 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 	 }
       }
 
-      rmesa->sarea->nbox = n;
+      rmesa->radeon.sarea->nbox = n;
 
       clear.flags       = flags;
-      clear.clear_color = rmesa->state.color.clear;
-      clear.clear_depth = rmesa->state.depth.clear;	/* needed for hyperz */
+      clear.clear_color = rmesa->radeon.state.color.clear;
+      clear.clear_depth = rmesa->radeon.state.depth.clear;	/* needed for hyperz */
       clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      clear.depth_mask  = rmesa->state.stencil.clear;
+      clear.depth_mask  = rmesa->radeon.state.stencil.clear;
       clear.depth_boxes = depth_boxes;
 
       n--;
-      b = rmesa->sarea->boxes;
+      b = rmesa->radeon.sarea->boxes;
       for ( ; n >= 0 ; n-- ) {
 	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
 	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
@@ -774,84 +167,94 @@ static void r200Clear( GLcontext *ctx, GLbitfield mask )
 	 depth_boxes[n].f[CLEAR_DEPTH] = ctx->Depth.Clear;
       }
 
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
+      ret = drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_CLEAR,
 			     &clear, sizeof(clear));
 
 
       if ( ret ) {
-	 UNLOCK_HARDWARE( rmesa );
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
 	 exit( 1 );
       }
    }
-
-   UNLOCK_HARDWARE( rmesa );
-   rmesa->hw.all_dirty = GL_TRUE;
+   UNLOCK_HARDWARE( &rmesa->radeon );
 }
+/* ================================================================
+ * Buffer clear
+ */
+static void r200Clear( GLcontext *ctx, GLbitfield mask )
+{
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLuint flags = 0;
+   GLuint color_mask = 0;
+   GLuint orig_mask = mask;
 
+   if ( R200_DEBUG & DEBUG_IOCTL ) {
+	   if (rmesa->radeon.sarea)
+	       fprintf( stderr, "r200Clear %x %d\n", mask, rmesa->radeon.sarea->pfCurrentPage);
+	   else
+	       fprintf( stderr, "r200Clear %x radeon->sarea is NULL\n", mask);
+   }
 
-void r200WaitForIdleLocked( r200ContextPtr rmesa )
-{
-    int ret;
-    int i = 0;
-    
-    do {
-       ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_CP_IDLE);
-       if (ret) 
-	  DO_USLEEP( 1 );
-    } while (ret && ++i < 100);
-    
-    if ( ret < 0 ) {
-       UNLOCK_HARDWARE( rmesa );
-       fprintf( stderr, "Error: R200 timed out... exiting\n" );
-       exit( -1 );
-    }
-}
+   {
+      LOCK_HARDWARE( &rmesa->radeon );
+      UNLOCK_HARDWARE( &rmesa->radeon );
+      if ( dPriv->numClipRects == 0 )
+	 return;
+   }
 
+   radeonFlush( ctx );
 
-static void r200WaitForIdle( r200ContextPtr rmesa )
-{
-   LOCK_HARDWARE(rmesa);
-   r200WaitForIdleLocked( rmesa );
-   UNLOCK_HARDWARE(rmesa);
-}
+   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
+      flags |= RADEON_FRONT;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_FRONT_LEFT;
+   }
 
+   if ( mask & BUFFER_BIT_BACK_LEFT ) {
+      flags |= RADEON_BACK;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_BACK_LEFT;
+   }
 
-void r200Flush( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
+   if ( mask & BUFFER_BIT_DEPTH ) {
+      flags |= RADEON_DEPTH;
+      mask &= ~BUFFER_BIT_DEPTH;
+   }
 
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   if ( (mask & BUFFER_BIT_STENCIL) ) {
+      flags |= RADEON_STENCIL;
+      mask &= ~BUFFER_BIT_STENCIL;
+   }
 
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
+   if ( mask ) {
+      if (R200_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
+      _swrast_Clear( ctx, mask );
+   }
 
-   r200EmitState( rmesa );
-   
-   if (rmesa->store.cmd_used)
-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
-}
+   if ( !flags )
+      return;
 
-/* Make sure all commands have been sent to the hardware and have
- * completed processing.
- */
-void r200Finish( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   r200Flush( ctx );
+   if (rmesa->using_hyperz) {
+      flags |= RADEON_USE_COMP_ZBUF;
+/*      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200)
+	 flags |= RADEON_USE_HIERZ; */
+      if (!((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+	    ((rmesa->radeon.state.stencil.clear & R200_STENCIL_WRITE_MASK) == R200_STENCIL_WRITE_MASK))) {
+	  flags |= RADEON_CLEAR_FASTZ;
+      }
+   }
 
-   if (rmesa->do_irqs) {
-      LOCK_HARDWARE( rmesa );
-      r200EmitIrqLocked( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      r200WaitIrq( rmesa );
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+      radeonUserClear(ctx, orig_mask);
+   else {
+      r200KernelClear(ctx, flags);
+      rmesa->radeon.hw.all_dirty = GL_TRUE;
    }
-   else 
-      r200WaitForIdle( rmesa );
 }
 
-
 /* This version of AllocateMemoryMESA allocates only GART memory, and
  * only does so after the point at which the driver has been
  * initialized.
@@ -862,7 +265,7 @@ void r200Finish( GLcontext *ctx )
  * device fd.
  */
 void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
-			     GLfloat readfreq, GLfloat writefreq, 
+			     GLfloat readfreq, GLfloat writefreq,
 			     GLfloat priority)
 {
    GET_CURRENT_CONTEXT(ctx);
@@ -872,10 +275,10 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
    int ret;
 
    if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq, 
+      fprintf(stderr, "%s sz %d %f/%f/%f\n", __FUNCTION__, size, readfreq,
 	      writefreq, priority);
 
-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map)
+   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map)
       return NULL;
 
    if (getenv("R200_NO_ALLOC"))
@@ -886,17 +289,17 @@ void *r200AllocateMemoryMESA(__DRIscreen *screen, GLsizei size,
    alloc.size = size;
    alloc.region_offset = &region_offset;
 
-   ret = drmCommandWriteRead( rmesa->r200Screen->driScreen->fd,
+   ret = drmCommandWriteRead( rmesa->radeon.radeonScreen->driScreen->fd,
 			      DRM_RADEON_ALLOC,
 			      &alloc, sizeof(alloc));
-   
+
    if (ret) {
       fprintf(stderr, "%s: DRM_RADEON_ALLOC ret %d\n", __FUNCTION__, ret);
       return NULL;
    }
-   
+
    {
-      char *region_start = (char *)rmesa->r200Screen->gartTextures.map;
+      char *region_start = (char *)rmesa->radeon.radeonScreen->gartTextures.map;
       return (void *)(region_start + region_offset);
    }
 }
@@ -914,28 +317,28 @@ void r200FreeMemoryMESA(__DRIscreen *screen, GLvoid *pointer)
    if (R200_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "%s %p\n", __FUNCTION__, pointer);
 
-   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->r200Screen->gartTextures.map) {
+   if (!ctx || !(rmesa = R200_CONTEXT(ctx)) || !rmesa->radeon.radeonScreen->gartTextures.map) {
       fprintf(stderr, "%s: no context\n", __FUNCTION__);
       return;
    }
 
-   region_offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
+   region_offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
 
-   if (region_offset < 0 || 
-       region_offset > rmesa->r200Screen->gartTextures.size) {
+   if (region_offset < 0 ||
+       region_offset > rmesa->radeon.radeonScreen->gartTextures.size) {
       fprintf(stderr, "offset %d outside range 0..%d\n", region_offset,
-	      rmesa->r200Screen->gartTextures.size);
+	      rmesa->radeon.radeonScreen->gartTextures.size);
       return;
    }
 
    memfree.region = RADEON_MEM_REGION_GART;
    memfree.region_offset = region_offset;
-   
-   ret = drmCommandWrite( rmesa->r200Screen->driScreen->fd,
+
+   ret = drmCommandWrite( rmesa->radeon.radeonScreen->driScreen->fd,
 			  DRM_RADEON_FREE,
 			  &memfree, sizeof(memfree));
-   
-   if (ret) 
+
+   if (ret)
       fprintf(stderr, "%s: DRM_RADEON_FREE ret %d\n", __FUNCTION__, ret);
 }
 
@@ -956,32 +359,32 @@ GLuint r200GetMemoryOffsetMESA(__DRIscreen *screen, const GLvoid *pointer)
 
    card_offset = r200GartOffsetFromVirtual( rmesa, pointer );
 
-   return card_offset - rmesa->r200Screen->gart_base;
+   return card_offset - rmesa->radeon.radeonScreen->gart_base;
 }
 
 GLboolean r200IsGartMemory( r200ContextPtr rmesa, const GLvoid *pointer,
 			   GLint size )
 {
-   ptrdiff_t offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
+   ptrdiff_t offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
    int valid = (size >= 0 &&
 		offset >= 0 &&
-		offset + size < rmesa->r200Screen->gartTextures.size);
+		offset + size < rmesa->radeon.radeonScreen->gartTextures.size);
 
    if (R200_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "r200IsGartMemory( %p ) : %d\n", pointer, valid );
-   
+
    return valid;
 }
 
 
 GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, const GLvoid *pointer )
 {
-   ptrdiff_t offset = (char *)pointer - (char *)rmesa->r200Screen->gartTextures.map;
+   ptrdiff_t offset = (char *)pointer - (char *)rmesa->radeon.radeonScreen->gartTextures.map;
 
-   if (offset < 0 || offset > rmesa->r200Screen->gartTextures.size)
+   if (offset < 0 || offset > rmesa->radeon.radeonScreen->gartTextures.size)
       return ~0;
    else
-      return rmesa->r200Screen->gart_texture_offset + offset;
+      return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
 }
 
 
@@ -989,7 +392,7 @@ GLuint r200GartOffsetFromVirtual( r200ContextPtr rmesa, const GLvoid *pointer )
 void r200InitIoctlFuncs( struct dd_function_table *functions )
 {
     functions->Clear = r200Clear;
-    functions->Finish = r200Finish;
-    functions->Flush = r200Flush;
+    functions->Finish = radeonFinish;
+    functions->Flush = radeonFlush;
 }
 
diff --git a/src/mesa/drivers/dri/r200/r200_ioctl.h b/src/mesa/drivers/dri/r200/r200_ioctl.h
index f7458e4a0ed..2a4b8a11f4c 100644
--- a/src/mesa/drivers/dri/r200/r200_ioctl.h
+++ b/src/mesa/drivers/dri/r200/r200_ioctl.h
@@ -37,65 +37,30 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/simple_list.h"
 #include "radeon_dri.h"
-#include "r200_lock.h"
+
+#include "radeon_bocs_wrapper.h"
 
 #include "xf86drm.h"
 #include "drm.h"
 #include "radeon_drm.h"
 
-extern void r200EmitState( r200ContextPtr rmesa );
 extern void r200EmitVertexAOS( r200ContextPtr rmesa,
-				 GLuint vertex_size,
-				 GLuint offset );
+			       GLuint vertex_size,
+			       struct radeon_bo *bo,
+			       GLuint offset );
 
 extern void r200EmitVbufPrim( r200ContextPtr rmesa,
 				GLuint primitive,
 				GLuint vertex_nr );
 
-extern void r200FlushElts( r200ContextPtr rmesa );
+extern void r200FlushElts(GLcontext *ctx);
 
 extern GLushort *r200AllocEltsOpenEnded( r200ContextPtr rmesa,
 					   GLuint primitive,
 					   GLuint min_nr );
 
-extern void r200EmitAOS( r200ContextPtr rmesa,
-			   struct r200_dma_region **regions,
-			   GLuint n,
-			   GLuint offset );
-
-extern void r200EmitBlit( r200ContextPtr rmesa,
-			  GLuint color_fmt,
-			  GLuint src_pitch,
-			  GLuint src_offset,
-			  GLuint dst_pitch,
-			  GLuint dst_offset,
-			  GLint srcx, GLint srcy,
-			  GLint dstx, GLint dsty,
-			  GLuint w, GLuint h );
-
-extern void r200EmitWait( r200ContextPtr rmesa, GLuint flags );
-
-extern void r200FlushCmdBuf( r200ContextPtr rmesa, const char * );
-extern int r200FlushCmdBufLocked( r200ContextPtr rmesa, const char * caller );
-
-extern void r200RefillCurrentDmaRegion( r200ContextPtr rmesa );
-
-extern void r200AllocDmaRegion( r200ContextPtr rmesa,
-				  struct r200_dma_region *region,
-				  int bytes, 
-				  int alignment );
-
-extern void r200ReleaseDmaRegion( r200ContextPtr rmesa,
-				    struct r200_dma_region *region,
-				    const char *caller );
-
-extern void r200CopyBuffer( __DRIdrawablePrivate *drawable,
-			    const drm_clip_rect_t      *rect);
-extern void r200PageFlip( __DRIdrawablePrivate *drawable );
-extern void r200Flush( GLcontext *ctx );
-extern void r200Finish( GLcontext *ctx );
-extern void r200WaitForIdleLocked( r200ContextPtr rmesa );
-extern void r200WaitForVBlank( r200ContextPtr rmesa );
+extern void r200EmitAOS(r200ContextPtr rmesa, GLuint nr, GLuint offset);
+
 extern void r200InitIoctlFuncs( struct dd_function_table *functions );
 
 extern void *r200AllocateMemoryMESA( __DRIscreen *screen, GLsizei size, GLfloat readfreq,
@@ -119,8 +84,8 @@ void r200SetUpAtomList( r200ContextPtr rmesa );
  */
 #define R200_NEWPRIM( rmesa )			\
 do {						\
-   if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa );	\
+   if ( rmesa->radeon.dma.flush )			\
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
 } while (0)
 
 /* Can accomodate several state changes and primitive changes without
@@ -130,7 +95,7 @@ do {						\
 do {								\
    R200_NEWPRIM( rmesa );					\
    rmesa->hw.ATOM.dirty = GL_TRUE;				\
-   rmesa->hw.is_dirty = GL_TRUE;				\
+   rmesa->radeon.hw.is_dirty = GL_TRUE;				\
 } while (0)
 
 #define R200_DB_STATE( ATOM )			        \
@@ -139,13 +104,13 @@ do {								\
 
 static INLINE int R200_DB_STATECHANGE( 
    r200ContextPtr rmesa,
-   struct r200_state_atom *atom )
+   struct radeon_state_atom *atom )
 {
    if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4)) {
-      int *tmp;
+      GLuint *tmp;
       R200_NEWPRIM( rmesa );
       atom->dirty = GL_TRUE;
-      rmesa->hw.is_dirty = GL_TRUE;
+      rmesa->radeon.hw.is_dirty = GL_TRUE;
       tmp = atom->cmd; 
       atom->cmd = atom->lastcmd;
       atom->lastcmd = tmp;
@@ -156,15 +121,6 @@ static INLINE int R200_DB_STATECHANGE(
 }
 
 
-/* Fire the buffered vertices no matter what.
- */
-#define R200_FIREVERTICES( rmesa )			\
-do {							\
-   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
-      r200Flush( rmesa->glCtx );			\
-   }							\
-} while (0)
-
 /* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
  * are available, you will also be adding an rmesa->state.max_state_size because
  * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
@@ -174,36 +130,36 @@ do {							\
 #define ELTS_BUFSZ(nr)	(12 + nr * 2)
 #define VBUF_BUFSZ	(3 * sizeof(int))
 
-/* Ensure that a minimum amount of space is available in the command buffer.
- * This is used to ensure atomicity of state updates with the rendering requests
- * that rely on them.
- *
- * An alternative would be to implement a "soft lock" such that when the buffer
- * wraps at an inopportune time, we grab the lock, flush the current buffer,
- * and hang on to the lock until the critical section is finished and we flush
- * the buffer again and unlock.
- */
-static INLINE void r200EnsureCmdBufSpace( r200ContextPtr rmesa, int bytes )
+static inline uint32_t cmdpacket3(int cmd_type)
 {
-   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
-      r200FlushCmdBuf( rmesa, __FUNCTION__ );
-   assert( bytes <= R200_CMD_BUF_SZ );
-}
+  drm_radeon_cmd_header_t cmd;
 
-/* Alloc space in the command buffer
- */
-static INLINE char *r200AllocCmdBuf( r200ContextPtr rmesa,
-					 int bytes, const char *where )
-{
-   char * head;
+  cmd.i = 0;
+  cmd.header.cmd_type = cmd_type;
 
-   if (rmesa->store.cmd_used + bytes > R200_CMD_BUF_SZ)
-      r200FlushCmdBuf( rmesa, where );
+  return (uint32_t)cmd.i;
 
-   head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-   rmesa->store.cmd_used += bytes;
-   assert( rmesa->store.cmd_used <= R200_CMD_BUF_SZ );
-   return head;
 }
 
+#define OUT_BATCH_PACKET3(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3));				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+#define OUT_BATCH_PACKET3_CLIP(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3_CLIP));	      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+
 #endif /* __R200_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_lock.c b/src/mesa/drivers/dri/r200/r200_lock.c
deleted file mode 100644
index 99661a4bfb4..00000000000
--- a/src/mesa/drivers/dri/r200/r200_lock.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <[email protected]>
- */
- 
-#include "r200_context.h"
-#include "r200_lock.h"
-#include "r200_tex.h"
-#include "r200_state.h"
-#include "r200_ioctl.h"
-
-#include "drirenderbuffer.h"
-
-
-#if DEBUG_LOCKING
-char *prevLockFile = NULL;
-int prevLockLine = 0;
-#endif
-
-/* Turn on/off page flipping according to the flags in the sarea:
- */
-static void
-r200UpdatePageFlipping( r200ContextPtr rmesa )
-{
-   rmesa->doPageFlip = rmesa->sarea->pfState;
-   if (rmesa->glCtx->WinSysDrawBuffer) {
-      driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-                           rmesa->sarea->pfCurrentPage);
-   }
-}
-
-
-
-/* Update the hardware state.  This is called if another main/context.has
- * grabbed the hardware lock, which includes the X server.  This
- * function also updates the driver's window state after the X server
- * moves, resizes or restacks a window -- the change will be reflected
- * in the drawable position and clip rects.  Since the X server grabs
- * the hardware lock when it changes the window state, this routine will
- * automatically be called after such a change.
- */
-void r200GetLock( r200ContextPtr rmesa, GLuint flags )
-{
-   __DRIdrawablePrivate *drawable = rmesa->dri.drawable;
-   __DRIdrawablePrivate *readable = rmesa->dri.readable;
-   __DRIscreenPrivate *sPriv = rmesa->dri.screen;
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-   int i;
-
-   drmGetLock( rmesa->dri.fd, rmesa->dri.hwContext, flags );
-
-   /* The window might have moved, so we might need to get new clip
-    * rects.
-    *
-    * NOTE: This releases and regrabs the hw lock to allow the X server
-    * to respond to the DRI protocol request for new drawable info.
-    * Since the hardware state depends on having the latest drawable
-    * clip rects, all state checking must be done _after_ this call.
-    */
-   DRI_VALIDATE_DRAWABLE_INFO( sPriv, drawable );
-   if (drawable != readable) {
-      DRI_VALIDATE_DRAWABLE_INFO( sPriv, readable );
-   }
-
-   if ( rmesa->lastStamp != drawable->lastStamp ) {
-      r200UpdatePageFlipping( rmesa );
-      r200SetCliprects( rmesa );
-      r200UpdateViewportOffset( rmesa->glCtx );
-      driUpdateFramebufferSize(rmesa->glCtx, drawable);
-   }
-
-   R200_STATECHANGE( rmesa, ctx );
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
-   }
-   else rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &= ~R200_COLOR_TILE_ENABLE;
-
-   if ( sarea->ctx_owner != rmesa->dri.hwContext ) {
-      sarea->ctx_owner = rmesa->dri.hwContext;
-   }
-
-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-      DRI_AGE_TEXTURES( rmesa->texture_heaps[ i ] );
-   }
-
-   rmesa->lost_context = GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/r200/r200_lock.h b/src/mesa/drivers/dri/r200/r200_lock.h
deleted file mode 100644
index 4ff98907fbf..00000000000
--- a/src/mesa/drivers/dri/r200/r200_lock.h
+++ /dev/null
@@ -1,106 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <[email protected]>
- */
-
-#ifndef __R200_LOCK_H__
-#define __R200_LOCK_H__
-
-extern void r200GetLock( r200ContextPtr rmesa, GLuint flags );
-
-/* Turn DEBUG_LOCKING on to find locking conflicts.
- */
-#define DEBUG_LOCKING	0
-
-#if DEBUG_LOCKING
-extern char *prevLockFile;
-extern int prevLockLine;
-
-#define DEBUG_LOCK()							\
-   do {									\
-      prevLockFile = (__FILE__);					\
-      prevLockLine = (__LINE__);					\
-   } while (0)
-
-#define DEBUG_RESET()							\
-   do {									\
-      prevLockFile = 0;							\
-      prevLockLine = 0;							\
-   } while (0)
-
-#define DEBUG_CHECK_LOCK()						\
-   do {									\
-      if ( prevLockFile ) {						\
-	 fprintf( stderr,						\
-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
-		  prevLockFile, prevLockLine, __FILE__, __LINE__ );	\
-	 exit( 1 );							\
-      }									\
-   } while (0)
-
-#else
-
-#define DEBUG_LOCK()
-#define DEBUG_RESET()
-#define DEBUG_CHECK_LOCK()
-
-#endif
-
-/*
- * !!! We may want to separate locks from locks with validation.  This
- * could be used to improve performance for those things commands that
- * do not do any drawing !!!
- */
-
-
-/* Lock the hardware and validate our state.
- */
-#define LOCK_HARDWARE( rmesa )					\
-   do {								\
-      char __ret = 0;						\
-      DEBUG_CHECK_LOCK();					\
-      DRM_CAS( rmesa->dri.hwLock, rmesa->dri.hwContext,		\
-	       (DRM_LOCK_HELD | rmesa->dri.hwContext), __ret );	\
-      if ( __ret )						\
-	 r200GetLock( rmesa, 0 );				\
-      DEBUG_LOCK();						\
-   } while (0)
-
-#define UNLOCK_HARDWARE( rmesa )					\
-   do {									\
-      DRM_UNLOCK( rmesa->dri.fd,					\
-		  rmesa->dri.hwLock,					\
-		  rmesa->dri.hwContext );				\
-      DEBUG_RESET();							\
-   } while (0)
-
-#endif /* __R200_LOCK_H__ */
diff --git a/src/mesa/drivers/dri/r200/r200_maos.h b/src/mesa/drivers/dri/r200/r200_maos.h
index d3ed06d4021..16a70475e18 100644
--- a/src/mesa/drivers/dri/r200/r200_maos.h
+++ b/src/mesa/drivers/dri/r200/r200_maos.h
@@ -38,6 +38,5 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 
 extern void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev );
-extern void r200ReleaseArrays( GLcontext *ctx, GLuint newinputs );
 
 #endif
diff --git a/src/mesa/drivers/dri/r200/r200_maos_arrays.c b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
index 8512b9af478..383a0c4b0d3 100644
--- a/src/mesa/drivers/dri/r200/r200_maos_arrays.c
+++ b/src/mesa/drivers/dri/r200/r200_maos_arrays.c
@@ -50,110 +50,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_maos.h"
 #include "r200_tcl.h"
 
-
-#if 0
-/* Usage:
- *   - from r200_tcl_render
- *   - call r200EmitArrays to ensure uptodate arrays in dma
- *   - emit primitives (new type?) which reference the data
- *       -- need to use elts for lineloop, quads, quadstrip/flat
- *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
- *
- */
-static void emit_ubyte_rgba3( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   r200_color_t *out = (r200_color_t *)(rvb->start + rvb->address);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p\n",
-	      __FUNCTION__, count, stride, (void *)out);
-
-   for (i = 0; i < count; i++) {
-      out->red   = *data;
-      out->green = *(data+1);
-      out->blue  = *(data+2);
-      out->alpha = 0xFF;
-      out++;
-      data += stride;
-   }
-}
-
-static void emit_ubyte_rgba4( GLcontext *ctx,
-			      struct r200_dma_region *rvb,
-			      char *data,
-			      int stride,
-			      int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 4) {
-      for (i = 0; i < count; i++)
-	 ((int *)out)[i] = LE32_TO_CPU(((int *)data)[i]);
-   } else {
-      for (i = 0; i < count; i++) {
-	 *(int *)out++ = LE32_TO_CPU(*(int *)data);
-	 data += stride;
-      }
-   }
-}
-
-
-static void emit_ubyte_rgba( GLcontext *ctx,
-			     struct r200_dma_region *rvb,
-			     char *data,
-			     int size,
-			     int stride,
-			     int count )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      r200AllocDmaRegion( rmesa, rvb, 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
-   }
-   else {
-      r200AllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 3:
-      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-}
-#endif
-
-
 #if defined(USE_X86_ASM)
 #define COPY_DWORDS( dst, src, nr )					\
 do {									\
@@ -174,204 +70,34 @@ do {						\
 } while (0)
 #endif
 
-
-static void emit_vecfog( GLcontext *ctx,
-			 struct r200_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
+static void r200_emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
+			     GLvoid *data, int stride, int count)
 {
-   int i;
-   GLfloat *out;
-
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      r200AllocDmaRegion( rmesa, rvb, 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
-   }
-   else {
-      r200AllocDmaRegion( rmesa, rvb, count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
-
-   /* Emit the data
-    */
-   out = (GLfloat *)(rvb->address + rvb->start);
-   for (i = 0; i < count; i++) {
-      out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
-      out++;
-      data += stride;
-   }
-
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	uint32_t *out;
+	int i;
+	int size = 1;
+
+	if (stride == 0) {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
+		count = 1;
+		aos->stride = 0;
+	} else {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
+		aos->stride = size;
+	}
+
+	aos->components = size;
+	aos->count = count;
+
+	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
+	for (i = 0; i < count; i++) {
+	  out[0] = r200ComputeFogBlendFactor( ctx, *(GLfloat *)data );
+	  out++;
+	  data += stride;
+	}
 }
 
-
-static void emit_vec4( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 4)
-      COPY_DWORDS( out, data, count );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out++;
-	 data += stride;
-      }
-}
-
-
-static void emit_vec8( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 8)
-      COPY_DWORDS( out, data, count*2 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out += 2;
-	 data += stride;
-      }
-}
-
-static void emit_vec12( GLcontext *ctx,
-		       struct r200_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-	      __FUNCTION__, count, stride, (void *)out, (void *)data);
-
-   if (stride == 12)
-      COPY_DWORDS( out, data, count*3 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out += 3;
-	 data += stride;
-      }
-}
-
-static void emit_vec16( GLcontext *ctx,
-			struct r200_dma_region *rvb,
-			char *data,
-			int stride,
-			int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 16)
-      COPY_DWORDS( out, data, count*4 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out[3] = *(int *)(data+12);
-	 out += 4;
-	 data += stride;
-      }
-}
-
-
-static void emit_vector( GLcontext *ctx,
-			 struct r200_dma_region *rvb,
-			 char *data,
-			 int size,
-			 int stride,
-			 int count )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (R200_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d size %d stride %d\n",
-	      __FUNCTION__, count, size, stride);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      r200AllocDmaRegion( rmesa, rvb, size * 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = size;
-   }
-   else {
-      r200AllocDmaRegion( rmesa, rvb, size * count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = size;
-      rvb->aos_size = size;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 1:
-      emit_vec4( ctx, rvb, data, stride, count );
-      break;
-   case 2:
-      emit_vec8( ctx, rvb, data, stride, count );
-      break;
-   case 3:
-      emit_vec12( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_vec16( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-
-}
-
-
-
 /* Emit any changed arrays to new GART memory, re-emit a packet to
  * update the arrays.  
  */
@@ -379,12 +105,12 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
 {
    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
-   struct r200_dma_region **component = rmesa->tcl.aos_components;
    GLuint nr = 0;
    GLuint vfmt0 = 0, vfmt1 = 0;
    GLuint count = VB->Count;
    GLuint i, emitsize;
 
+   //   fprintf(stderr,"emit arrays\n");
    for ( i = 0; i < 15; i++ ) {
       GLubyte attrib = vimap_rev[i];
       if (attrib != 255) {
@@ -416,20 +142,20 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
 	 case 3:
 	    /* special handling to fix up fog. Will get us into trouble with vbos...*/
 	    assert(attrib == VERT_ATTRIB_FOG);
-	    if (!rmesa->tcl.vertex_data[i].buf) {
+	    if (!rmesa->radeon.tcl.aos[i].bo) {
 	       if (ctx->VertexProgram._Enabled)
-		  emit_vector( ctx,
-			 &(rmesa->tcl.vertex_data[i]),
-			 (char *)VB->AttribPtr[attrib]->data,
-			 1,
-			 VB->AttribPtr[attrib]->stride,
-			 count);
+		  rcommon_emit_vector( ctx,
+				       &(rmesa->radeon.tcl.aos[nr]),
+				       (char *)VB->AttribPtr[attrib]->data,
+				       1,
+				       VB->AttribPtr[attrib]->stride,
+				       count);
 	       else
-		  emit_vecfog( ctx,
-			 &(rmesa->tcl.vertex_data[i]),
-			 (char *)VB->AttribPtr[attrib]->data,
-			 VB->AttribPtr[attrib]->stride,
-			 count);
+		 r200_emit_vecfog( ctx,
+				   &(rmesa->radeon.tcl.aos[nr]),
+				   (char *)VB->AttribPtr[attrib]->data,
+				   VB->AttribPtr[attrib]->stride,
+				   count);
 	    }
 	    vfmt0 |= R200_VTX_DISCRETE_FOG;
 	    goto after_emit;
@@ -473,17 +199,17 @@ void r200EmitArrays( GLcontext *ctx, GLubyte *vimap_rev )
 	 default:
 	    assert(0);
 	 }
-	 if (!rmesa->tcl.vertex_data[i].buf) {
-	    emit_vector( ctx,
-			 &(rmesa->tcl.vertex_data[i]),
-			 (char *)VB->AttribPtr[attrib]->data,
-			 emitsize,
-			 VB->AttribPtr[attrib]->stride,
-			 count );
+	 if (!rmesa->radeon.tcl.aos[nr].bo) {
+	   rcommon_emit_vector( ctx,
+				&(rmesa->radeon.tcl.aos[nr]),
+				(char *)VB->AttribPtr[attrib]->data,
+				emitsize,
+				VB->AttribPtr[attrib]->stride,
+				count );
 	 }
 after_emit:
 	 assert(nr < 12);
-	 component[nr++] = &rmesa->tcl.vertex_data[i];
+	 nr++;
       }
    }
 
@@ -494,19 +220,6 @@ after_emit:
       rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = vfmt1;
    }
 
-   rmesa->tcl.nr_aos_components = nr;
+   rmesa->radeon.tcl.aos_count = nr;
 }
 
-
-void r200ReleaseArrays( GLcontext *ctx, GLuint newinputs )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-
-   /* only do it for changed inputs ? */
-   int i;
-   for (i = 0; i < 15; i++) {
-      if (newinputs & (1 << i))
-	 r200ReleaseDmaRegion( rmesa,
-	    &rmesa->tcl.vertex_data[i], __FUNCTION__ );
-   }
-}
diff --git a/src/mesa/drivers/dri/r200/r200_pixel.c b/src/mesa/drivers/dri/r200/r200_pixel.c
index 2797cbb3dc0..654f2c6ae98 100644
--- a/src/mesa/drivers/dri/r200/r200_pixel.c
+++ b/src/mesa/drivers/dri/r200/r200_pixel.c
@@ -51,7 +51,7 @@ check_color( const GLcontext *ctx, GLenum type, GLenum format,
 	     const void *pixels, GLint sz, GLint pitch )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint cpp = rmesa->r200Screen->cpp;
+   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
 
    if (R200_DEBUG & DEBUG_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
@@ -65,8 +65,8 @@ check_color( const GLcontext *ctx, GLenum type, GLenum format,
       return GL_FALSE;
    }
 
-   if ( type == GL_UNSIGNED_INT_8_8_8_8_REV && 
-	cpp == 4 && 
+   if ( type == GL_UNSIGNED_INT_8_8_8_8_REV &&
+	cpp == 4 &&
 	format == GL_BGRA ) {
       if (R200_DEBUG & DEBUG_PIXEL)
 	 fprintf(stderr, "%s: passed 2\n", __FUNCTION__);
@@ -83,7 +83,7 @@ static GLboolean
 check_color_per_fragment_ops( const GLcontext *ctx )
 {
    int result;
-   result = (!(     ctx->Color.AlphaEnabled || 
+   result = (!(     ctx->Color.AlphaEnabled ||
 		    ctx->Depth.Test ||
 		    ctx->Fog.Enabled ||
 		    ctx->Scissor.Enabled ||
@@ -96,7 +96,7 @@ check_color_per_fragment_ops( const GLcontext *ctx )
 		    ctx->Texture._EnabledUnits
            ) &&
 	   ctx->Current.RasterPosValid);
-   
+
    return result;
 }
 
@@ -137,8 +137,8 @@ clip_pixelrect( const GLcontext *ctx,
    if (*height <= 0)
       return GL_FALSE;
 
-   *size = ((*y + *height - 1) * rmesa->r200Screen->frontPitch +
-	    (*x + *width - 1) * rmesa->r200Screen->cpp);
+   *size = ((*y + *height - 1) * rmesa->radeon.radeonScreen->frontPitch +
+	    (*x + *width - 1) * rmesa->radeon.radeonScreen->cpp);
 
    return GL_TRUE;
 }
@@ -153,19 +153,20 @@ r200TryReadPixels( GLcontext *ctx,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLint pitch = pack->RowLength ? pack->RowLength : width;
    GLint blit_format;
-   GLuint cpp = rmesa->r200Screen->cpp;
+   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
    GLint size = width * height * cpp;
 
+   return GL_FALSE;
+#if 0
    if (R200_DEBUG & DEBUG_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
    /* Only accelerate reading to GART buffers.
     */
-   if ( !r200IsGartMemory(rmesa, pixels, 
-			 pitch * height * rmesa->r200Screen->cpp ) ) {
+   if ( !r200IsGartMemory(rmesa, pixels,
+			 pitch * height * rmesa->radeon.radeonScreen->cpp ) ) {
       if (R200_DEBUG & DEBUG_PIXEL)
 	 fprintf(stderr, "%s: dest not GART\n", __FUNCTION__);
-      return GL_FALSE;
    }
 
    /* Need GL_PACK_INVERT_MESA to cope with upsidedown results from
@@ -180,7 +181,7 @@ r200TryReadPixels( GLcontext *ctx,
    if (!check_color(ctx, type, format, pack, pixels, size, pitch))
       return GL_FALSE;
 
-   switch ( rmesa->r200Screen->cpp ) {
+   switch ( rmesa->radeon.radeonScreen->cpp ) {
    case 4:
       blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
       break;
@@ -197,14 +198,14 @@ r200TryReadPixels( GLcontext *ctx,
     * a full command buffer expects to be called unlocked.  As a
     * workaround, immediately flush the buffer on aquiring the lock.
     */
-   LOCK_HARDWARE( rmesa );
+   LOCK_HARDWARE( &rmesa->radeon );
 
    if (rmesa->store.cmd_used)
-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
 
    if (!clip_pixelrect(ctx, ctx->ReadBuffer, &x, &y, &width, &height,
 		       &size)) {
-      UNLOCK_HARDWARE( rmesa );
+      UNLOCK_HARDWARE( &rmesa->radeon );
       if (R200_DEBUG & DEBUG_PIXEL)
 	 fprintf(stderr, "%s totally clipped -- nothing to do\n",
 		 __FUNCTION__);
@@ -212,18 +213,18 @@ r200TryReadPixels( GLcontext *ctx,
    }
 
    {
-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+      __DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
       driRenderbuffer *drb = (driRenderbuffer *) ctx->ReadBuffer->_ColorReadBuffer;
       int nbox = dPriv->numClipRects;
       int src_offset = drb->offset
-		     + rmesa->r200Screen->fbLocation;
+		     + rmesa->radeon.radeonScreen->fbLocation;
       int src_pitch = drb->pitch * drb->cpp;
       int dst_offset = r200GartOffsetFromVirtual( rmesa, pixels );
-      int dst_pitch = pitch * rmesa->r200Screen->cpp;
+      int dst_pitch = pitch * rmesa->radeon.radeonScreen->cpp;
       drm_clip_rect_t *box = dPriv->pClipRects;
       int i;
 
-      r200EmitWait( rmesa, RADEON_WAIT_3D ); 
+      r200EmitWait( rmesa, RADEON_WAIT_3D );
 
       y = dPriv->h - y - height;
       x += dPriv->x;
@@ -240,7 +241,7 @@ r200TryReadPixels( GLcontext *ctx,
 	 GLint by = box[i].y1;
 	 GLint bw = box[i].x2 - bx;
 	 GLint bh = box[i].y2 - by;
-	 
+
 	 if (bx < x) bw -= x - bx, bx = x;
 	 if (by < y) bh -= y - by, by = y;
 	 if (bx + bw > x + width) bw = x + width - bx;
@@ -257,12 +258,12 @@ r200TryReadPixels( GLcontext *ctx,
 		       bw, bh );
       }
 
-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
    }
-   UNLOCK_HARDWARE( rmesa );
-
-   r200Finish( ctx ); /* required by GL */
+   UNLOCK_HARDWARE( &rmesa->radeon );
 
+   radeonFinish( ctx ); /* required by GL */
+#endif
    return GL_TRUE;
 }
 
@@ -276,9 +277,9 @@ r200ReadPixels( GLcontext *ctx,
    if (R200_DEBUG & DEBUG_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-   if (!r200TryReadPixels( ctx, x, y, width, height, format, type, pack, 
+   if (!r200TryReadPixels( ctx, x, y, width, height, format, type, pack,
 			   pixels))
-      _swrast_ReadPixels( ctx, x, y, width, height, format, type, pack, 
+      _swrast_ReadPixels( ctx, x, y, width, height, format, type, pack,
 			  pixels);
 }
 
@@ -292,7 +293,7 @@ static void do_draw_pix( GLcontext *ctx,
 			 GLuint planemask)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    drm_clip_rect_t *box = dPriv->pClipRects;
    struct gl_renderbuffer *rb = ctx->ReadBuffer->_ColorDrawBuffers[0];
    driRenderbuffer *drb = (driRenderbuffer *) rb;
@@ -301,12 +302,12 @@ static void do_draw_pix( GLcontext *ctx,
    int blit_format;
    int size;
    int src_offset = r200GartOffsetFromVirtual( rmesa, pixels );
-   int src_pitch = pitch * rmesa->r200Screen->cpp;
+   int src_pitch = pitch * rmesa->radeon.radeonScreen->cpp;
 
    if (R200_DEBUG & DEBUG_PIXEL)
       fprintf(stderr, "%s\n", __FUNCTION__);
-
-   switch ( rmesa->r200Screen->cpp ) {
+#if 0
+   switch ( rmesa->radeon.radeonScreen->cpp ) {
    case 2:
       blit_format = R200_CP_COLOR_FORMAT_RGB565;
       break;
@@ -318,17 +319,17 @@ static void do_draw_pix( GLcontext *ctx,
    }
 
 
-   LOCK_HARDWARE( rmesa );
+   LOCK_HARDWARE( &rmesa->radeon );
 
    if (rmesa->store.cmd_used)
-      r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
+      rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
 
    y -= height;			/* cope with pixel zoom */
-   
+
    if (!clip_pixelrect(ctx, ctx->DrawBuffer,
 		       &x, &y, &width, &height,
 		       &size)) {
-      UNLOCK_HARDWARE( rmesa );
+      UNLOCK_HARDWARE( &rmesa->radeon );
       return;
    }
 
@@ -357,15 +358,16 @@ static void do_draw_pix( GLcontext *ctx,
 		    blit_format,
 		    src_pitch, src_offset,
 		    drb->pitch * drb->cpp,
-		    drb->offset + rmesa->r200Screen->fbLocation,
+		    drb->offset + rmesa->radeon.radeonScreen->fbLocation,
 		    bx - x, by - y,
 		    bx, by,
 		    bw, bh );
    }
 
-   r200FlushCmdBufLocked( rmesa, __FUNCTION__ );
-   r200WaitForIdleLocked( rmesa ); /* required by GL */
-   UNLOCK_HARDWARE( rmesa );
+   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
+   radeonWaitForIdleLocked( &rmesa->radeon ); /* required by GL */
+   UNLOCK_HARDWARE( &rmesa->radeon );
+#endif
 }
 
 
@@ -381,7 +383,7 @@ r200TryDrawPixels( GLcontext *ctx,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLint pitch = unpack->RowLength ? unpack->RowLength : width;
    GLuint planemask;
-   GLuint cpp = rmesa->r200Screen->cpp;
+   GLuint cpp = rmesa->radeon.radeonScreen->cpp;
    GLint size = height * pitch * cpp;
 
    if (R200_DEBUG & DEBUG_PIXEL)
@@ -395,7 +397,7 @@ r200TryDrawPixels( GLcontext *ctx,
    case GL_RGB:
    case GL_RGBA:
    case GL_BGRA:
-      planemask = r200PackColor(cpp,
+      planemask = radeonPackColor(cpp,
 				ctx->Color.ColorMask[RCOMP],
 				ctx->Color.ColorMask[GCOMP],
 				ctx->Color.ColorMask[BCOMP],
@@ -407,7 +409,7 @@ r200TryDrawPixels( GLcontext *ctx,
       if (planemask != ~0)
 	 return GL_FALSE;	/* fix me -- should be possible */
 
-      /* Can't do conversions on GART reads/draws. 
+      /* Can't do conversions on GART reads/draws.
        */
       if ( !r200IsGartMemory( rmesa, pixels, size ) ) {
 	 if (R200_DEBUG & DEBUG_PIXEL)
@@ -431,7 +433,7 @@ r200TryDrawPixels( GLcontext *ctx,
       return GL_FALSE;
    }
 
-   if ( r200IsGartMemory(rmesa, pixels, size) )
+   if (0)// r200IsGartMemory(rmesa, pixels, size) )
    {
       do_draw_pix( ctx, x, y, width, height, pitch, pixels, planemask );
       return GL_TRUE;
@@ -471,7 +473,7 @@ r200Bitmap( GLcontext *ctx, GLint px, GLint py,
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   if (rmesa->Fallback)
+   if (rmesa->radeon.Fallback)
       _swrast_Bitmap( ctx, px, py, width, height, unpack, bitmap );
    else
       r200PointsBitmap( ctx, px, py, width, height, unpack, bitmap );
@@ -482,9 +484,9 @@ r200Bitmap( GLcontext *ctx, GLint px, GLint py,
 void r200InitPixelFuncs( GLcontext *ctx )
 {
    if (!getenv("R200_NO_BLITS")) {
-      ctx->Driver.ReadPixels = r200ReadPixels;  
-      ctx->Driver.DrawPixels = r200DrawPixels; 
-      if (getenv("R200_HW_BITMAP")) 
+      ctx->Driver.ReadPixels = r200ReadPixels;
+      ctx->Driver.DrawPixels = r200DrawPixels;
+      if (getenv("R200_HW_BITMAP"))
 	 ctx->Driver.Bitmap = r200Bitmap;
    }
 }
diff --git a/src/mesa/drivers/dri/r200/r200_reg.h b/src/mesa/drivers/dri/r200/r200_reg.h
index 5ce287f7a5f..526a624b697 100644
--- a/src/mesa/drivers/dri/r200/r200_reg.h
+++ b/src/mesa/drivers/dri/r200/r200_reg.h
@@ -463,8 +463,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define     R200_VSC_UPDATE_USER_COLOR_1_ENABLE    0x00020000
 /* gap */
 #define R200_SE_TCL_VECTOR_INDX_REG                0x2200
+#       define RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT  16
+#       define RADEON_VEC_INDX_DWORD_COUNT_SHIFT     28
 #define R200_SE_TCL_VECTOR_DATA_REG                0x2204
 #define R200_SE_TCL_SCALAR_INDX_REG                0x2208
+#       define RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT  16
 #define R200_SE_TCL_SCALAR_DATA_REG                0x220c
 /* gap */
 #define R200_SE_TCL_MATRIX_SEL_0                   0x2230
@@ -949,6 +952,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define     R200_LOD_BIAS_MASK                        (0xfff80000)
 #define     R200_LOD_BIAS_SHIFT                       19
 #define R200_PP_TXSIZE_0                  0x2c0c /* NPOT only */
+#define R200_PP_TX_WIDTHMASK_SHIFT 0
+#define R200_PP_TX_HEIGHTMASK_SHIFT 16
+
 #define R200_PP_TXPITCH_0                 0x2c10 /* NPOT only */
 #define R200_PP_BORDER_COLOR_0            0x2c14
 #define R200_PP_CUBIC_FACES_0             0x2c18
diff --git a/src/mesa/drivers/dri/r200/r200_span.c b/src/mesa/drivers/dri/r200/r200_span.c
deleted file mode 100644
index 9783678028a..00000000000
--- a/src/mesa/drivers/dri/r200/r200_span.c
+++ /dev/null
@@ -1,307 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <[email protected]>
- */
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/colormac.h"
-#include "swrast/swrast.h"
-
-#include "r200_context.h"
-#include "r200_ioctl.h"
-#include "r200_state.h"
-#include "r200_span.h"
-#include "r200_tex.h"
-
-#define DBG 0
-
-/*
- * Note that all information needed to access pixels in a renderbuffer
- * should be obtained through the gl_renderbuffer parameter, not per-context
- * information.
- */
-#define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
-
-#define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
-
-#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
-
-#define Y_FLIP(Y) (bottom - (Y))
-
-#define HW_LOCK() 
-
-#define HW_UNLOCK()							
-
-
-
-/* ================================================================
- * Color buffer
- */
-
-/* 16 bit, RGB565 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    r200##x##_RGB565
-#define TAG2(x,y) r200##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
-#include "spantmp2.h"
-
-/* 32 bit, ARGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    r200##x##_ARGB8888
-#define TAG2(x,y) r200##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
-#include "spantmp2.h"
-
-
-/* ================================================================
- * Depth buffer
- */
-
-/* The Radeon family has depth tiling on all the time, so we have to convert
- * the x,y coordinates into the memory bus address (mba) in the same
- * manner as the engine.  In each case, the linear block address (ba)
- * is calculated, and then wired with x and y to produce the final
- * memory address.
- * The chip will do address translation on its own if the surface registers
- * are set up correctly. It is not quite enough to get it working with hyperz too...
- */
-
-/* extract bit 'b' of x, result is zero or one */
-#define BIT(x,b) ((x & (1<<b))>>b)
-
-static GLuint
-r200_mba_z32( driRenderbuffer *drb, GLint x, GLint y )
-{
-   GLuint pitch = drb->pitch;
-   if (drb->depthHasSurface) {
-      return 4 * (x + y * pitch);
-   }
-   else {
-      GLuint b = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 5) + ((x & 0x7FF) >> 5);
-      GLuint a = 
-         (BIT(x,0) << 2) |
-         (BIT(y,0) << 3) |
-         (BIT(x,1) << 4) |
-         (BIT(y,1) << 5) |
-         (BIT(x,3) << 6) |
-         (BIT(x,4) << 7) |
-         (BIT(x,2) << 8) |
-         (BIT(y,2) << 9) |
-         (BIT(y,3) << 10) |
-         (((pitch & 0x20) ? (b & 0x01) : ((b & 0x01) ^ (BIT(y,4)))) << 11) |
-         ((b >> 1) << 12);
-      return a;
-   }
-}
-
-static GLuint
-r200_mba_z16( driRenderbuffer *drb, GLint x, GLint y )
-{
-   GLuint pitch = drb->pitch;
-   if (drb->depthHasSurface) {
-      return 2 * (x + y * pitch);
-   }
-   else {
-      GLuint b = ((y & 0x7FF) >> 4) * ((pitch & 0xFFF) >> 6) + ((x & 0x7FF) >> 6);
-      GLuint a = 
-         (BIT(x,0) << 1) |
-         (BIT(y,0) << 2) |
-         (BIT(x,1) << 3) |
-         (BIT(y,1) << 4) |
-         (BIT(x,2) << 5) |
-         (BIT(x,4) << 6) |
-         (BIT(x,5) << 7) |
-         (BIT(x,3) << 8) |
-         (BIT(y,2) << 9) |
-         (BIT(y,3) << 10) |
-         (((pitch & 0x40) ? (b & 0x01) : ((b & 0x01) ^ (BIT(y,4)))) << 11) |
-         ((b >> 1) << 12);
-      return a;
-   }
-}
-
-
-/* 16-bit depth buffer functions
- */
-#define VALUE_TYPE GLushort
-
-#define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo )) = d;
-
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + r200_mba_z16( drb, _x + xo, _y + yo ));
-
-#define TAG(x) r200##x##_z16
-#include "depthtmp.h"
-
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- */
-#define VALUE_TYPE GLuint
-
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xff000000;							\
-   tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + r200_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
-
-#define TAG(x) r200##x##_z24_s8
-#include "depthtmp.h"
-
-
-/* ================================================================
- * Stencil buffer
- */
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- */
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x00ffffff;							\
-   tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = r200_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xff000000;							\
-   d = tmp >> 24;							\
-} while (0)
-
-#define TAG(x) r200##x##_z24_s8
-#include "stenciltmp.h"
-
-
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
-
-static void r200SpanRenderStart( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-
-   R200_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-   r200WaitForIdleLocked( rmesa );
-
-   /* Read & rewrite the first pixel in the frame buffer.  This should
-    * be a noop, right?  In fact without this conform fails as reading
-    * from the framebuffer sometimes produces old results -- the
-    * on-card read cache gets mixed up and doesn't notice that the
-    * framebuffer has been updated.
-    *
-    * In the worst case this is buggy too as p might get the wrong
-    * value first time, so really need a hidden pixel somewhere for this.
-    */
-   {
-      int p;
-      driRenderbuffer *drb =
-	 (driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
-      volatile int *buf =
-	 (volatile int *)(rmesa->dri.screen->pFB + drb->offset);
-      p = *buf;
-      *buf = p;
-   }
-}
-
-static void r200SpanRenderFinish( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-   _swrast_flush( ctx );
-   UNLOCK_HARDWARE( rmesa );
-}
-
-void r200InitSpanFuncs( GLcontext *ctx )
-{
-   struct swrast_device_driver *swdd = _swrast_GetDeviceDriverReference(ctx);
-   swdd->SpanRenderStart          = r200SpanRenderStart;
-   swdd->SpanRenderFinish         = r200SpanRenderFinish; 
-}
-
-
-
-/**
- * Plug in the Get/Put routines for the given driRenderbuffer.
- */
-void
-radeonSetSpanFunctions(driRenderbuffer *drb, const GLvisual *vis)
-{
-   if (drb->Base.InternalFormat == GL_RGBA) {
-      if (vis->redBits == 5 && vis->greenBits == 6 && vis->blueBits == 5) {
-         r200InitPointers_RGB565(&drb->Base);
-      }
-      else {
-         r200InitPointers_ARGB8888(&drb->Base);
-      }
-   }
-   else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-      r200InitDepthPointers_z16(&drb->Base);
-   }
-   else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-      r200InitDepthPointers_z24_s8(&drb->Base);
-   }
-   else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-      r200InitStencilPointers_z24_s8(&drb->Base);
-   }
-}
diff --git a/src/mesa/drivers/dri/r200/r200_state.c b/src/mesa/drivers/dri/r200/r200_state.c
index 2fcc87c0f5a..5a6fd20d8c2 100644
--- a/src/mesa/drivers/dri/r200/r200_state.c
+++ b/src/mesa/drivers/dri/r200/r200_state.c
@@ -47,6 +47,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_pipeline.h"
 #include "swrast_setup/swrast_setup.h"
 
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_state.h"
@@ -77,7 +79,7 @@ static void r200AlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
 
    switch ( func ) {
    case GL_NEVER:
-      pp_misc |= R200_ALPHA_TEST_FAIL; 
+      pp_misc |= R200_ALPHA_TEST_FAIL;
       break;
    case GL_LESS:
       pp_misc |= R200_ALPHA_TEST_LESS;
@@ -114,8 +116,8 @@ static void r200BlendColor( GLcontext *ctx, const GLfloat cf[4] )
    CLAMPED_FLOAT_TO_UBYTE(color[1], cf[1]);
    CLAMPED_FLOAT_TO_UBYTE(color[2], cf[2]);
    CLAMPED_FLOAT_TO_UBYTE(color[3], cf[3]);
-   if (rmesa->r200Screen->drmSupportsBlendColor)
-      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = r200PackColor( 4, color[0], color[1], color[2], color[3] );
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
+      rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = radeonPackColor( 4, color[0], color[1], color[2], color[3] );
 }
 
 /**
@@ -213,7 +215,7 @@ static void r200_set_blend_state( GLcontext * ctx )
 
    R200_STATECHANGE( rmesa, ctx );
 
-   if (rmesa->r200Screen->drmSupportsBlendColor) {
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
       if (ctx->Color.ColorLogicOpEnabled) {
          rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] =  cntl | R200_ROP_ENABLE;
          rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = eqn | func;
@@ -278,7 +280,7 @@ static void r200_set_blend_state( GLcontext * ctx )
       return;
    }
 
-   if (!rmesa->r200Screen->drmSupportsBlendColor) {
+   if (!rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
       rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] = eqn | func;
       return;
    }
@@ -383,10 +385,10 @@ static void r200ClearDepth( GLcontext *ctx, GLclampd d )
 
    switch ( format ) {
    case R200_DEPTH_FORMAT_16BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x0000ffff;
+      rmesa->radeon.state.depth.clear = d * 0x0000ffff;
       break;
    case R200_DEPTH_FORMAT_24BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x00ffffff;
+      rmesa->radeon.state.depth.clear = d * 0x00ffffff;
       break;
    }
 }
@@ -477,10 +479,10 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 	 }
       }
       break;
-   case GL_FOG_COLOR: 
+   case GL_FOG_COLOR:
       R200_STATECHANGE( rmesa, ctx );
       UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
-      i = r200PackColor( 4, col[0], col[1], col[2], 0 );
+      i = radeonPackColor( 4, col[0], col[1], col[2], 0 );
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~R200_FOG_COLOR_MASK;
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= i;
       break;
@@ -505,7 +507,7 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 
       if (out_0 != rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0]) {
 	 R200_STATECHANGE( rmesa, vtx );
-	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_0;	 
+	 rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] = out_0;
       }
 
       break;
@@ -521,102 +523,6 @@ static void r200Fogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
    }
 }
 
-
-/* =============================================================
- * Scissoring
- */
-
-
-static GLboolean intersect_rect( drm_clip_rect_t *out,
-				 drm_clip_rect_t *a,
-				 drm_clip_rect_t *b )
-{
-   *out = *a;
-   if ( b->x1 > out->x1 ) out->x1 = b->x1;
-   if ( b->y1 > out->y1 ) out->y1 = b->y1;
-   if ( b->x2 < out->x2 ) out->x2 = b->x2;
-   if ( b->y2 < out->y2 ) out->y2 = b->y2;
-   if ( out->x1 >= out->x2 ) return GL_FALSE;
-   if ( out->y1 >= out->y2 ) return GL_FALSE;
-   return GL_TRUE;
-}
-
-
-void r200RecalcScissorRects( r200ContextPtr rmesa )
-{
-   drm_clip_rect_t *out;
-   int i;
-
-   /* Grow cliprect store?
-    */
-   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
-	 rmesa->state.scissor.numAllocedClipRects *= 2;
-      }
-
-      if (rmesa->state.scissor.pClipRects)
-	 FREE(rmesa->state.scissor.pClipRects);
-
-      rmesa->state.scissor.pClipRects = 
-	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
-		 sizeof(drm_clip_rect_t) );
-
-      if ( rmesa->state.scissor.pClipRects == NULL ) {
-	 rmesa->state.scissor.numAllocedClipRects = 0;
-	 return;
-      }
-   }
-   
-   out = rmesa->state.scissor.pClipRects;
-   rmesa->state.scissor.numClipRects = 0;
-
-   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
-      if ( intersect_rect( out, 
-			   &rmesa->pClipRects[i], 
-			   &rmesa->state.scissor.rect ) ) {
-	 rmesa->state.scissor.numClipRects++;
-	 out++;
-      }
-   }
-}
-
-
-static void r200UpdateScissor( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if ( rmesa->dri.drawable ) {
-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-
-      int x = ctx->Scissor.X;
-      int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
-      int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
-      int h = dPriv->h - ctx->Scissor.Y - 1;
-
-      rmesa->state.scissor.rect.x1 = x + dPriv->x;
-      rmesa->state.scissor.rect.y1 = y + dPriv->y;
-      rmesa->state.scissor.rect.x2 = w + dPriv->x + 1;
-      rmesa->state.scissor.rect.y2 = h + dPriv->y + 1;
-
-      r200RecalcScissorRects( rmesa );
-   }
-}
-
-
-static void r200Scissor( GLcontext *ctx,
-			   GLint x, GLint y, GLsizei w, GLsizei h )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if ( ctx->Scissor.Enabled ) {
-      R200_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
-      r200UpdateScissor( ctx );
-   }
-
-}
-
-
 /* =============================================================
  * Culling
  */
@@ -668,6 +574,10 @@ static void r200FrontFace( GLcontext *ctx, GLenum mode )
    R200_STATECHANGE( rmesa, tcl );
    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_CULL_FRONT_IS_CCW;
 
+   /* Winding is inverted when rendering to FBO */
+   if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+      mode = (mode == GL_CW) ? GL_CCW : GL_CW;
+
    switch ( mode ) {
    case GL_CW:
       rmesa->hw.set.cmd[SET_SE_CNTL] |= R200_FFACE_CULL_CW;
@@ -790,7 +700,7 @@ static void r200LineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
    R200_STATECHANGE( rmesa, lin );
-   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = 
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] =
       ((((GLuint)factor & 0xff) << 16) | ((GLuint)pattern));
 }
 
@@ -803,21 +713,27 @@ static void r200ColorMask( GLcontext *ctx,
 			   GLboolean b, GLboolean a )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint mask = r200PackColor( rmesa->r200Screen->cpp,
-				ctx->Color.ColorMask[RCOMP],
-				ctx->Color.ColorMask[GCOMP],
-				ctx->Color.ColorMask[BCOMP],
-				ctx->Color.ColorMask[ACOMP] );
-
+   GLuint mask;
+   struct radeon_renderbuffer *rrb;
    GLuint flag = rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] & ~R200_PLANE_MASK_ENABLE;
 
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
+   mask = radeonPackColor( rrb->cpp,
+			   ctx->Color.ColorMask[RCOMP],
+			   ctx->Color.ColorMask[GCOMP],
+			   ctx->Color.ColorMask[BCOMP],
+			   ctx->Color.ColorMask[ACOMP] );
+
+
    if (!(r && g && b && a))
       flag |= R200_PLANE_MASK_ENABLE;
 
-   if ( rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] != flag ) { 
-      R200_STATECHANGE( rmesa, ctx ); 
-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = flag; 
-   } 
+   if ( rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] != flag ) {
+      R200_STATECHANGE( rmesa, ctx );
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = flag;
+   }
 
    if ( rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] != mask ) {
       R200_STATECHANGE( rmesa, msk );
@@ -834,7 +750,8 @@ static void r200PolygonOffset( GLcontext *ctx,
 			       GLfloat factor, GLfloat units )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   float_ui32_type constant =  { units * rmesa->state.depth.scale };
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   float_ui32_type constant =  { units * depthScale };
    float_ui32_type factoru = { factor };
 
 /*    factor *= 2; */
@@ -861,15 +778,15 @@ static void r200PolygonStipple( GLcontext *ctx, const GLubyte *mask )
 
    /* TODO: push this into cmd mechanism
     */
-   R200_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
+   radeon_firevertices(&rmesa->radeon);
+   LOCK_HARDWARE( &rmesa->radeon );
 
    /* FIXME: Use window x,y offsets into stipple RAM.
     */
    stipple.mask = rmesa->state.stipple.mask;
-   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE, 
+   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE,
                     &stipple, sizeof(stipple) );
-   UNLOCK_HARDWARE( rmesa );
+   UNLOCK_HARDWARE( &rmesa->radeon );
 }
 
 static void r200PolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
@@ -878,10 +795,10 @@ static void r200PolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
    GLboolean flag = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
 
    /* Can't generally do unfilled via tcl, but some good special
-    * cases work. 
+    * cases work.
     */
    TCL_FALLBACK( ctx, R200_TCL_FALLBACK_UNFILLED, flag);
-   if (rmesa->TclFallback) {
+   if (rmesa->radeon.TclFallback) {
       r200ChooseRenderState( ctx );
       r200ChooseVertexState( ctx );
    }
@@ -920,34 +837,34 @@ static void r200UpdateSpecular( GLcontext *ctx )
 
    if (ctx->Light.Enabled &&
        ctx->Light.Model.ColorControl == GL_SEPARATE_SPECULAR_COLOR) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
 	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT) |
-	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_0;
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHTING_ENABLE;
       p |=  R200_SPECULAR_ENABLE;
-      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= 
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &=
 	 ~R200_DIFFUSE_SPECULAR_COMBINE;
    }
    else if (ctx->Light.Enabled) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_0;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHTING_ENABLE;
    } else if (ctx->Fog.ColorSumEnabled ) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
 	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT) |
-	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+	  (R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       p |=  R200_SPECULAR_ENABLE;
    } else {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_0_SHIFT));
    }
 
    if (ctx->Fog.Enabled) {
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |= 
-	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));	
+      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_0] |=
+	 ((R200_VTX_FP_RGBA << R200_VTX_COLOR_1_SHIFT));
       rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL] |= R200_OUTPUT_COLOR_1;
    }
 
@@ -958,7 +875,7 @@ static void r200UpdateSpecular( GLcontext *ctx )
 
    /* Update vertex/render formats
     */
-   if (rmesa->TclFallback) { 
+   if (rmesa->radeon.TclFallback) {
       r200ChooseRenderState( ctx );
       r200ChooseVertexState( ctx );
    }
@@ -970,7 +887,7 @@ static void r200UpdateSpecular( GLcontext *ctx )
  */
 
 
-/* Update on colormaterial, material emmissive/ambient, 
+/* Update on colormaterial, material emmissive/ambient,
  * lightmodel.globalambient
  */
 static void update_global_ambient( GLcontext *ctx )
@@ -984,23 +901,23 @@ static void update_global_ambient( GLcontext *ctx )
     */
    if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] &
        ((3 << R200_FRONT_EMISSIVE_SOURCE_SHIFT) |
-	(3 << R200_FRONT_AMBIENT_SOURCE_SHIFT))) == 0) 
+	(3 << R200_FRONT_AMBIENT_SOURCE_SHIFT))) == 0)
    {
-      COPY_3V( &fcmd[GLT_RED], 
+      COPY_3V( &fcmd[GLT_RED],
 	       ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
       ACC_SCALE_3V( &fcmd[GLT_RED],
 		   ctx->Light.Model.Ambient,
 		   ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
-   } 
+   }
    else
    {
       COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
    }
-   
+
    R200_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
 }
 
-/* Update on change to 
+/* Update on change to
  *    - light[p].colors
  *    - light[p].enabled
  */
@@ -1014,10 +931,10 @@ static void update_light_colors( GLcontext *ctx, GLuint p )
       r200ContextPtr rmesa = R200_CONTEXT(ctx);
       float *fcmd = (float *)R200_DB_STATE( lit[p] );
 
-      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );
       COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
       COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
-      
+
       R200_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
    }
 }
@@ -1037,7 +954,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 
    if (ctx->Light.ColorMaterialEnabled) {
       GLuint mask = ctx->Light.ColorMaterialBitmask;
-   
+
       if (mask & MAT_BIT_FRONT_EMISSION) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_EMISSIVE_SOURCE_SHIFT);
@@ -1053,7 +970,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       else
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_AMBIENT_SOURCE_SHIFT);
-	 
+
       if (mask & MAT_BIT_FRONT_DIFFUSE) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_DIFFUSE_SOURCE_SHIFT);
@@ -1061,7 +978,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       else
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_DIFFUSE_SOURCE_SHIFT);
-   
+
       if (mask & MAT_BIT_FRONT_SPECULAR) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_FRONT_SPECULAR_SOURCE_SHIFT);
@@ -1070,7 +987,7 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
          light_model_ctl1 |= (R200_LM1_SOURCE_MATERIAL_0 <<
 			     R200_FRONT_SPECULAR_SOURCE_SHIFT);
       }
-   
+
       if (mask & MAT_BIT_BACK_EMISSION) {
 	 light_model_ctl1 |= (R200_LM1_SOURCE_VERTEX_COLOR_0 <<
 			     R200_BACK_EMISSIVE_SOURCE_SHIFT);
@@ -1120,8 +1037,8 @@ static void r200ColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
       R200_STATECHANGE( rmesa, tcl );
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_1] = light_model_ctl1;
    }
-   
-   
+
+
 }
 
 void r200UpdateMaterial( GLcontext *ctx )
@@ -1131,7 +1048,7 @@ void r200UpdateMaterial( GLcontext *ctx )
    GLfloat *fcmd = (GLfloat *)R200_DB_STATE( mtl[0] );
    GLfloat *fcmd2 = (GLfloat *)R200_DB_STATE( mtl[1] );
    GLuint mask = ~0;
-   
+
    /* Might be possible and faster to update everything unconditionally? */
    if (ctx->Light.ColorMaterialEnabled)
       mask &= ~ctx->Light.ColorMaterialBitmask;
@@ -1217,7 +1134,7 @@ void r200UpdateMaterial( GLcontext *ctx )
  *
  * which are calculated in light.c and are correct for the current
  * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
- * and _MESA_NEW_NEED_EYE_COORDS.  
+ * and _MESA_NEW_NEED_EYE_COORDS.
  */
 static void update_light( GLcontext *ctx )
 {
@@ -1234,8 +1151,8 @@ static void update_light( GLcontext *ctx )
 	 tmp &= ~R200_LIGHT_IN_MODELSPACE;
       else
 	 tmp |= R200_LIGHT_IN_MODELSPACE;
-      
-      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]) 
+
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0])
       {
 	 R200_STATECHANGE( rmesa, tcl );
 	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] = tmp;
@@ -1259,10 +1176,10 @@ static void update_light( GLcontext *ctx )
 	 if (ctx->Light.Light[p].Enabled) {
 	    struct gl_light *l = &ctx->Light.Light[p];
 	    GLfloat *fcmd = (GLfloat *)R200_DB_STATE( lit[p] );
-	    
+
 	    if (l->EyePosition[3] == 0.0) {
-	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
-	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm );
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm );
 	       fcmd[LIT_POSITION_W] = 0;
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    } else {
@@ -1286,21 +1203,21 @@ static void r200Lightfv( GLcontext *ctx, GLenum light,
    GLint p = light - GL_LIGHT0;
    struct gl_light *l = &ctx->Light.Light[p];
    GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
-   
+
 
    switch (pname) {
-   case GL_AMBIENT:		
+   case GL_AMBIENT:
    case GL_DIFFUSE:
    case GL_SPECULAR:
       update_light_colors( ctx, p );
       break;
 
-   case GL_SPOT_DIRECTION: 
-      /* picked up in update_light */	
+   case GL_SPOT_DIRECTION:
+      /* picked up in update_light */
       break;
 
    case GL_POSITION: {
-      /* positions picked up in update_light, but can do flag here */	
+      /* positions picked up in update_light, but can do flag here */
       GLuint flag = (p&1)? R200_LIGHT_1_IS_LOCAL : R200_LIGHT_0_IS_LOCAL;
       GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
 
@@ -1416,7 +1333,7 @@ static void r200LightModelfv( GLcontext *ctx, GLenum pname,
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
    switch (pname) {
-      case GL_LIGHT_MODEL_AMBIENT: 
+      case GL_LIGHT_MODEL_AMBIENT:
 	 update_global_ambient( ctx );
 	 break;
 
@@ -1430,7 +1347,7 @@ static void r200LightModelfv( GLcontext *ctx, GLenum pname,
 	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] |= R200_LIGHT_TWOSIDE;
 	 else
 	    rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~(R200_LIGHT_TWOSIDE);
-	 if (rmesa->TclFallback) {
+	 if (rmesa->radeon.TclFallback) {
 	    r200ChooseRenderState( ctx );
 	    r200ChooseVertexState( ctx );
 	 }
@@ -1675,7 +1592,7 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   rmesa->state.stencil.clear = 
+   rmesa->radeon.state.stencil.clear =
       ((GLuint) (ctx->Stencil.Clear & 0xff) |
        (0xff << R200_STENCIL_MASK_SHIFT) |
        ((ctx->Stencil.WriteMask[0] & 0xff) << R200_STENCIL_WRITEMASK_SHIFT));
@@ -1700,19 +1617,29 @@ static void r200ClearStencil( GLcontext *ctx, GLint s )
 void r200UpdateWindow( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   GLfloat xoffset = (GLfloat)dPriv->x;
-   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
+   GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
+   const GLboolean render_to_fbo = (ctx->DrawBuffer ? (ctx->DrawBuffer->Name != 0) : 0);
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   GLfloat y_scale, y_bias;
+
+   if (render_to_fbo) {
+      y_scale = 1.0;
+      y_bias = 0;
+   } else {
+      y_scale = -1.0;
+      y_bias = yoffset;
+   }
 
    float_ui32_type sx = { v[MAT_SX] };
    float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
-   float_ui32_type sy = { - v[MAT_SY] };
-   float_ui32_type ty = { (- v[MAT_TY]) + yoffset + SUBPIXEL_Y };
-   float_ui32_type sz = { v[MAT_SZ] * rmesa->state.depth.scale };
-   float_ui32_type tz = { v[MAT_TZ] * rmesa->state.depth.scale };
+   float_ui32_type sy = { v[MAT_SY] * y_scale };
+   float_ui32_type ty = { (v[MAT_TY] * y_scale) + y_bias + SUBPIXEL_Y };
+   float_ui32_type sz = { v[MAT_SZ] * depthScale };
+   float_ui32_type tz = { v[MAT_TZ] * depthScale };
 
-   R200_FIREVERTICES( rmesa );
    R200_STATECHANGE( rmesa, vpt );
 
    rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
@@ -1733,6 +1660,8 @@ static void r200Viewport( GLcontext *ctx, GLint x, GLint y,
     * values, or keep the originals hanging around.
     */
    r200UpdateWindow( ctx );
+
+   radeon_viewport(ctx, x, y, width, height);
 }
 
 static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
@@ -1744,7 +1673,7 @@ static void r200DepthRange( GLcontext *ctx, GLclampd nearval,
 void r200UpdateViewportOffset( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1774,8 +1703,8 @@ void r200UpdateViewportOffset( GLcontext *ctx )
                 R200_STIPPLE_Y_OFFSET_MASK);
 
          /* add magic offsets, then invert */
-         stx = 31 - ((rmesa->dri.drawable->x - 1) & R200_STIPPLE_COORD_MASK);
-         sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
+         stx = 31 - ((dPriv->x - 1) & R200_STIPPLE_COORD_MASK);
+         sty = 31 - ((dPriv->y + dPriv->h - 1)
                      & R200_STIPPLE_COORD_MASK);
 
          m |= ((stx << R200_STIPPLE_X_OFFSET_SHIFT) |
@@ -1788,7 +1717,7 @@ void r200UpdateViewportOffset( GLcontext *ctx )
       }
    }
 
-   r200UpdateScissor( ctx );
+   radeonUpdateScissor( ctx );
 }
 
 
@@ -1801,11 +1730,16 @@ static void r200ClearColor( GLcontext *ctx, const GLfloat c[4] )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLubyte color[4];
+   struct radeon_renderbuffer *rrb;
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
    CLAMPED_FLOAT_TO_UBYTE(color[0], c[0]);
    CLAMPED_FLOAT_TO_UBYTE(color[1], c[1]);
    CLAMPED_FLOAT_TO_UBYTE(color[2], c[2]);
    CLAMPED_FLOAT_TO_UBYTE(color[3], c[3]);
-   rmesa->state.color.clear = r200PackColor( rmesa->r200Screen->cpp,
+   rmesa->radeon.state.color.clear = radeonPackColor( rrb->cpp,
                                              color[0], color[1],
                                              color[2], color[3] );
 }
@@ -1848,96 +1782,6 @@ static void r200LogicOpCode( GLcontext *ctx, GLenum opcode )
    rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = r200_rop_tab[rop];
 }
 
-
-/*
- * Set up the cliprects for either front or back-buffer drawing.
- */
-void r200SetCliprects( r200ContextPtr rmesa )
-{
-   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
-   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
-   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
-
-   if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BIT_BACK_LEFT) {
-      /* Can't ignore 2d windows if we are page flipping.
-       */
-      if ( drawable->numBackClipRects == 0 || rmesa->doPageFlip ) {
-         rmesa->numClipRects = drawable->numClipRects;
-         rmesa->pClipRects = drawable->pClipRects;
-      }
-      else {
-         rmesa->numClipRects = drawable->numBackClipRects;
-         rmesa->pClipRects = drawable->pBackClipRects;
-      }
-   }
-   else {
-     /* front buffer (or none, or multiple buffers) */
-     rmesa->numClipRects = drawable->numClipRects;
-     rmesa->pClipRects = drawable->pClipRects;
-  }
-
-   if ((draw_fb->Width != drawable->w) || (draw_fb->Height != drawable->h)) {
-      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
-			       drawable->w, drawable->h);
-      draw_fb->Initialized = GL_TRUE;
-   }
-
-   if (drawable != readable) {
-      if ((read_fb->Width != readable->w) ||
-	  (read_fb->Height != readable->h)) {
-	 _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
-				  readable->w, readable->h);
-	 read_fb->Initialized = GL_TRUE;
-      }
-   }
-
-   if (rmesa->state.scissor.enabled)
-      r200RecalcScissorRects( rmesa );
-
-   rmesa->lastStamp = drawable->lastStamp;
-}
-
-
-static void r200DrawBuffer( GLcontext *ctx, GLenum mode )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (R200_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s %s\n", __FUNCTION__,
-	      _mesa_lookup_enum_by_nr( mode ));
-
-   R200_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
-
-   if (ctx->DrawBuffer->_NumColorDrawBuffers != 1) {
-      /* 0 (GL_NONE) buffers or multiple color drawing buffers */
-      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   switch ( ctx->DrawBuffer->_ColorDrawBufferIndexes[0] ) {
-   case BUFFER_FRONT_LEFT:
-   case BUFFER_BACK_LEFT:
-      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_FALSE );
-      break;
-   default:
-      FALLBACK( rmesa, R200_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   r200SetCliprects( rmesa );
-
-   /* We'll set the drawing engine's offset/pitch parameters later
-    * when we update other state.
-    */
-}
-
-
-static void r200ReadBuffer( GLcontext *ctx, GLenum mode )
-{
-   /* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
-}
-
 /* =============================================================
  * State enable/disable
  */
@@ -1979,7 +1823,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_CLIP_PLANE2:
    case GL_CLIP_PLANE3:
    case GL_CLIP_PLANE4:
-   case GL_CLIP_PLANE5: 
+   case GL_CLIP_PLANE5:
       p = cap-GL_CLIP_PLANE0;
       R200_STATECHANGE( rmesa, tcl );
       if (state) {
@@ -2013,10 +1857,10 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
       R200_STATECHANGE(rmesa, ctx );
       if ( state ) {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->radeon.state.color.roundEnable;
       } else {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->radeon.state.color.roundEnable;
       }
       break;
 
@@ -2031,7 +1875,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
 	 rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~R200_TCL_FOG_MASK;
       }
       r200UpdateSpecular( ctx ); /* for PK_SPEC */
-      if (rmesa->TclFallback) 
+      if (rmesa->radeon.TclFallback)
 	 r200ChooseVertexState( ctx );
       _mesa_allow_light_in_model( ctx, !state );
       break;
@@ -2046,13 +1890,13 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHT7:
       R200_STATECHANGE(rmesa, tcl);
       p = cap - GL_LIGHT0;
-      if (p&1) 
+      if (p&1)
 	 flag = (R200_LIGHT_1_ENABLE |
-		 R200_LIGHT_1_ENABLE_AMBIENT | 
+		 R200_LIGHT_1_ENABLE_AMBIENT |
 		 R200_LIGHT_1_ENABLE_SPECULAR);
       else
 	 flag = (R200_LIGHT_0_ENABLE |
-		 R200_LIGHT_0_ENABLE_AMBIENT | 
+		 R200_LIGHT_0_ENABLE_AMBIENT |
 		 R200_LIGHT_0_ENABLE_SPECULAR);
 
       if (state)
@@ -2060,7 +1904,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
       else
 	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
 
-      /* 
+      /*
        */
       update_light_colors( ctx, p );
       break;
@@ -2068,7 +1912,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHTING:
       r200UpdateSpecular(ctx);
       /* for reflection map fixup - might set recheck_texgen for all units too */
-      rmesa->NewGLState |= _NEW_TEXTURE;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE;
       break;
 
    case GL_LINE_SMOOTH:
@@ -2181,21 +2025,30 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    }
 
    case GL_SCISSOR_TEST:
-      R200_FIREVERTICES( rmesa );
-      rmesa->state.scissor.enabled = state;
-      r200UpdateScissor( ctx );
+      radeon_firevertices(&rmesa->radeon);
+      rmesa->radeon.state.scissor.enabled = state;
+      radeonUpdateScissor( ctx );
       break;
 
    case GL_STENCIL_TEST:
-      if ( rmesa->state.stencil.hwBuffer ) {
-	 R200_STATECHANGE( rmesa, ctx );
-	 if ( state ) {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_STENCIL_ENABLE;
+      {
+	 GLboolean hw_stencil = GL_FALSE;
+	 if (ctx->DrawBuffer) {
+	    struct radeon_renderbuffer *rrbStencil
+	       = radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+	    hw_stencil = (rrbStencil && rrbStencil->bo);
+	 }
+
+	 if (hw_stencil) {
+	    R200_STATECHANGE( rmesa, ctx );
+	    if ( state ) {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  R200_STENCIL_ENABLE;
+	    } else {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_STENCIL_ENABLE;
+	    }
 	 } else {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~R200_STENCIL_ENABLE;
+	    FALLBACK( rmesa, R200_FALLBACK_STENCIL, state );
 	 }
-      } else {
-	 FALLBACK( rmesa, R200_FALLBACK_STENCIL, state );
       }
       break;
 
@@ -2205,7 +2058,7 @@ static void r200Enable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_TEXTURE_GEN_T:
       /* Picked up in r200UpdateTextureState.
        */
-      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE; 
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE;
       break;
 
    case GL_COLOR_SUM_EXT:
@@ -2322,7 +2175,7 @@ void r200LightingSpaceChange( GLcontext *ctx )
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLboolean tmp;
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s %d BEFORE %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
 
@@ -2338,7 +2191,7 @@ void r200LightingSpaceChange( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0] &= ~R200_RESCALE_NORMALS;
    }
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL_0]);
 }
@@ -2381,7 +2234,7 @@ static void update_texturematrix( GLcontext *ctx )
    GLuint compsel = rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL];
    int unit;
 
-   if (R200_DEBUG & DEBUG_STATE) 
+   if (R200_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s before COMPSEL: %x\n", __FUNCTION__,
 	      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_COMPSEL]);
 
@@ -2389,7 +2242,7 @@ static void update_texturematrix( GLcontext *ctx )
    rmesa->TexMatCompSel = 0;
 
    for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
-      if (!ctx->Texture.Unit[unit]._ReallyEnabled) 
+      if (!ctx->Texture.Unit[unit]._ReallyEnabled)
 	 continue;
 
       if (ctx->TextureMatrixStack[unit].Top->type != MATRIX_IDENTITY) {
@@ -2399,21 +2252,21 @@ static void update_texturematrix( GLcontext *ctx )
 	 rmesa->TexMatCompSel |= R200_OUTPUT_TEX_0 << unit;
 
 	 if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {
-	    /* Need to preconcatenate any active texgen 
+	    /* Need to preconcatenate any active texgen
 	     * obj/eyeplane matrices:
 	     */
 	    _math_matrix_mul_matrix( &rmesa->tmpmat,
-				     ctx->TextureMatrixStack[unit].Top, 
+				     ctx->TextureMatrixStack[unit].Top,
 				     &rmesa->TexGenMatrix[unit] );
 	    upload_matrix( rmesa, rmesa->tmpmat.m, R200_MTX_TEX0+unit );
-	 } 
+	 }
 	 else {
-	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m, 
+	    upload_matrix( rmesa, ctx->TextureMatrixStack[unit].Top->m,
 			   R200_MTX_TEX0+unit );
 	 }
       }
       else if (rmesa->TexGenEnabled & (R200_TEXMAT_0_ENABLE << unit)) {
-	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m, 
+	 upload_matrix( rmesa, rmesa->TexGenMatrix[unit].m,
 			R200_MTX_TEX0+unit );
       }
    }
@@ -2432,69 +2285,78 @@ static void update_texturematrix( GLcontext *ctx )
    }
 }
 
-
-
-/**
- * Tell the card where to render (offset, pitch).
- * Effected by glDrawBuffer, etc
- */
-void
-r200UpdateDrawBuffer(GLcontext *ctx)
+static GLboolean r200ValidateBuffers(GLcontext *ctx)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   driRenderbuffer *drb;
+   struct radeon_renderbuffer *rrb;
+   int i, ret;
 
-   if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-      /* draw to front */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
-   }
-   else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-      /* draw to back */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+   radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   /* color buffer */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
-   else {
-      /* drawing to multiple buffers, or none */
-      return;
+
+   /* depth buffer */
+   rrb = radeon_get_depthbuffer(&rmesa->radeon);
+   /* color buffer */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
 
-   assert(drb);
-   assert(drb->flippedPitch);
+   for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+      radeonTexObj *t;
 
-   R200_STATECHANGE( rmesa, ctx );
+      if (!ctx->Texture.Unit[i]._ReallyEnabled)
+	 continue;
 
-   /* Note: we used the (possibly) page-flipped values */
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
-     = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
-	& R200_COLOROFFSET_MASK);
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
+      t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+      if (t->image_override && t->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+      else if (t->mt->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->mt->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
    }
-}
-
 
+   ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, rmesa->radeon.dma.current, RADEON_GEM_DOMAIN_GTT, 0);
+   if (ret)
+       return GL_FALSE;
+   return GL_TRUE;
+}
 
-void r200ValidateState( GLcontext *ctx )
+GLboolean r200ValidateState( GLcontext *ctx )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint new_state = rmesa->NewGLState;
+   GLuint new_state = rmesa->radeon.NewGLState;
 
-   if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
-     r200UpdateDrawBuffer(ctx);
+   if (new_state & _NEW_BUFFERS) {
+      _mesa_update_framebuffer(ctx);
+      /* this updates the DrawBuffer's Width/Height if it's a FBO */
+      _mesa_update_draw_buffer_bounds(ctx);
+
+      R200_STATECHANGE(rmesa, ctx);
    }
 
-   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM)) {
+   if (new_state & (_NEW_TEXTURE | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)) {
       r200UpdateTextureState( ctx );
-      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
+      new_state |= rmesa->radeon.NewGLState; /* may add TEXTURE_MATRIX */
       r200UpdateLocalViewer( ctx );
    }
 
+   /* we need to do a space check here */
+   if (!r200ValidateBuffers(ctx))
+     return GL_FALSE;
+
 /* FIXME: don't really need most of these when vertex progs are enabled */
 
    /* Need an event driven matrix update?
     */
-   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
+   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
       upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, R200_MTX_MVP );
 
    /* Need these for lighting (shouldn't upload otherwise)
@@ -2518,11 +2380,12 @@ void r200ValidateState( GLcontext *ctx )
    /* emit all active clip planes if projection matrix changes.
     */
    if (new_state & (_NEW_PROJECTION)) {
-      if (ctx->Transform.ClipPlanesEnabled) 
+      if (ctx->Transform.ClipPlanesEnabled)
 	 r200UpdateClipPlanes( ctx );
    }
 
    if (new_state & (_NEW_PROGRAM|
+                    _NEW_PROGRAM_CONSTANTS |
    /* need to test for pretty much anything due to possible parameter bindings */
 	_NEW_MODELVIEW|_NEW_PROJECTION|_NEW_TRANSFORM|
 	_NEW_LIGHT|_NEW_TEXTURE|_NEW_TEXTURE_MATRIX|
@@ -2533,7 +2396,8 @@ void r200ValidateState( GLcontext *ctx )
       else TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, 0);
    }
 
-   rmesa->NewGLState = 0;
+   rmesa->radeon.NewGLState = 0;
+   return GL_TRUE;
 }
 
 
@@ -2544,7 +2408,7 @@ static void r200InvalidateState( GLcontext *ctx, GLuint new_state )
    _vbo_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
    _ae_invalidate_state( ctx, new_state );
-   R200_CONTEXT(ctx)->NewGLState |= new_state;
+   R200_CONTEXT(ctx)->radeon.NewGLState |= new_state;
 }
 
 /* A hack.  The r200 can actually cope just fine with materials
@@ -2573,12 +2437,13 @@ static void r200WrapRunPipeline( GLcontext *ctx )
    GLboolean has_material;
 
    if (0)
-      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
+      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->radeon.NewGLState);
 
    /* Validate state:
     */
-   if (rmesa->NewGLState)
-      r200ValidateState( ctx );
+   if (rmesa->radeon.NewGLState)
+      if (!r200ValidateState( ctx ))
+	 FALLBACK(rmesa, RADEON_FALLBACK_TEXTURE, GL_TRUE);
 
    has_material = !ctx->VertexProgram._Enabled && ctx->Light.Enabled && check_material( ctx );
 
@@ -2587,7 +2452,7 @@ static void r200WrapRunPipeline( GLcontext *ctx )
    }
 
    /* Run the pipeline.
-    */ 
+    */
    _tnl_run_pipeline( ctx );
 
    if (has_material) {
@@ -2603,8 +2468,8 @@ void r200InitStateFuncs( struct dd_function_table *functions )
    functions->UpdateState		= r200InvalidateState;
    functions->LightingSpaceChange	= r200LightingSpaceChange;
 
-   functions->DrawBuffer		= r200DrawBuffer;
-   functions->ReadBuffer		= r200ReadBuffer;
+   functions->DrawBuffer		= radeonDrawBuffer;
+   functions->ReadBuffer		= radeonReadBuffer;
 
    functions->AlphaFunc			= r200AlphaFunc;
    functions->BlendColor		= r200BlendColor;
@@ -2636,7 +2501,7 @@ void r200InitStateFuncs( struct dd_function_table *functions )
    functions->PointParameterfv		= r200PointParameter;
    functions->PointSize			= r200PointSize;
    functions->RenderMode		= r200RenderMode;
-   functions->Scissor			= r200Scissor;
+   functions->Scissor			= radeonScissor;
    functions->ShadeModel		= r200ShadeModel;
    functions->StencilFuncSeparate	= r200StencilFuncSeparate;
    functions->StencilMaskSeparate	= r200StencilMaskSeparate;
diff --git a/src/mesa/drivers/dri/r200/r200_state.h b/src/mesa/drivers/dri/r200/r200_state.h
index a917163a00a..23cf8aea667 100644
--- a/src/mesa/drivers/dri/r200/r200_state.h
+++ b/src/mesa/drivers/dri/r200/r200_state.h
@@ -43,23 +43,17 @@ extern void r200InitTnlFuncs( GLcontext *ctx );
 
 extern void r200UpdateMaterial( GLcontext *ctx );
 
-extern void r200SetCliprects( r200ContextPtr rmesa );
-extern void r200RecalcScissorRects( r200ContextPtr rmesa );
 extern void r200UpdateViewportOffset( GLcontext *ctx );
 extern void r200UpdateWindow( GLcontext *ctx );
 extern void r200UpdateDrawBuffer(GLcontext *ctx);
 
-extern void r200ValidateState( GLcontext *ctx );
-
-extern void r200PrintDirty( r200ContextPtr rmesa,
-			      const char *msg );
-
+extern GLboolean r200ValidateState( GLcontext *ctx );
 
 extern void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode );
 #define FALLBACK( rmesa, bit, mode ) do {				\
    if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
 		     __FUNCTION__, bit, mode );				\
-   r200Fallback( rmesa->glCtx, bit, mode );				\
+   r200Fallback( rmesa->radeon.glCtx, bit, mode );				\
 } while (0)
 
 extern void r200LightingSpaceChange( GLcontext *ctx );
diff --git a/src/mesa/drivers/dri/r200/r200_state_init.c b/src/mesa/drivers/dri/r200/r200_state_init.c
index 9e4677eda40..bc871d99048 100644
--- a/src/mesa/drivers/dri/r200/r200_state_init.c
+++ b/src/mesa/drivers/dri/r200/r200_state_init.c
@@ -43,6 +43,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_pipeline.h"
 #include "swrast_setup/swrast_setup.h"
 
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_ioctl.h"
 #include "r200_state.h"
@@ -52,31 +54,129 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "xmlpool.h"
 
+/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
+ * 1.3 cmdbuffers allow all previous state to be updated as well as
+ * the tcl scalar and vector areas.
+ */
+static struct {
+	int start;
+	int len;
+	const char *name;
+} packet[RADEON_MAX_STATE_PACKETS] = {
+	{RADEON_PP_MISC, 7, "RADEON_PP_MISC"},
+	{RADEON_PP_CNTL, 3, "RADEON_PP_CNTL"},
+	{RADEON_RB3D_COLORPITCH, 1, "RADEON_RB3D_COLORPITCH"},
+	{RADEON_RE_LINE_PATTERN, 2, "RADEON_RE_LINE_PATTERN"},
+	{RADEON_SE_LINE_WIDTH, 1, "RADEON_SE_LINE_WIDTH"},
+	{RADEON_PP_LUM_MATRIX, 1, "RADEON_PP_LUM_MATRIX"},
+	{RADEON_PP_ROT_MATRIX_0, 2, "RADEON_PP_ROT_MATRIX_0"},
+	{RADEON_RB3D_STENCILREFMASK, 3, "RADEON_RB3D_STENCILREFMASK"},
+	{RADEON_SE_VPORT_XSCALE, 6, "RADEON_SE_VPORT_XSCALE"},
+	{RADEON_SE_CNTL, 2, "RADEON_SE_CNTL"},
+	{RADEON_SE_CNTL_STATUS, 1, "RADEON_SE_CNTL_STATUS"},
+	{RADEON_RE_MISC, 1, "RADEON_RE_MISC"},
+	{RADEON_PP_TXFILTER_0, 6, "RADEON_PP_TXFILTER_0"},
+	{RADEON_PP_BORDER_COLOR_0, 1, "RADEON_PP_BORDER_COLOR_0"},
+	{RADEON_PP_TXFILTER_1, 6, "RADEON_PP_TXFILTER_1"},
+	{RADEON_PP_BORDER_COLOR_1, 1, "RADEON_PP_BORDER_COLOR_1"},
+	{RADEON_PP_TXFILTER_2, 6, "RADEON_PP_TXFILTER_2"},
+	{RADEON_PP_BORDER_COLOR_2, 1, "RADEON_PP_BORDER_COLOR_2"},
+	{RADEON_SE_ZBIAS_FACTOR, 2, "RADEON_SE_ZBIAS_FACTOR"},
+	{RADEON_SE_TCL_OUTPUT_VTX_FMT, 11, "RADEON_SE_TCL_OUTPUT_VTX_FMT"},
+	{RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED, 17,
+		    "RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED"},
+	{R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0"},
+	{R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1"},
+	{R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2"},
+	{R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3"},
+	{R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4"},
+	{R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5"},
+	{R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6"},
+	{R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7"},
+	{R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0"},
+	{R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0"},
+	{R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0"},
+	{R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL"},
+	{R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0"},
+	{R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2"},
+	{R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL"},
+	{R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0"},
+	{R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1"},
+	{R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2"},
+	{R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3"},
+	{R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4"},
+	{R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5"},
+	{R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0"},
+	{R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1"},
+	{R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2"},
+	{R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3"},
+	{R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4"},
+	{R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5"},
+	{R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL"},
+	{R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1,
+	 "R200_SE_TCL_OUTPUT_VTX_COMP_SEL"},
+	{R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3"},
+	{R200_PP_CNTL_X, 1, "R200_PP_CNTL_X"},
+	{R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET"},
+	{R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL"},
+	{R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0"},
+	{R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1"},
+	{R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2"},
+	{R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS"},
+	{R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL"},
+	{R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE"},
+	{R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4,
+		    "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0"},
+	{R200_PP_CUBIC_FACES_0, 1, "R200_PP_CUBIC_FACES_0"},	/* 61 */
+	{R200_PP_CUBIC_OFFSET_F1_0, 5, "R200_PP_CUBIC_OFFSET_F1_0"}, /* 62 */
+	{R200_PP_CUBIC_FACES_1, 1, "R200_PP_CUBIC_FACES_1"},
+	{R200_PP_CUBIC_OFFSET_F1_1, 5, "R200_PP_CUBIC_OFFSET_F1_1"},
+	{R200_PP_CUBIC_FACES_2, 1, "R200_PP_CUBIC_FACES_2"},
+	{R200_PP_CUBIC_OFFSET_F1_2, 5, "R200_PP_CUBIC_OFFSET_F1_2"},
+	{R200_PP_CUBIC_FACES_3, 1, "R200_PP_CUBIC_FACES_3"},
+	{R200_PP_CUBIC_OFFSET_F1_3, 5, "R200_PP_CUBIC_OFFSET_F1_3"},
+	{R200_PP_CUBIC_FACES_4, 1, "R200_PP_CUBIC_FACES_4"},
+	{R200_PP_CUBIC_OFFSET_F1_4, 5, "R200_PP_CUBIC_OFFSET_F1_4"},
+	{R200_PP_CUBIC_FACES_5, 1, "R200_PP_CUBIC_FACES_5"},
+	{R200_PP_CUBIC_OFFSET_F1_5, 5, "R200_PP_CUBIC_OFFSET_F1_5"},
+	{RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"},
+	{RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"},
+	{RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"},
+	{R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},
+	{R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
+	{RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0"},
+	{RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0"},
+	{RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1"},
+	{RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0"},
+	{RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2"},
+	{RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0"},
+	{R200_PP_TRI_PERF, 2, "R200_PP_TRI_PERF"},
+	{R200_PP_TXCBLEND_8, 32, "R200_PP_AFS_0"},     /* 85 */
+	{R200_PP_TXCBLEND_0, 32, "R200_PP_AFS_1"},
+	{R200_PP_TFACTOR_0, 8, "R200_ATF_TFACTOR"},
+	{R200_PP_TXFILTER_0, 8, "R200_PP_TXCTLALL_0"},
+	{R200_PP_TXFILTER_1, 8, "R200_PP_TXCTLALL_1"},
+	{R200_PP_TXFILTER_2, 8, "R200_PP_TXCTLALL_2"},
+	{R200_PP_TXFILTER_3, 8, "R200_PP_TXCTLALL_3"},
+	{R200_PP_TXFILTER_4, 8, "R200_PP_TXCTLALL_4"},
+	{R200_PP_TXFILTER_5, 8, "R200_PP_TXCTLALL_5"},
+	{R200_VAP_PVS_CNTL_1, 2, "R200_VAP_PVS_CNTL"},
+};
+
 /* =============================================================
  * State initialization
  */
-
-void r200PrintDirty( r200ContextPtr rmesa, const char *msg )
+static int cmdpkt( r200ContextPtr rmesa, int id ) 
 {
-   struct r200_state_atom *l;
-
-   fprintf(stderr, msg);
-   fprintf(stderr, ": ");
+   drm_radeon_cmd_header_t h;
 
-   foreach(l, &rmesa->hw.atomlist) {
-      if (l->dirty || rmesa->hw.all_dirty)
-	 fprintf(stderr, "%s, ", l->name);
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     return CP_PACKET0(packet[id].start, packet[id].len - 1);
+   } else {
+     h.i = 0;
+     h.packet.cmd_type = RADEON_CMD_PACKET;
+     h.packet.packet_id = id;
    }
-
-   fprintf(stderr, "\n");
-}
-
-static int cmdpkt( int id ) 
-{
-   drm_radeon_cmd_header_t h;
-   h.i = 0;
-   h.packet.cmd_type = RADEON_CMD_PACKET;
-   h.packet.packet_id = id;
    return h.i;
 }
 
@@ -127,150 +227,502 @@ static int cmdscl2( int offset, int stride, int count )
 }
 
 #define CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom) \
 {							\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
    (void) rmesa;					\
-   return FLAG;						\
+   return (FLAG) ? atom->cmd_size : 0;			\
 }
 
 #define TCL_CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
-{							\
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
-   return !rmesa->TclFallback && !ctx->VertexProgram._Enabled && (FLAG);	\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom) \
+{									\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
+   return (!rmesa->radeon.TclFallback && !ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size : 0; \
 }
 
 #define TCL_OR_VP_CHECK( NM, FLAG )			\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom ) \
 {							\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
-   return !rmesa->TclFallback && (FLAG);		\
+   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size : 0;	\
 }
 
 #define VP_CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx, int idx )	\
-{							\
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   (void) idx;						\
-   return !rmesa->TclFallback && ctx->VertexProgram._Enabled && (FLAG);		\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom ) \
+{									\
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);				\
+   (void) atom;								\
+   return (!rmesa->radeon.TclFallback && ctx->VertexProgram._Enabled && (FLAG)) ? atom->cmd_size : 0; \
 }
 
-
 CHECK( always, GL_TRUE )
 CHECK( never, GL_FALSE )
 CHECK( tex_any, ctx->Texture._EnabledUnits )
 CHECK( tf, (ctx->Texture._EnabledUnits && !ctx->ATIFragmentShader._Enabled) );
-CHECK( tex_pair, (rmesa->state.texture.unit[idx].unitneeded | rmesa->state.texture.unit[idx & ~1].unitneeded) )
-CHECK( tex, rmesa->state.texture.unit[idx].unitneeded )
+CHECK( tex_pair, (rmesa->state.texture.unit[atom->idx].unitneeded | rmesa->state.texture.unit[atom->idx & ~1].unitneeded) )
+CHECK( tex, rmesa->state.texture.unit[atom->idx].unitneeded )
 CHECK( pix_zero, !ctx->ATIFragmentShader._Enabled )
-CHECK( texenv, (rmesa->state.envneeded & (1 << idx) && !ctx->ATIFragmentShader._Enabled) )
+   CHECK( texenv, (rmesa->state.envneeded & (1 << (atom->idx)) && !ctx->ATIFragmentShader._Enabled) )
 CHECK( afs_pass1, (ctx->ATIFragmentShader._Enabled && (ctx->ATIFragmentShader.Current->NumPasses > 1)) )
 CHECK( afs, ctx->ATIFragmentShader._Enabled )
-CHECK( tex_cube, rmesa->state.texture.unit[idx].unitneeded & TEXTURE_CUBE_BIT )
+CHECK( tex_cube, rmesa->state.texture.unit[atom->idx].unitneeded & TEXTURE_CUBE_BIT )
 TCL_CHECK( tcl_fog, ctx->Fog.Enabled )
 TCL_CHECK( tcl, GL_TRUE )
-TCL_CHECK( tcl_tex, rmesa->state.texture.unit[idx].unitneeded )
+TCL_CHECK( tcl_tex, rmesa->state.texture.unit[atom->idx].unitneeded )
 TCL_CHECK( tcl_lighting, ctx->Light.Enabled )
-TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[idx].Enabled )
-TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << idx)) )
+TCL_CHECK( tcl_light, ctx->Light.Enabled && ctx->Light.Light[atom->idx].Enabled )
+TCL_OR_VP_CHECK( tcl_ucp, (ctx->Transform.ClipPlanesEnabled & (1 << (atom->idx))) )
 TCL_OR_VP_CHECK( tcl_or_vp, GL_TRUE )
 VP_CHECK( tcl_vp, GL_TRUE )
 VP_CHECK( tcl_vp_size, ctx->VertexProgram.Current->Base.NumNativeInstructions > 64 )
 VP_CHECK( tcl_vpp_size, ctx->VertexProgram.Current->Base.NumNativeParameters > 96 )
 
+#define OUT_VEC(hdr, data) do {			\
+    drm_radeon_cmd_header_t h;					\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
+    OUT_BATCH(0);							\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
+    OUT_BATCH(h.vectors.offset | (h.vectors.stride << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, h.vectors.count - 1));	\
+    OUT_BATCH_TABLE((data), h.vectors.count);				\
+  } while(0)
+
+#define OUT_VECLINEAR(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    uint32_t _start, _sz;						\
+    h.i = hdr;								\
+    _start = h.veclinear.addr_lo | (h.veclinear.addr_hi << 8);		\
+    _sz = h.veclinear.count * 4;					\
+    if (r200->radeon.radeonScreen->kernel_mm && _sz) { \
+    BEGIN_BATCH_NO_AUTOSTATE(dwords); \
+    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
+    OUT_BATCH(0);							\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
+    OUT_BATCH(_start | (1 << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT));	\
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, _sz - 1));	\
+    OUT_BATCH_TABLE((data), _sz);					\
+    END_BATCH(); \
+    } \
+  } while(0)
+
+#define OUT_SCL(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
+    OUT_BATCH((h.scalars.offset) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
+    OUT_BATCH_TABLE((data), h.scalars.count);				\
+  } while(0)
+
+#define OUT_SCL2(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
+    OUT_BATCH((h.scalars.offset + 0x100) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
+    OUT_BATCH_TABLE((data), h.scalars.count);				\
+  } while(0)
+
+static void mtl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 6;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[MTL_CMD_0], (atom->cmd+1));
+   OUT_SCL2(atom->cmd[MTL_CMD_1], (atom->cmd + 18));
+   END_BATCH();
+}
 
-/* Initialize the context's hardware state.
- */
-void r200InitState( r200ContextPtr rmesa )
+static void lit_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
-   GLcontext *ctx = rmesa->glCtx;
-   GLuint color_fmt, depth_fmt, i;
-   GLint drawPitch, drawOffset;
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 8;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[LIT_CMD_0], atom->cmd+1);
+   OUT_VEC(atom->cmd[LIT_CMD_1], atom->cmd+LIT_CMD_1+1);
+   END_BATCH();
+}
 
-   switch ( rmesa->r200Screen->cpp ) {
-   case 2:
-      color_fmt = R200_COLOR_FORMAT_RGB565;
-      break;
-   case 4:
-      color_fmt = R200_COLOR_FORMAT_ARGB8888;
-      break;
-   default:
-      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
-      exit( -1 );
+static void ptp_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 8;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[PTP_CMD_0], atom->cmd+1);
+   OUT_VEC(atom->cmd[PTP_CMD_1], atom->cmd+PTP_CMD_1+1);
+   END_BATCH();
+}
+
+static void veclinear_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 4;
+   OUT_VECLINEAR(atom->cmd[0], atom->cmd+1);
+}
+
+static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 2;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_SCL(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
+
+
+static void vec_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 4;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
+
+static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   struct radeon_renderbuffer *rrb;
+   uint32_t cbpitch;
+   uint32_t zbpitch, depth_fmt;
+   uint32_t dwords = atom->cmd_size;
+
+   /* output the first 7 bytes of context */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords+2+2);
+   OUT_BATCH_TABLE(atom->cmd, 5);
+
+   rrb = radeon_get_depthbuffer(&r200->radeon);
+   if (!rrb) {
+     OUT_BATCH(0);
+     OUT_BATCH(0);
+   } else {
+     zbpitch = (rrb->pitch / rrb->cpp);
+     if (r200->using_hyperz)
+       zbpitch |= RADEON_DEPTH_HYPERZ;
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+     OUT_BATCH(zbpitch);
+     if (rrb->cpp == 4) 
+       depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z; 
+     else 
+       depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z; 
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK; 
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt; 
+   }
+     
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(atom->cmd[CTX_CMD_1]);
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+
+   rrb = radeon_get_colorbuffer(&r200->radeon);
+   if (!rrb || !rrb->bo) {
+     OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+     OUT_BATCH(atom->cmd[CTX_RB3D_COLOROFFSET]);
+   } else {
+     atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10); 
+     if (rrb->cpp == 4) 
+       atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888; 
+     else 
+       atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565; 
+ 
+     OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]); 
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
    }
 
-   rmesa->state.color.clear = 0x00000000;
+   OUT_BATCH(atom->cmd[CTX_CMD_2]);
 
-   switch ( ctx->Visual.depthBits ) {
-   case 16:
-      rmesa->state.depth.clear = 0x0000ffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
-      depth_fmt = R200_DEPTH_FORMAT_16BIT_INT_Z;
-      rmesa->state.stencil.clear = 0x00000000;
-      break;
-   case 24:
-      rmesa->state.depth.clear = 0x00ffffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
-      depth_fmt = R200_DEPTH_FORMAT_24BIT_INT_Z;
-      rmesa->state.stencil.clear = 0xffff0000;
-      break;
-   default:
-      fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
-	       ctx->Visual.depthBits );
-      exit( -1 );
+   if (!rrb || !rrb->bo) {
+     OUT_BATCH(atom->cmd[CTX_RB3D_COLORPITCH]);
+   } else {
+     cbpitch = (rrb->pitch / rrb->cpp);
+     if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= R200_COLOR_TILE_ENABLE;
+     OUT_BATCH(cbpitch);
    }
 
-   /* Only have hw stencil when depth buffer is 24 bits deep */
-   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
-				     ctx->Visual.depthBits == 24 );
+   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM)
+     OUT_BATCH_TABLE((atom->cmd + 14), 4);
+
+   END_BATCH();
+}
 
-   rmesa->Fallback = 0;
+static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   struct radeon_renderbuffer *rrb, *drb;
+   uint32_t cbpitch = 0;
+   uint32_t zbpitch = 0;
+   uint32_t dwords = atom->cmd_size;
+   uint32_t depth_fmt;
+
+   rrb = radeon_get_colorbuffer(&r200->radeon);
+   if (!rrb || !rrb->bo) {
+      return;
+   }
 
-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
-      drawOffset = rmesa->r200Screen->backOffset;
-      drawPitch  = rmesa->r200Screen->backPitch;
-   } else {
-      drawOffset = rmesa->r200Screen->frontOffset;
-      drawPitch  = rmesa->r200Screen->frontPitch;
+   atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
+   if (rrb->cpp == 4)
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
+   else switch (rrb->base._ActualFormat) {
+   case GL_RGB5:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
+	break;
+   case GL_RGBA4:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
+	break;
+   case GL_RGB5_A1:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
+	break;
+   }
+
+   cbpitch = (rrb->pitch / rrb->cpp);
+   if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= R200_COLOR_TILE_ENABLE;
+
+   drb = radeon_get_depthbuffer(&r200->radeon);
+   if (drb) {
+     zbpitch = (drb->pitch / drb->cpp);
+     if (drb->cpp == 4)
+        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+     else
+        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
+   }
+
+   dwords = 10;
+   if (drb)
+     dwords += 6;
+   if (rrb)
+     dwords += 6;
+   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM)
+     dwords += 4;
+
+   /* output the first 7 bytes of context */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   /* In the CS case we need to split this up */
+   OUT_BATCH(CP_PACKET0(packet[0].start, 3));
+   OUT_BATCH_TABLE((atom->cmd + 1), 4);
+
+   if (drb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0));
+     OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0));
+     OUT_BATCH(zbpitch);
    }
-#if 000
-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
-      rmesa->state.color.drawOffset = rmesa->r200Screen->backOffset;
-      rmesa->state.color.drawPitch  = rmesa->r200Screen->backPitch;
+
+   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZSTENCILCNTL, 0));
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 1));
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+   OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+
+
+   if (rrb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0));
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0));
+     OUT_BATCH(cbpitch);
+   }
+
+   if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) {
+     OUT_BATCH_TABLE((atom->cmd + 14), 4);
+   }
+
+   END_BATCH();
+}
+
+static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+   int i = atom->idx;
+   radeonTexObj *t = r200->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+
+   if (t && t->mt && !t->image_override)
+     dwords += 2;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   /* is this ok even with drm older than 1.18? */
+   OUT_BATCH_TABLE(atom->cmd, 10);
+
+   if (t && t->mt && !t->image_override) {
+     OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+		  RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+   } else if (!t) {
+     /* workaround for old CS mechanism */
+     OUT_BATCH(r200->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP]);
    } else {
-      rmesa->state.color.drawOffset = rmesa->r200Screen->frontOffset;
-      rmesa->state.color.drawPitch  = rmesa->r200Screen->frontPitch;
+     OUT_BATCH(t->override_offset);
    }
 
-   rmesa->state.pixel.readOffset = rmesa->state.color.drawOffset;
-   rmesa->state.pixel.readPitch  = rmesa->state.color.drawPitch;
-#endif
+   END_BATCH();
+}
+
+static void tex_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = atom->cmd_size;
+   int i = atom->idx;
+   radeonTexObj *t = r200->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+   int hastexture = 1;
+
+   if (!r200->state.texture.unit[i].unitneeded)
+        hastexture = 0;
+   if (!t)
+	hastexture = 0;
+   else {
+	if (!t->mt && !t->bo)
+		hastexture = 0;
+   }
+
+   if (hastexture)
+     dwords += 2;
+   else
+     dwords -= 2;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   OUT_BATCH(CP_PACKET0(R200_PP_TXFILTER_0 + (32 * i), 7));
+   OUT_BATCH_TABLE((atom->cmd + 1), 8);
+
+   if (hastexture) {
+     OUT_BATCH(CP_PACKET0(R200_PP_TXOFFSET_0 + (24 * i), 0));
+     if (t->mt && !t->image_override) {
+        OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+		  RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      } else {
+	if (t->bo)
+            OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+                            RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      }
+   }
+   END_BATCH();
+}
+
+
+static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = 3;
+   int i = atom->idx, j;
+   radeonTexObj *t = r200->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords + (3 * 5));
+   /* XXX that size won't really match with image_override... */
+   OUT_BATCH_TABLE(atom->cmd, 2);
+
+   if (t && !t->image_override) {
+     lvl = &t->mt->levels[0];
+     OUT_BATCH_TABLE((atom->cmd + 2), 1);
+     for (j = 1; j <= 5; j++) {
+       OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
+			RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     }
+   }
+   END_BATCH();
+}
+
+static void cube_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r200ContextPtr r200 = R200_CONTEXT(ctx);
+   BATCH_LOCALS(&r200->radeon);
+   uint32_t dwords = 2;
+   int i = atom->idx, j;
+   radeonTexObj *t = r200->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords + (4 * 5));
+   OUT_BATCH_TABLE(atom->cmd, 2);
+
+   if (t && !t->image_override) {
+     lvl = &t->mt->levels[0];
+     for (j = 1; j <= 5; j++) {
+       OUT_BATCH(CP_PACKET0(R200_PP_CUBIC_OFFSET_F1_0 + (24*i) + (4 * (j-1)), 0));
+       OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
+			RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     }
+   }
+   END_BATCH();
+}
+
+/* Initialize the context's hardware state.
+ */
+void r200InitState( r200ContextPtr rmesa )
+{
+   GLcontext *ctx = rmesa->radeon.glCtx;
+   GLuint i;
+
+   rmesa->radeon.state.color.clear = 0x00000000;
+
+   switch ( ctx->Visual.depthBits ) {
+   case 16:
+      rmesa->radeon.state.depth.clear = 0x0000ffff;
+      rmesa->radeon.state.stencil.clear = 0x00000000;
+      break;
+   case 24:
+   default:
+      rmesa->radeon.state.depth.clear = 0x00ffffff;
+      rmesa->radeon.state.stencil.clear = 0xffff0000;
+      break;
+   }
+
+   rmesa->radeon.Fallback = 0;
 
-   rmesa->hw.max_state_size = 0;
+   rmesa->radeon.hw.max_state_size = 0;
 
 #define ALLOC_STATE( ATOM, CHK, SZ, NM, IDX )				\
    do {								\
       rmesa->hw.ATOM.cmd_size = SZ;				\
-      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
-      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.cmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.lastcmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
       rmesa->hw.ATOM.name = NM;					\
       rmesa->hw.ATOM.idx = IDX;					\
       rmesa->hw.ATOM.check = check_##CHK;			\
       rmesa->hw.ATOM.dirty = GL_FALSE;				\
-      rmesa->hw.max_state_size += SZ * sizeof(int);		\
+      rmesa->radeon.hw.max_state_size += SZ * sizeof(int);		\
    } while (0)
 
 
    /* Allocate state buffers:
     */
-   if (rmesa->r200Screen->drmSupportsBlendColor)
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
       ALLOC_STATE( ctx, always, CTX_STATE_SIZE_NEWDRM, "CTX/context", 0 );
    else
       ALLOC_STATE( ctx, always, CTX_STATE_SIZE_OLDDRM, "CTX/context", 0 );
+
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     rmesa->hw.ctx.emit = ctx_emit_cs;
+   else
+     rmesa->hw.ctx.emit = ctx_emit;
    ALLOC_STATE( set, always, SET_STATE_SIZE, "SET/setup", 0 );
    ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
    ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
@@ -282,8 +734,8 @@ void r200InitState( r200ContextPtr rmesa )
    ALLOC_STATE( cst, always, CST_STATE_SIZE, "CST/constant", 0 );
    ALLOC_STATE( zbs, always, ZBS_STATE_SIZE, "ZBS/zbias", 0 );
    ALLOC_STATE( tf, tf, TF_STATE_SIZE, "TF/tfactor", 0 );
-   if (rmesa->r200Screen->drmSupportsFragShader) {
-      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
+   if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
       /* make sure texture units 0/1 are emitted pair-wise for r200 t0 hang workaround */
 	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-0", 0 );
 	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_NEWDRM, "TEX/tex-1", 1 );
@@ -303,7 +755,7 @@ void r200InitState( r200ContextPtr rmesa )
       ALLOC_STATE( afs[1], afs, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
    }
    else {
-      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
+      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
 	 ALLOC_STATE( tex[0], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-0", 0 );
 	 ALLOC_STATE( tex[1], tex_pair, TEX_STATE_SIZE_OLDDRM, "TEX/tex-1", 1 );
 	 ALLOC_STATE( tam, tex_any, TAM_STATE_SIZE, "TAM/tam", 0 );
@@ -321,13 +773,24 @@ void r200InitState( r200ContextPtr rmesa )
       ALLOC_STATE( afs[0], never, AFS_STATE_SIZE, "AFS/afsinst-0", 0 );
       ALLOC_STATE( afs[1], never, AFS_STATE_SIZE, "AFS/afsinst-1", 1 );
    }
-   if (rmesa->r200Screen->drmSupportsCubeMapsR200) {
+
+   for (i = 0; i < 5; i++)
+      if (rmesa->radeon.radeonScreen->kernel_mm)
+          rmesa->hw.tex[i].emit = tex_emit_cs;
+      else
+          rmesa->hw.tex[i].emit = tex_emit;
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR200) {
       ALLOC_STATE( cube[0], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
       ALLOC_STATE( cube[1], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-1", 1 );
       ALLOC_STATE( cube[2], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-2", 2 );
       ALLOC_STATE( cube[3], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-3", 3 );
       ALLOC_STATE( cube[4], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
       ALLOC_STATE( cube[5], tex_cube, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
+      for (i = 0; i < 5; i++)
+          if (rmesa->radeon.radeonScreen->kernel_mm)
+              rmesa->hw.cube[i].emit = cube_emit_cs;
+          else
+              rmesa->hw.cube[i].emit = cube_emit;
    }
    else {
       ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/tex-0", 0 );
@@ -337,7 +800,8 @@ void r200InitState( r200ContextPtr rmesa )
       ALLOC_STATE( cube[4], never, CUBE_STATE_SIZE, "CUBE/tex-4", 4 );
       ALLOC_STATE( cube[5], never, CUBE_STATE_SIZE, "CUBE/tex-5", 5 );
    }
-   if (rmesa->r200Screen->drmSupportsVertexProgram) {
+
+   if (rmesa->radeon.radeonScreen->drmSupportsVertexProgram) {
       ALLOC_STATE( pvs, tcl_vp, PVS_STATE_SIZE, "PVS/pvscntl", 0 );
       ALLOC_STATE( vpi[0], tcl_vp, VPI_STATE_SIZE, "VP/vertexprog-0", 0 );
       ALLOC_STATE( vpi[1], tcl_vp_size, VPI_STATE_SIZE, "VP/vertexprog-1", 1 );
@@ -390,13 +854,13 @@ void r200InitState( r200ContextPtr rmesa )
    ALLOC_STATE( pix[3], texenv, PIX_STATE_SIZE, "PIX/pixstage-3", 3 );
    ALLOC_STATE( pix[4], texenv, PIX_STATE_SIZE, "PIX/pixstage-4", 4 );
    ALLOC_STATE( pix[5], texenv, PIX_STATE_SIZE, "PIX/pixstage-5", 5 );
-   if (rmesa->r200Screen->drmSupportsTriPerf) {
+   if (rmesa->radeon.radeonScreen->drmSupportsTriPerf) {
       ALLOC_STATE( prf, always, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
    }
    else {
       ALLOC_STATE( prf, never, PRF_STATE_SIZE, "PRF/performance-tri", 0 );
    }
-   if (rmesa->r200Screen->drmSupportsPointSprites) {
+   if (rmesa->radeon.radeonScreen->drmSupportsPointSprites) {
       ALLOC_STATE( spr, always, SPR_STATE_SIZE, "SPR/pointsprite", 0 );
       ALLOC_STATE( ptp, tcl, PTP_STATE_SIZE, "PTP/pointparams", 0 );
    }
@@ -409,87 +873,115 @@ void r200InitState( r200ContextPtr rmesa )
 
    /* Fill in the packet headers:
     */
-   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
-   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
-   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
-   if (rmesa->r200Screen->drmSupportsBlendColor)
-      rmesa->hw.ctx.cmd[CTX_CMD_3] = cmdpkt(R200_EMIT_RB3D_BLENDCOLOR);
-   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
-   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
-   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
-   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
-   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
-   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
-   rmesa->hw.cst.cmd[CST_CMD_0] = cmdpkt(R200_EMIT_PP_CNTL_X);
-   rmesa->hw.cst.cmd[CST_CMD_1] = cmdpkt(R200_EMIT_RB3D_DEPTHXY_OFFSET);
-   rmesa->hw.cst.cmd[CST_CMD_2] = cmdpkt(R200_EMIT_RE_AUX_SCISSOR_CNTL);
-   rmesa->hw.cst.cmd[CST_CMD_3] = cmdpkt(R200_EMIT_RE_SCISSOR_TL_0);
-   rmesa->hw.cst.cmd[CST_CMD_4] = cmdpkt(R200_EMIT_SE_VAP_CNTL_STATUS);
-   rmesa->hw.cst.cmd[CST_CMD_5] = cmdpkt(R200_EMIT_RE_POINTSIZE);
-   rmesa->hw.cst.cmd[CST_CMD_6] = cmdpkt(R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0);
-   rmesa->hw.tam.cmd[TAM_CMD_0] = cmdpkt(R200_EMIT_PP_TAM_DEBUG3);
-   rmesa->hw.tf.cmd[TF_CMD_0] = cmdpkt(R200_EMIT_TFACTOR_0);
-   if (rmesa->r200Screen->drmSupportsFragShader) {
-      rmesa->hw.atf.cmd[ATF_CMD_0] = cmdpkt(R200_EMIT_ATF_TFACTOR);
-      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_0);
-      rmesa->hw.tex[0].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
-      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_1);
-      rmesa->hw.tex[1].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
-      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_2);
-      rmesa->hw.tex[2].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
-      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_3);
-      rmesa->hw.tex[3].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
-      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_4);
-      rmesa->hw.tex[4].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
-      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCTLALL_5);
-      rmesa->hw.tex[5].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
+   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_MISC);
+   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CNTL);
+   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(rmesa, RADEON_EMIT_RB3D_COLORPITCH);
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor)
+      rmesa->hw.ctx.cmd[CTX_CMD_3] = cmdpkt(rmesa, R200_EMIT_RB3D_BLENDCOLOR);
+   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_LINE_PATTERN);
+   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_LINE_WIDTH);
+   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RB3D_STENCILREFMASK);
+   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_VPORT_XSCALE);
+   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL);
+   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_MISC);
+   rmesa->hw.cst.cmd[CST_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CNTL_X);
+   rmesa->hw.cst.cmd[CST_CMD_1] = cmdpkt(rmesa, R200_EMIT_RB3D_DEPTHXY_OFFSET);
+   rmesa->hw.cst.cmd[CST_CMD_2] = cmdpkt(rmesa, R200_EMIT_RE_AUX_SCISSOR_CNTL);
+   rmesa->hw.cst.cmd[CST_CMD_3] = cmdpkt(rmesa, R200_EMIT_RE_SCISSOR_TL_0);
+   rmesa->hw.cst.cmd[CST_CMD_4] = cmdpkt(rmesa, R200_EMIT_SE_VAP_CNTL_STATUS);
+   rmesa->hw.cst.cmd[CST_CMD_5] = cmdpkt(rmesa, R200_EMIT_RE_POINTSIZE);
+   rmesa->hw.cst.cmd[CST_CMD_6] = cmdpkt(rmesa, R200_EMIT_TCL_INPUT_VTX_VECTOR_ADDR_0);
+   rmesa->hw.tam.cmd[TAM_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TAM_DEBUG3);
+   rmesa->hw.tf.cmd[TF_CMD_0] = cmdpkt(rmesa, R200_EMIT_TFACTOR_0);
+   if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
+      rmesa->hw.atf.cmd[ATF_CMD_0] = cmdpkt(rmesa, R200_EMIT_ATF_TFACTOR);
+      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_0);
+      rmesa->hw.tex[0].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_0);
+      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_1);
+      rmesa->hw.tex[1].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_1);
+      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_2);
+      rmesa->hw.tex[2].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_2);
+      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_3);
+      rmesa->hw.tex[3].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_3);
+      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_4);
+      rmesa->hw.tex[4].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_4);
+      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCTLALL_5);
+      rmesa->hw.tex[5].cmd[TEX_CMD_1_NEWDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_5);
    } else {
-      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_0);
-      rmesa->hw.tex[0].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_0);
-      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_1);
-      rmesa->hw.tex[1].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_1);
-      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_2);
-      rmesa->hw.tex[2].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_2);
-      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_3);
-      rmesa->hw.tex[3].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_3);
-      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_4);
-      rmesa->hw.tex[4].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_4);
-      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(R200_EMIT_PP_TXFILTER_5);
-      rmesa->hw.tex[5].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(R200_EMIT_PP_TXOFFSET_5);
-   }
-   rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_0);
-   rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(R200_EMIT_PP_AFS_1);
-   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(R200_EMIT_VAP_PVS_CNTL);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_0);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_0);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_1);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_1);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_2);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_2);
-   rmesa->hw.cube[3].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_3);
-   rmesa->hw.cube[3].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_3);
-   rmesa->hw.cube[4].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_4);
-   rmesa->hw.cube[4].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_4);
-   rmesa->hw.cube[5].cmd[CUBE_CMD_0] = cmdpkt(R200_EMIT_PP_CUBIC_FACES_5);
-   rmesa->hw.cube[5].cmd[CUBE_CMD_1] = cmdpkt(R200_EMIT_PP_CUBIC_OFFSETS_5);
-   rmesa->hw.pix[0].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_0);
-   rmesa->hw.pix[1].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_1);
-   rmesa->hw.pix[2].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_2);
-   rmesa->hw.pix[3].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_3);
-   rmesa->hw.pix[4].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_4);
-   rmesa->hw.pix[5].cmd[PIX_CMD_0] = cmdpkt(R200_EMIT_PP_TXCBLEND_5);
-   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
-   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(R200_EMIT_TCL_LIGHT_MODEL_CTL_0);
-   rmesa->hw.tcl.cmd[TCL_CMD_1] = cmdpkt(R200_EMIT_TCL_UCP_VERT_BLEND_CTL);
-   rmesa->hw.tcg.cmd[TCG_CMD_0] = cmdpkt(R200_EMIT_TEX_PROC_CTL_2);
-   rmesa->hw.msl.cmd[MSL_CMD_0] = cmdpkt(R200_EMIT_MATRIX_SELECT_0);
-   rmesa->hw.vap.cmd[VAP_CMD_0] = cmdpkt(R200_EMIT_VAP_CTL);
-   rmesa->hw.vtx.cmd[VTX_CMD_0] = cmdpkt(R200_EMIT_VTX_FMT_0);
-   rmesa->hw.vtx.cmd[VTX_CMD_1] = cmdpkt(R200_EMIT_OUTPUT_VTX_COMP_SEL);
-   rmesa->hw.vtx.cmd[VTX_CMD_2] = cmdpkt(R200_EMIT_SE_VTX_STATE_CNTL);
-   rmesa->hw.vte.cmd[VTE_CMD_0] = cmdpkt(R200_EMIT_VTE_CNTL);
-   rmesa->hw.prf.cmd[PRF_CMD_0] = cmdpkt(R200_EMIT_PP_TRI_PERF_CNTL);
-   rmesa->hw.spr.cmd[SPR_CMD_0] = cmdpkt(R200_EMIT_TCL_POINT_SPRITE_CNTL);
+      rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_0);
+      rmesa->hw.tex[0].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_0);
+      rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_1);
+      rmesa->hw.tex[1].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_1);
+      rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_2);
+      rmesa->hw.tex[2].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_2);
+      rmesa->hw.tex[3].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_3);
+      rmesa->hw.tex[3].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_3);
+      rmesa->hw.tex[4].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_4);
+      rmesa->hw.tex[4].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_4);
+      rmesa->hw.tex[5].cmd[TEX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXFILTER_5);
+      rmesa->hw.tex[5].cmd[TEX_CMD_1_OLDDRM] = cmdpkt(rmesa, R200_EMIT_PP_TXOFFSET_5);
+   }
+   rmesa->hw.afs[0].cmd[AFS_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_AFS_0);
+   rmesa->hw.afs[1].cmd[AFS_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_AFS_1);
+   rmesa->hw.pvs.cmd[PVS_CMD_0] = cmdpkt(rmesa, R200_EMIT_VAP_PVS_CNTL);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_0);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_0);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_1);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_1);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_2);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_2);
+   rmesa->hw.cube[3].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_3);
+   rmesa->hw.cube[3].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_3);
+   rmesa->hw.cube[4].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_4);
+   rmesa->hw.cube[4].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_4);
+   rmesa->hw.cube[5].cmd[CUBE_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_FACES_5);
+   rmesa->hw.cube[5].cmd[CUBE_CMD_1] = cmdpkt(rmesa, R200_EMIT_PP_CUBIC_OFFSETS_5);
+   rmesa->hw.pix[0].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_0);
+   rmesa->hw.pix[1].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_1);
+   rmesa->hw.pix[2].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_2);
+   rmesa->hw.pix[3].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_3);
+   rmesa->hw.pix[4].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_4);
+   rmesa->hw.pix[5].cmd[PIX_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TXCBLEND_5);
+   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_ZBIAS_FACTOR);
+   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(rmesa, R200_EMIT_TCL_LIGHT_MODEL_CTL_0);
+   rmesa->hw.tcl.cmd[TCL_CMD_1] = cmdpkt(rmesa, R200_EMIT_TCL_UCP_VERT_BLEND_CTL);
+   rmesa->hw.tcg.cmd[TCG_CMD_0] = cmdpkt(rmesa, R200_EMIT_TEX_PROC_CTL_2);
+   rmesa->hw.msl.cmd[MSL_CMD_0] = cmdpkt(rmesa, R200_EMIT_MATRIX_SELECT_0);
+   rmesa->hw.vap.cmd[VAP_CMD_0] = cmdpkt(rmesa, R200_EMIT_VAP_CTL);
+   rmesa->hw.vtx.cmd[VTX_CMD_0] = cmdpkt(rmesa, R200_EMIT_VTX_FMT_0);
+   rmesa->hw.vtx.cmd[VTX_CMD_1] = cmdpkt(rmesa, R200_EMIT_OUTPUT_VTX_COMP_SEL);
+   rmesa->hw.vtx.cmd[VTX_CMD_2] = cmdpkt(rmesa, R200_EMIT_SE_VTX_STATE_CNTL);
+   rmesa->hw.vte.cmd[VTE_CMD_0] = cmdpkt(rmesa, R200_EMIT_VTE_CNTL);
+   rmesa->hw.prf.cmd[PRF_CMD_0] = cmdpkt(rmesa, R200_EMIT_PP_TRI_PERF_CNTL);
+   rmesa->hw.spr.cmd[SPR_CMD_0] = cmdpkt(rmesa, R200_EMIT_TCL_POINT_SPRITE_CNTL);
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+	rmesa->hw.mtl[0].emit = mtl_emit;
+	rmesa->hw.mtl[1].emit = mtl_emit;
+
+	rmesa->hw.vpi[0].emit = veclinear_emit;
+	rmesa->hw.vpi[1].emit = veclinear_emit;
+	rmesa->hw.vpp[0].emit = veclinear_emit;
+	rmesa->hw.vpp[1].emit = veclinear_emit;
+
+	rmesa->hw.grd.emit = scl_emit;
+	rmesa->hw.fog.emit = vec_emit;
+	rmesa->hw.glt.emit = vec_emit;
+	rmesa->hw.eye.emit = vec_emit;
+
+	for (i = R200_MTX_MV; i <= R200_MTX_TEX5; i++)
+	  rmesa->hw.mat[i].emit = vec_emit;
+
+	for (i = 0; i < 8; i++)
+	  rmesa->hw.lit[i].emit = lit_emit;
+
+	for (i = 0; i < 6; i++)
+	  rmesa->hw.ucp[i].emit = vec_emit;
+
+	rmesa->hw.ptp.emit = ptp_emit;
+   }
+
+
+   
    rmesa->hw.mtl[0].cmd[MTL_CMD_0] = 
       cmdvec( R200_VS_MAT_0_EMISS, 1, 16 );
    rmesa->hw.mtl[0].cmd[MTL_CMD_1] = 
@@ -567,7 +1059,7 @@ void r200InitState( r200ContextPtr rmesa )
 				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
 				(R200_BLEND_GL_ZERO << R200_DST_BLEND_SHIFT));
 
-   if (rmesa->r200Screen->drmSupportsBlendColor) {
+   if (rmesa->radeon.radeonScreen->drmSupportsBlendColor) {
       rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCOLOR] = 0x00000000;
       rmesa->hw.ctx.cmd[CTX_RB3D_ABLENDCNTL] = (R200_COMB_FCN_ADD_CLAMP |
 				(R200_BLEND_GL_ONE << R200_SRC_BLEND_SHIFT) |
@@ -578,18 +1070,17 @@ void r200InitState( r200ContextPtr rmesa )
    }
 
    rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
-      rmesa->r200Screen->depthOffset + rmesa->r200Screen->fbLocation;
+      rmesa->radeon.radeonScreen->depthOffset + rmesa->radeon.radeonScreen->fbLocation;
 
    rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
-      ((rmesa->r200Screen->depthPitch &
+      ((rmesa->radeon.radeonScreen->depthPitch &
 	R200_DEPTHPITCH_MASK) |
        R200_DEPTH_ENDIAN_NO_SWAP);
    
    if (rmesa->using_hyperz)
       rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= R200_DEPTH_HYPERZ;
 
-   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
-					       R200_Z_TEST_LESS |
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (R200_Z_TEST_LESS |
 					       R200_STENCIL_TEST_ALWAYS |
 					       R200_STENCIL_FAIL_KEEP |
 					       R200_STENCIL_ZPASS_KEEP |
@@ -599,15 +1090,14 @@ void r200InitState( r200ContextPtr rmesa )
    if (rmesa->using_hyperz) {
       rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= R200_Z_COMPRESSION_ENABLE |
 						  R200_Z_DECOMPRESSION_ENABLE;
-/*      if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200)
+/*      if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200)
 	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
    }
 
    rmesa->hw.ctx.cmd[CTX_PP_CNTL] = (R200_ANTI_ALIAS_NONE 
  				     | R200_TEX_BLEND_0_ENABLE);
 
-   rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = color_fmt;
-   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
+   switch ( driQueryOptioni( &rmesa->radeon.optionCache, "dither_mode" ) ) {
    case DRI_CONF_DITHER_XERRORDIFFRESET:
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_INIT;
       break;
@@ -615,41 +1105,19 @@ void r200InitState( r200ContextPtr rmesa )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_SCALE_DITHER_ENABLE;
       break;
    }
-   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
+   if ( driQueryOptioni( &rmesa->radeon.optionCache, "round_mode" ) ==
 	DRI_CONF_ROUND_ROUND )
-      rmesa->state.color.roundEnable = R200_ROUND_ENABLE;
+      rmesa->radeon.state.color.roundEnable = R200_ROUND_ENABLE;
    else
-      rmesa->state.color.roundEnable = 0;
-   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
+      rmesa->radeon.state.color.roundEnable = 0;
+   if ( driQueryOptioni (&rmesa->radeon.optionCache, "color_reduction" ) ==
 	DRI_CONF_COLOR_REDUCTION_DITHER )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= R200_DITHER_ENABLE;
    else
-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
-
-#if 000
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((rmesa->state.color.drawOffset +
-					       rmesa->r200Screen->fbLocation)
-					      & R200_COLOROFFSET_MASK);
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((rmesa->state.color.drawPitch &
-					      R200_COLORPITCH_MASK) |
-					     R200_COLOR_ENDIAN_NO_SWAP);
-#else
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
-					       rmesa->r200Screen->fbLocation)
-					      & R200_COLOROFFSET_MASK);
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
-					      R200_COLORPITCH_MASK) |
-					     R200_COLOR_ENDIAN_NO_SWAP);
-#endif
-   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= R200_COLOR_TILE_ENABLE;
-   }
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->radeon.state.color.roundEnable;
 
    rmesa->hw.prf.cmd[PRF_PP_TRI_PERF] = R200_TRI_CUTOFF_MASK - R200_TRI_CUTOFF_MASK * 
-			driQueryOptionf (&rmesa->optionCache,"texture_blend_quality");
+			driQueryOptionf (&rmesa->radeon.optionCache,"texture_blend_quality");
    rmesa->hw.prf.cmd[PRF_PP_PERF_CNTL] = 0;
 
    rmesa->hw.set.cmd[SET_SE_CNTL] = (R200_FFACE_CULL_CCW |
@@ -704,7 +1172,7 @@ void r200InitState( r200ContextPtr rmesa )
 						R200_VC_NO_SWAP;
 #endif
 
-   if (!(rmesa->r200Screen->chip_flags & RADEON_CHIPSET_TCL)) {
+   if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
       /* Bypass TCL */
       rmesa->hw.cst.cmd[CST_SE_VAP_CNTL_STATUS] |= (1<<8);
    }
@@ -743,28 +1211,28 @@ void r200InitState( r200ContextPtr rmesa )
       rmesa->hw.tex[i].cmd[TEX_PP_TXFORMAT_X] =
          (/* R200_TEXCOORD_PROJ | */
           0x100000);	/* Small default bias */
-      if (rmesa->r200Screen->drmSupportsFragShader) {
+      if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
 	 rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_NEWDRM] =
-	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	     rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
 	 rmesa->hw.tex[i].cmd[TEX_PP_CUBIC_FACES] = 0;
 	 rmesa->hw.tex[i].cmd[TEX_PP_TXMULTI_CTL] = 0;
       }
       else {
 	  rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET_OLDDRM] =
-	     rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	     rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
      }
 
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F1] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F2] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F3] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F4] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_F5] =
-         rmesa->r200Screen->texOffset[RADEON_LOCAL_TEX_HEAP];
+         rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
 
       rmesa->hw.pix[i].cmd[PIX_PP_TXCBLEND] =
          (R200_TXC_ARG_A_ZERO |
@@ -967,5 +1435,7 @@ void r200InitState( r200ContextPtr rmesa )
 
    r200LightingSpaceChange( ctx );
 
-   rmesa->hw.all_dirty = GL_TRUE;
+   rmesa->radeon.hw.all_dirty = GL_TRUE;
+
+   rcommonInitCmdBuf(&rmesa->radeon);
 }
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.c b/src/mesa/drivers/dri/r200/r200_swtcl.c
index b25f0282445..83e70b586d7 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.c
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.c
@@ -55,27 +55,24 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_tcl.h"
 
 
-static void flush_last_swtcl_prim( r200ContextPtr rmesa  );
-
-
 /***********************************************************************
- *                         Initialization 
+ *                         Initialization
  ***********************************************************************/
 
 #define EMIT_ATTR( ATTR, STYLE, F0 )					\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
    fmt_0 |= F0;								\
 } while (0)
 
 #define EMIT_PAD( N )							\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
 static void r200SetVertexFormat( GLcontext *ctx )
@@ -100,7 +97,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    }
 
    assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-   rmesa->swtcl.vertex_attr_count = 0;
+   rmesa->radeon.swtcl.vertex_attr_count = 0;
 
    /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
     * build up a hardware vertex.
@@ -121,7 +118,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    }
 
    rmesa->swtcl.coloroffset = offset;
-#if MESA_LITTLE_ENDIAN 
+#if MESA_LITTLE_ENDIAN
    EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
 #else
    EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR, (R200_VTX_PK_RGBA << R200_VTX_COLOR_0_SHIFT) );
@@ -132,7 +129,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
    if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 ) ||
        RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_FOG )) {
 
-#if MESA_LITTLE_ENDIAN 
+#if MESA_LITTLE_ENDIAN
       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
 	 rmesa->swtcl.specoffset = offset;
 	 EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_3UB_3F_RGB, (R200_VTX_PK_RGBA << R200_VTX_COLOR_1_SHIFT) );
@@ -185,7 +182,7 @@ static void r200SetVertexFormat( GLcontext *ctx )
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] |= R200_FOG_USE_SPEC_ALPHA;
    }
 
-   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
+   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
 	(rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0) ||
 	(rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
       R200_NEWPRIM(rmesa);
@@ -193,26 +190,20 @@ static void r200SetVertexFormat( GLcontext *ctx )
       rmesa->hw.vtx.cmd[VTX_VTXFMT_0] = fmt_0;
       rmesa->hw.vtx.cmd[VTX_VTXFMT_1] = fmt_1;
 
-      rmesa->swtcl.vertex_size =
+      rmesa->radeon.swtcl.vertex_size =
 	  _tnl_install_attrs( ctx,
-			      rmesa->swtcl.vertex_attrs, 
-			      rmesa->swtcl.vertex_attr_count,
+			      rmesa->radeon.swtcl.vertex_attrs,
+			      rmesa->radeon.swtcl.vertex_attr_count,
 			      NULL, 0 );
-      rmesa->swtcl.vertex_size /= 4;
-      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
+      rmesa->radeon.swtcl.vertex_size /= 4;
+      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
    }
 }
 
 
 static void r200RenderStart( GLcontext *ctx )
 {
-   r200ContextPtr rmesa = R200_CONTEXT( ctx );
-
    r200SetVertexFormat( ctx );
-
-   if (rmesa->dma.flush != 0 && 
-       rmesa->dma.flush != flush_last_swtcl_prim)
-      rmesa->dma.flush( rmesa );
 }
 
 
@@ -232,7 +223,7 @@ void r200ChooseVertexState( GLcontext *ctx )
     * rasterization fallback.  As this function will be called again when we
     * leave a rasterization fallback, we can just skip it for now.
     */
-   if (rmesa->Fallback != 0)
+   if (rmesa->radeon.Fallback != 0)
       return;
 
    vte = rmesa->hw.vte.cmd[VTE_SE_VTE_CNTL];
@@ -273,78 +264,27 @@ void r200ChooseVertexState( GLcontext *ctx )
    }
 }
 
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( r200ContextPtr rmesa  )
+void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 {
-   if (R200_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   rmesa->dma.flush = NULL;
-
-   if (rmesa->dma.current.buf) {
-      struct r200_dma_region *current = &rmesa->dma.current;
-      GLuint current_offset = (rmesa->r200Screen->gart_buffer_offset +
-			       current->buf->buf->idx * RADEON_BUFFER_SIZE + 
-			       current->start);
-
-      assert (!(rmesa->swtcl.hw_primitive & R200_VF_PRIM_WALK_IND));
-
-      assert (current->start + 
-	      rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	      current->ptr);
-
-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-	 r200EnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
-			        rmesa->hw.max_state_size + VBUF_BUFSZ );
-	 r200EmitVertexAOS( rmesa,
-			      rmesa->swtcl.vertex_size,
-			      current_offset);
-
-	 r200EmitVbufPrim( rmesa,
-			   rmesa->swtcl.hw_primitive,
-			   rmesa->swtcl.numverts);
-      }
-
-      rmesa->swtcl.numverts = 0;
-      current->start = current->ptr;
-   }
-}
-
-
-/* Alloc space in the current dma region.
- */
-static INLINE void *
-r200AllocDmaLowVerts( r200ContextPtr rmesa, int nverts, int vsize )
-{
-   GLuint bytes = vsize * nverts;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      r200RefillCurrentDmaRegion( rmesa );
+   r200ContextPtr rmesa = R200_CONTEXT(ctx);
+   rcommonEnsureCmdBufSpace(&rmesa->radeon,
+			    rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
+			    __FUNCTION__);
 
-   if (!rmesa->dma.flush) {
-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-      rmesa->dma.flush = flush_last_swtcl_prim;
-   }
 
-   ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
-   ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
-   ASSERT( rmesa->dma.current.start + 
-	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	   rmesa->dma.current.ptr );
+   radeonEmitState(&rmesa->radeon);
+   r200EmitVertexAOS( rmesa,
+		      rmesa->radeon.swtcl.vertex_size,
+		      rmesa->radeon.dma.current,
+		      current_offset);
 
 
-   {
-      GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
-      rmesa->dma.current.ptr += bytes;
-      rmesa->swtcl.numverts += nverts;
-      return head;
-   }
+   r200EmitVbufPrim( rmesa,
+		     rmesa->radeon.swtcl.hw_primitive,
+		     rmesa->radeon.swtcl.numverts);
 
 }
 
-
 /**************************************************************************/
 
 
@@ -392,13 +332,13 @@ static void r200ResetLineStipple( GLcontext *ctx );
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
 #define CTX_ARG r200ContextPtr rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) r200AllocDmaLowVerts( rmesa, n, size * 4 )
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
 #define LOCAL_VARS						\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   const char *r200verts = (char *)rmesa->swtcl.verts;
-#define VERT(x) (r200Vertex *)(r200verts + ((x) * vertsize * sizeof(int)))
-#define VERTEX r200Vertex 
+   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;
+#define VERT(x) (radeonVertex *)(r200verts + ((x) * vertsize * sizeof(int)))
+#define VERTEX radeonVertex
 #define DO_DEBUG_VERTS (1 && (R200_DEBUG & DEBUG_VERTS))
 
 #undef TAG
@@ -456,11 +396,11 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
 
 #define VERT_SET_RGBA( v, c )  					\
 do {								\
-   r200_color_t *color = (r200_color_t *)&((v)->ui[coloroffset]);	\
+   radeon_color_t *color = (radeon_color_t *)&((v)->ui[coloroffset]);	\
    UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]);		\
    UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]);		\
    UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]);		\
@@ -472,7 +412,7 @@ do {								\
 #define VERT_SET_SPEC( v, c )					\
 do {								\
    if (specoffset) {						\
-      r200_color_t *spec = (r200_color_t *)&((v)->ui[specoffset]);	\
+      radeon_color_t *spec = (radeon_color_t *)&((v)->ui[specoffset]);	\
       UNCLAMPED_FLOAT_TO_UBYTE(spec->red, (c)[0]);	\
       UNCLAMPED_FLOAT_TO_UBYTE(spec->green, (c)[1]);	\
       UNCLAMPED_FLOAT_TO_UBYTE(spec->blue, (c)[2]);	\
@@ -481,8 +421,8 @@ do {								\
 #define VERT_COPY_SPEC( v0, v1 )			\
 do {							\
    if (specoffset) {					\
-      r200_color_t *spec0 = (r200_color_t *)&((v0)->ui[specoffset]);	\
-      r200_color_t *spec1 = (r200_color_t *)&((v1)->ui[specoffset]);	\
+      radeon_color_t *spec0 = (radeon_color_t *)&((v0)->ui[specoffset]);	\
+      radeon_color_t *spec1 = (radeon_color_t *)&((v1)->ui[specoffset]);	\
       spec0->red   = spec1->red;	\
       spec0->green = spec1->green;	\
       spec0->blue  = spec1->blue; 	\
@@ -513,7 +453,7 @@ do {							\
  ***********************************************************************/
 
 #define RASTERIZE(x) r200RasterPrimitive( ctx, reduced_hw_prim(ctx, x) )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -569,8 +509,8 @@ static void init_rast_tab( void )
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
    r200ContextPtr rmesa = R200_CONTEXT(ctx);		\
-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
-   const char *r200verts = (char *)rmesa->swtcl.verts;		\
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
+   const char *r200verts = (char *)rmesa->radeon.swtcl.verts;		\
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
    const GLboolean stipple = ctx->Line.StippleFlag;		\
    (void) elt; (void) stipple;
@@ -599,13 +539,13 @@ void r200ChooseRenderState( GLcontext *ctx )
    GLuint index = 0;
    GLuint flags = ctx->_TriangleCaps;
 
-   if (!rmesa->TclFallback || rmesa->Fallback) 
+   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback)
       return;
 
    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R200_TWOSIDE_BIT;
    if (flags & DD_TRI_UNFILLED)      index |= R200_UNFILLED_BIT;
 
-   if (index != rmesa->swtcl.RenderIndex) {
+   if (index != rmesa->radeon.swtcl.RenderIndex) {
       tnl->Driver.Render.Points = rast_tab[index].points;
       tnl->Driver.Render.Line = rast_tab[index].line;
       tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -622,7 +562,7 @@ void r200ChooseRenderState( GLcontext *ctx )
 	 tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
       }
 
-      rmesa->swtcl.RenderIndex = index;
+      rmesa->radeon.swtcl.RenderIndex = index;
    }
 }
 
@@ -636,7 +576,7 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
 
-   if (rmesa->swtcl.hw_primitive != hwprim) {
+   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
       /* need to disable perspective-correct texturing for point sprites */
       if ((hwprim & 0xf) == R200_VF_PRIM_POINT_SPRITES && ctx->Point.PointSprite) {
 	 if (rmesa->hw.set.cmd[SET_RE_CNTL] & R200_PERSPECTIVE_ENABLE) {
@@ -649,15 +589,15 @@ static void r200RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 	 rmesa->hw.set.cmd[SET_RE_CNTL] |= R200_PERSPECTIVE_ENABLE;
       }
       R200_NEWPRIM( rmesa );
-      rmesa->swtcl.hw_primitive = hwprim;
+      rmesa->radeon.swtcl.hw_primitive = hwprim;
    }
 }
 
 static void r200RenderPrimitive( GLcontext *ctx, GLenum prim )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   rmesa->swtcl.render_primitive = prim;
-   if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
+   rmesa->radeon.swtcl.render_primitive = prim;
+   if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED))
       r200RasterPrimitive( ctx, reduced_hw_prim(ctx, prim) );
 }
 
@@ -701,15 +641,15 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->Fallback;
+   GLuint oldfallback = rmesa->radeon.Fallback;
 
    if (mode) {
-      rmesa->Fallback |= bit;
+      rmesa->radeon.Fallback |= bit;
       if (oldfallback == 0) {
-	 R200_FIREVERTICES( rmesa );
+	 radeon_firevertices(&rmesa->radeon);
 	 TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_TRUE );
 	 _swsetup_Wakeup( ctx );
-	 rmesa->swtcl.RenderIndex = ~0;
+	 rmesa->radeon.swtcl.RenderIndex = ~0;
          if (R200_DEBUG & DEBUG_FALLBACKS) {
             fprintf(stderr, "R200 begin rasterization fallback: 0x%x %s\n",
                     bit, getFallbackString(bit));
@@ -717,7 +657,7 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
       }
    }
    else {
-      rmesa->Fallback &= ~bit;
+      rmesa->radeon.Fallback &= ~bit;
       if (oldfallback == bit) {
 
 	 _swrast_flush( ctx );
@@ -731,14 +671,14 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 	 tnl->Driver.Render.ResetLineStipple = r200ResetLineStipple;
 	 TCL_FALLBACK( ctx, R200_TCL_FALLBACK_RASTER, GL_FALSE );
-	 if (rmesa->TclFallback) {
-	    /* These are already done if rmesa->TclFallback goes to
+	 if (rmesa->radeon.TclFallback) {
+	    /* These are already done if rmesa->radeon.TclFallback goes to
 	     * zero above. But not if it doesn't (R200_NO_TCL for
 	     * example?)
 	     */
 	    _tnl_invalidate_vertex_state( ctx, ~0 );
 	    _tnl_invalidate_vertices( ctx, ~0 );
-	    RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
+	    RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
 	    r200ChooseVertexState( ctx );
 	    r200ChooseRenderState( ctx );
 	 }
@@ -755,7 +695,7 @@ void r200Fallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 /**
  * Cope with depth operations by drawing individual pixels as points.
- * 
+ *
  * \todo
  * The way the vertex state is set in this routine is hokey.  It seems to
  * work, but it's very hackish.  This whole routine is pretty hackish.  If
@@ -770,14 +710,14 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 		  const GLubyte *bitmap )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   const GLfloat *rc = ctx->Current.RasterColor; 
+   const GLfloat *rc = ctx->Current.RasterColor;
    GLint row, col;
-   r200Vertex vert;
+   radeonVertex vert;
    GLuint orig_vte;
    GLuint h;
 
 
-   /* Turn off tcl.  
+   /* Turn off tcl.
     */
    TCL_FALLBACK( ctx, R200_TCL_FALLBACK_BITMAP, 1 );
 
@@ -794,7 +734,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
       vte |= R200_VTX_W0_FMT;
       vap &= ~R200_VAP_FORCE_W_TO_ONE;
 
-      rmesa->swtcl.vertex_size = 5;
+      rmesa->radeon.swtcl.vertex_size = 5;
 
       if ( (rmesa->hw.vtx.cmd[VTX_VTXFMT_0] != fmt_0)
 	   || (rmesa->hw.vtx.cmd[VTX_VTXFMT_1] != fmt_1) ) {
@@ -828,7 +768,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 					   R200_VPORT_Z_SCALE_ENA |
 					   R200_VPORT_X_OFFSET_ENA |
 					   R200_VPORT_Y_OFFSET_ENA |
-					   R200_VPORT_Z_OFFSET_ENA); 
+					   R200_VPORT_Z_OFFSET_ENA);
 
    /* Turn off other stuff:  Stipple?, texture?, blending?, etc.
     */
@@ -871,16 +811,16 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 
    /* Update window height
     */
-   LOCK_HARDWARE( rmesa );
-   UNLOCK_HARDWARE( rmesa );
-   h = rmesa->dri.drawable->h + rmesa->dri.drawable->y;
-   px += rmesa->dri.drawable->x;
+   LOCK_HARDWARE( &rmesa->radeon );
+   UNLOCK_HARDWARE( &rmesa->radeon );
+   h = radeon_get_drawable(&rmesa->radeon)->h + radeon_get_drawable(&rmesa->radeon)->y;
+   px += radeon_get_drawable(&rmesa->radeon)->x;
 
    /* Clipping handled by existing mechansims in r200_ioctl.c?
     */
    for (row=0; row<height; row++) {
-      const GLubyte *src = (const GLubyte *) 
-	 _mesa_image_address2d(unpack, bitmap, width, height, 
+      const GLubyte *src = (const GLubyte *)
+	 _mesa_image_address2d(unpack, bitmap, width, height,
                                GL_COLOR_INDEX, GL_BITMAP, row, 0 );
 
       if (unpack->LsbFirst) {
@@ -929,7 +869,7 @@ r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 
    /* Need to restore vertexformat?
     */
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       r200ChooseVertexState( ctx );
 }
 
@@ -959,20 +899,12 @@ void r200InitSwtcl( GLcontext *ctx )
    tnl->Driver.Render.Interp = _tnl_interp;
 
    /* FIXME: what are these numbers? */
-   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
+   _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
 		       36 * sizeof(GLfloat) );
-   
-   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
-   rmesa->swtcl.hw_primitive = 0;
-}
-
 
-void r200DestroySwtcl( GLcontext *ctx )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (rmesa->swtcl.indexed_verts.buf) 
-      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, __FUNCTION__ );
+   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->radeon.swtcl.hw_primitive = 0;
 }
+
diff --git a/src/mesa/drivers/dri/r200/r200_swtcl.h b/src/mesa/drivers/dri/r200/r200_swtcl.h
index 8c29fd0c999..b0905879d7a 100644
--- a/src/mesa/drivers/dri/r200/r200_swtcl.h
+++ b/src/mesa/drivers/dri/r200/r200_swtcl.h
@@ -39,7 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r200_context.h"
 
 extern void r200InitSwtcl( GLcontext *ctx );
-extern void r200DestroySwtcl( GLcontext *ctx );
 
 extern void r200ChooseRenderState( GLcontext *ctx );
 extern void r200ChooseVertexState( GLcontext *ctx );
@@ -52,15 +51,11 @@ extern void r200BuildVertices( GLcontext *ctx, GLuint start, GLuint count,
 extern void r200PrintSetupFlags(char *msg, GLuint flags );
 
 
-extern void r200_emit_indexed_verts( GLcontext *ctx,
-				       GLuint start,
-				       GLuint count );
-
 extern void r200_translate_vertex( GLcontext *ctx, 
-				     const r200Vertex *src, 
+				     const radeonVertex *src, 
 				     SWvertex *dst );
 
-extern void r200_print_vertex( GLcontext *ctx, const r200Vertex *v );
+extern void r200_print_vertex( GLcontext *ctx, const radeonVertex *v );
 
 extern void r200_import_float_colors( GLcontext *ctx );
 extern void r200_import_float_spec_colors( GLcontext *ctx );
@@ -70,5 +65,5 @@ extern void r200PointsBitmap( GLcontext *ctx, GLint px, GLint py,
 			      const struct gl_pixelstore_attrib *unpack,
 			      const GLubyte *bitmap );
 
-
+void r200_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
 #endif
diff --git a/src/mesa/drivers/dri/r200/r200_tcl.c b/src/mesa/drivers/dri/r200/r200_tcl.c
index 99aecfe1e90..580370933ee 100644
--- a/src/mesa/drivers/dri/r200/r200_tcl.c
+++ b/src/mesa/drivers/dri/r200/r200_tcl.c
@@ -123,7 +123,7 @@ static GLboolean discrete_prim[0x10] = {
 
 #define RESET_STIPPLE() do {			\
    R200_STATECHANGE( rmesa, lin );		\
-   r200EmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);			\
 } while (0)
 
 #define AUTO_STIPPLE( mode )  do {		\
@@ -134,7 +134,7 @@ static GLboolean discrete_prim[0x10] = {
    else						\
       rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
 	 ~R200_LINE_PATTERN_AUTO_RESET;	\
-   r200EmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);			\
 } while (0)
 
 
@@ -142,26 +142,24 @@ static GLboolean discrete_prim[0x10] = {
 
 static GLushort *r200AllocElts( r200ContextPtr rmesa, GLuint nr ) 
 {
-   if (rmesa->dma.flush == r200FlushElts &&
-       rmesa->store.cmd_used + nr*2 < R200_CMD_BUF_SZ) {
+   if (rmesa->radeon.dma.flush == r200FlushElts &&
+       rmesa->tcl.elt_used + nr*2 < R200_ELT_BUF_SZ) {
 
-      GLushort *dest = (GLushort *)(rmesa->store.cmd_buf +
-				    rmesa->store.cmd_used);
+      GLushort *dest = (GLushort *)(rmesa->radeon.tcl.elt_dma_bo->ptr +
+				    rmesa->tcl.elt_used);
 
-      rmesa->store.cmd_used += nr*2;
+      rmesa->tcl.elt_used += nr*2;
 
       return dest;
    }
    else {
-      if (rmesa->dma.flush)
-	 rmesa->dma.flush( rmesa );
+      if (rmesa->radeon.dma.flush)
+	 rmesa->radeon.dma.flush( rmesa->radeon.glCtx );
 
-      r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			     rmesa->hw.max_state_size + ELTS_BUFSZ(nr) );
+      rcommonEnsureCmdBufSpace(&rmesa->radeon, AOS_BUFSZ(rmesa->radeon.tcl.aos_count), __FUNCTION__);
 
       r200EmitAOS( rmesa,
-		   rmesa->tcl.aos_components,
-		   rmesa->tcl.nr_aos_components, 0 );
+		   rmesa->radeon.tcl.aos_count, 0 );
 
       return r200AllocEltsOpenEnded( rmesa, rmesa->tcl.hw_primitive, nr );
    }
@@ -188,13 +186,14 @@ static void r200EmitPrim( GLcontext *ctx,
    r200ContextPtr rmesa = R200_CONTEXT( ctx );
    r200TclPrimitive( ctx, prim, hwprim );
    
-   r200EnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			  rmesa->hw.max_state_size + VBUF_BUFSZ );
+   //   fprintf(stderr,"Emit prim %d\n", rmesa->radeon.tcl.aos_count);
+   rcommonEnsureCmdBufSpace( &rmesa->radeon,
+			     AOS_BUFSZ(rmesa->radeon.tcl.aos_count) +
+			     rmesa->radeon.hw.max_state_size + VBUF_BUFSZ, __FUNCTION__ );
 
    r200EmitAOS( rmesa,
-		  rmesa->tcl.aos_components,
-		  rmesa->tcl.nr_aos_components,
-		  start );
+		rmesa->radeon.tcl.aos_count,
+		start );
    
    /* Why couldn't this packet have taken an offset param?
     */
@@ -394,7 +393,7 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
 
    /* TODO: separate this from the swtnl pipeline 
     */
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       return GL_TRUE;	/* fallback to software t&l */
 
    if (R200_DEBUG & DEBUG_PRIMS)
@@ -405,8 +404,9 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
 
    /* Validate state:
     */
-   if (rmesa->NewGLState)
-      r200ValidateState( ctx );
+   if (rmesa->radeon.NewGLState)
+      if (!r200ValidateState( ctx ))
+         return GL_TRUE; /* fallback to sw t&l */
 
    if (!ctx->VertexProgram._Enabled) {
    /* NOTE: inputs != tnl->render_inputs - these are the untransformed
@@ -481,7 +481,7 @@ static GLboolean r200_run_tcl_render( GLcontext *ctx,
 
    /* Do the actual work:
     */
-   r200ReleaseArrays( ctx, ~0 /* stage->changed_inputs */ );
+   radeonReleaseArrays( ctx, ~0 /* stage->changed_inputs */ );
    r200EmitArrays( ctx, vimap_rev );
 
    rmesa->tcl.Elts = VB->Elts;
@@ -545,7 +545,7 @@ static void transition_to_swtnl( GLcontext *ctx )
    tnl->Driver.NotifyMaterialChange = 
       _mesa_validate_all_lighting_tables;
 
-   r200ReleaseArrays( ctx, ~0 );
+   radeonReleaseArrays( ctx, ~0 );
 
    /* Still using the D3D based hardware-rasterizer from the radeon;
     * need to put the card into D3D mode to make it work:
@@ -565,15 +565,11 @@ static void transition_to_hwtnl( GLcontext *ctx )
 
    tnl->Driver.NotifyMaterialChange = r200UpdateMaterial;
 
-   if ( rmesa->dma.flush )			
-      rmesa->dma.flush( rmesa );	
+   if ( rmesa->radeon.dma.flush )			
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	
 
-   rmesa->dma.flush = NULL;
+   rmesa->radeon.dma.flush = NULL;
    
-   if (rmesa->swtcl.indexed_verts.buf) 
-      r200ReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
-			      __FUNCTION__ );
-
    R200_STATECHANGE( rmesa, vap );
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] |= R200_VAP_TCL_ENABLE;
    rmesa->hw.vap.cmd[VAP_SE_VAP_CNTL] &= ~R200_VAP_FORCE_W_TO_ONE;
@@ -631,10 +627,10 @@ static char *getFallbackString(GLuint bit)
 void r200TclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->TclFallback;
+   GLuint oldfallback = rmesa->radeon.TclFallback;
 
    if (mode) {
-      rmesa->TclFallback |= bit;
+      rmesa->radeon.TclFallback |= bit;
       if (oldfallback == 0) {
 	 if (R200_DEBUG & DEBUG_FALLBACKS) 
 	    fprintf(stderr, "R200 begin tcl fallback %s\n",
@@ -643,7 +639,7 @@ void r200TclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
       }
    }
    else {
-      rmesa->TclFallback &= ~bit;
+      rmesa->radeon.TclFallback &= ~bit;
       if (oldfallback == bit) {
 	 if (R200_DEBUG & DEBUG_FALLBACKS) 
 	    fprintf(stderr, "R200 end tcl fallback %s\n",
diff --git a/src/mesa/drivers/dri/r200/r200_tex.c b/src/mesa/drivers/dri/r200/r200_tex.c
index 259f35a34c3..9f791579158 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.c
+++ b/src/mesa/drivers/dri/r200/r200_tex.c
@@ -43,8 +43,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/teximage.h"
 #include "main/texobj.h"
 
-#include "texmem.h"
-
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -63,10 +62,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \param twrap Wrap mode for the \a t texture coordinate
  */
 
-static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum rwrap )
+static void r200SetTexWrap( radeonTexObjPtr t, GLenum swrap, GLenum twrap, GLenum rwrap )
 {
    GLboolean  is_clamp = GL_FALSE;
    GLboolean  is_clamp_to_border = GL_FALSE;
+   struct gl_texture_object *tObj = &t->base;
 
    t->pp_txfilter &= ~(R200_CLAMP_S_MASK | R200_CLAMP_T_MASK | R200_BORDER_MODE_D3D);
 
@@ -103,7 +103,7 @@ static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum
       _mesa_problem(NULL, "bad S wrap mode in %s", __FUNCTION__);
    }
 
-   if (t->base.tObj->Target != GL_TEXTURE_1D) {
+   if (tObj->Target != GL_TEXTURE_1D) {
       switch ( twrap ) {
       case GL_REPEAT:
          t->pp_txfilter |= R200_CLAMP_T_WRAP;
@@ -180,7 +180,7 @@ static void r200SetTexWrap( r200TexObjPtr t, GLenum swrap, GLenum twrap, GLenum
    t->border_fallback = (is_clamp && is_clamp_to_border);
 }
 
-static void r200SetTexMaxAnisotropy( r200TexObjPtr t, GLfloat max )
+static void r200SetTexMaxAnisotropy( radeonTexObjPtr t, GLfloat max )
 {
    t->pp_txfilter &= ~R200_MAX_ANISO_MASK;
 
@@ -205,10 +205,13 @@ static void r200SetTexMaxAnisotropy( r200TexObjPtr t, GLfloat max )
  * \param magf Texture magnification mode
  */
 
-static void r200SetTexFilter( r200TexObjPtr t, GLenum minf, GLenum magf )
+static void r200SetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
 {
    GLuint anisotropy = (t->pp_txfilter & R200_MAX_ANISO_MASK);
 
+   /* Force revalidation to account for switches from/to mipmapping. */
+   t->validated = GL_FALSE;
+
    t->pp_txfilter &= ~(R200_MIN_FILTER_MASK | R200_MAG_FILTER_MASK);
    t->pp_txformat_x &= ~R200_VOLUME_FILTER_MASK;
 
@@ -267,701 +270,16 @@ static void r200SetTexFilter( r200TexObjPtr t, GLenum minf, GLenum magf )
    }
 }
 
-static void r200SetTexBorderColor( r200TexObjPtr t, const GLfloat color[4] )
+static void r200SetTexBorderColor( radeonTexObjPtr t, const GLfloat color[4] )
 {
    GLubyte c[4];
    CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
    CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
    CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
    CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
-   t->pp_border_color = r200PackColor( 4, c[0], c[1], c[2], c[3] );
-}
-
-
-/**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static r200TexObjPtr r200AllocTexObj( struct gl_texture_object *texObj )
-{
-   r200TexObjPtr t;
-
-   t = CALLOC_STRUCT( r200_tex_obj );
-   texObj->DriverData = t;
-   if ( t != NULL ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE ) {
-	 fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj, 
-		  (void *)t );
-      }
-
-      /* Initialize non-image-dependent parts of the state:
-       */
-      t->base.tObj = texObj;
-      t->border_fallback = GL_FALSE;
-
-      make_empty_list( & t->base );
-
-      r200SetTexWrap( t, texObj->WrapS, texObj->WrapT, texObj->WrapR );
-      r200SetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
-      r200SetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-      r200SetTexBorderColor( t, texObj->BorderColor );
-   }
-
-   return t;
-}
-
-/* try to find a format which will only need a memcopy */
-static const struct gl_texture_format *
-r200Choose8888TexFormat( GLenum srcFormat, GLenum srcType )
-{
-   const GLuint ui = 1;
-   const GLubyte littleEndian = *((const GLubyte *) &ui);
-
-   if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-       (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
-      return &_mesa_texformat_rgba8888;
-   }
-   else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-       (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-       (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
-      return &_mesa_texformat_rgba8888_rev;
-   }
-   else return _dri_texformat_argb8888;
-}
-
-static const struct gl_texture_format *
-r200ChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
-                           GLenum format, GLenum type )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   const GLboolean do32bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
-   const GLboolean force16bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
-   (void) format;
-
-   switch ( internalFormat ) {
-   case 4:
-   case GL_RGBA:
-   case GL_COMPRESSED_RGBA:
-      switch ( type ) {
-      case GL_UNSIGNED_INT_10_10_10_2:
-      case GL_UNSIGNED_INT_2_10_10_10_REV:
-	 return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      default:
-         return do32bpt ?
-	    r200Choose8888TexFormat(format, type) : _dri_texformat_argb4444;
-      }
-
-   case 3:
-   case GL_RGB:
-   case GL_COMPRESSED_RGB:
-      switch ( type ) {
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_5_6_5:
-      case GL_UNSIGNED_SHORT_5_6_5_REV:
-	 return _dri_texformat_rgb565;
-      default:
-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-      }
-
-   case GL_RGBA8:
-   case GL_RGB10_A2:
-   case GL_RGBA12:
-   case GL_RGBA16:
-      return !force16bpt ?
-	  r200Choose8888TexFormat(format, type) : _dri_texformat_argb4444;
-
-   case GL_RGBA4:
-   case GL_RGBA2:
-      return _dri_texformat_argb4444;
-
-   case GL_RGB5_A1:
-      return _dri_texformat_argb1555;
-
-   case GL_RGB8:
-   case GL_RGB10:
-   case GL_RGB12:
-   case GL_RGB16:
-      return !force16bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-
-   case GL_RGB5:
-   case GL_RGB4:
-   case GL_R3_G3_B2:
-      return _dri_texformat_rgb565;
-
-   case GL_ALPHA:
-   case GL_ALPHA4:
-   case GL_ALPHA8:
-   case GL_ALPHA12:
-   case GL_ALPHA16:
-   case GL_COMPRESSED_ALPHA:
-   /* can't use a8 format since interpreting hw I8 as a8 would result
-      in wrong rgb values (same as alpha value instead of 0). */
-      return _dri_texformat_al88;
-
-   case 1:
-   case GL_LUMINANCE:
-   case GL_LUMINANCE4:
-   case GL_LUMINANCE8:
-   case GL_LUMINANCE12:
-   case GL_LUMINANCE16:
-   case GL_COMPRESSED_LUMINANCE:
-      return _dri_texformat_l8;
-
-   case 2:
-   case GL_LUMINANCE_ALPHA:
-   case GL_LUMINANCE4_ALPHA4:
-   case GL_LUMINANCE6_ALPHA2:
-   case GL_LUMINANCE8_ALPHA8:
-   case GL_LUMINANCE12_ALPHA4:
-   case GL_LUMINANCE12_ALPHA12:
-   case GL_LUMINANCE16_ALPHA16:
-   case GL_COMPRESSED_LUMINANCE_ALPHA:
-      return _dri_texformat_al88;
-
-   case GL_INTENSITY:
-   case GL_INTENSITY4:
-   case GL_INTENSITY8:
-   case GL_INTENSITY12:
-   case GL_INTENSITY16:
-   case GL_COMPRESSED_INTENSITY:
-       return _dri_texformat_i8;
-
-   case GL_YCBCR_MESA:
-      if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-          type == GL_UNSIGNED_BYTE)
-         return &_mesa_texformat_ycbcr;
-      else
-         return &_mesa_texformat_ycbcr_rev;
-
-   case GL_RGB_S3TC:
-   case GL_RGB4_S3TC:
-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgb_dxt1;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgba_dxt1;
-
-   case GL_RGBA_S3TC:
-   case GL_RGBA4_S3TC:
-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-      return &_mesa_texformat_rgba_dxt3;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-      return &_mesa_texformat_rgba_dxt5;
-
-   default:
-      _mesa_problem(ctx,
-         "unexpected internalFormat 0x%x in r200ChooseTextureFormat",
-         (int) internalFormat);
-      return NULL;
-   }
-
-   return NULL; /* never get here */
-}
-
-
-static GLboolean
-r200ValidateClientStorage( GLcontext *ctx, GLenum target,
-			   GLint internalFormat,
-			   GLint srcWidth, GLint srcHeight, 
-                           GLenum format, GLenum type,  const void *pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if ( R200_DEBUG & DEBUG_TEXTURE )
-      fprintf(stderr, "intformat %s format %s type %s\n",
-	      _mesa_lookup_enum_by_nr( internalFormat ),
-	      _mesa_lookup_enum_by_nr( format ),
-	      _mesa_lookup_enum_by_nr( type ));
-
-   if (!ctx->Unpack.ClientStorage)
-      return 0;
-
-   if (ctx->_ImageTransferState ||
-       texImage->IsCompressed ||
-       texObj->GenerateMipmap)
-      return 0;
-
-
-   /* This list is incomplete, may be different on ppc???
-    */
-   switch ( internalFormat ) {
-   case GL_RGBA:
-      if ( format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV ) {
-	 texImage->TexFormat = _dri_texformat_argb8888;
-      }
-      else
-	 return 0;
-      break;
-
-   case GL_RGB:
-      if ( format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5 ) {
-	 texImage->TexFormat = _dri_texformat_rgb565;
-      }
-      else
-	 return 0;
-      break;
-
-   case GL_YCBCR_MESA:
-      if ( format == GL_YCBCR_MESA && 
-	   type == GL_UNSIGNED_SHORT_8_8_REV_APPLE ) {
-	 texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
-      }
-      else if ( format == GL_YCBCR_MESA && 
-		(type == GL_UNSIGNED_SHORT_8_8_APPLE || 
-		 type == GL_UNSIGNED_BYTE)) {
-	 texImage->TexFormat = &_mesa_texformat_ycbcr;
-      }
-      else
-	 return 0;
-      break;
-
-   default:
-      return 0;
-   }
-
-   /* Could deal with these packing issues, but currently don't:
-    */
-   if (packing->SkipPixels || 
-       packing->SkipRows || 
-       packing->SwapBytes ||
-       packing->LsbFirst) {
-      return 0;
-   }
-
-   {      
-      GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
-						  format, type);
-
-      
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf(stderr, "%s: srcRowStride %d/%x\n", 
-		 __FUNCTION__, srcRowStride, srcRowStride);
-
-      /* Could check this later in upload, pitch restrictions could be
-       * relaxed, but would need to store the image pitch somewhere,
-       * as packing details might change before image is uploaded:
-       */
-      if (!r200IsGartMemory( rmesa, pixels, srcHeight * srcRowStride ) ||
-	  (srcRowStride & 63))
-	 return 0;
-
-
-      /* Have validated that _mesa_transfer_teximage would be a straight
-       * memcpy at this point.  NOTE: future calls to TexSubImage will
-       * overwrite the client data.  This is explicitly mentioned in the
-       * extension spec.
-       */
-      texImage->Data = (void *)pixels;
-      texImage->IsClientData = GL_TRUE;
-      texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
-
-      return 1;
-   }
-}
-
-
-static void r200TexImage1D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_teximage1d(ctx, target, level, internalFormat,
-                          width, border, format, type, pixels,
-                          &ctx->Unpack, texObj, texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-
-
-static void r200TexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset,
-                                 GLsizei width,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-			     format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-
-
-static void r200TexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-         return;
-      }
-   }
-
-   texImage->IsClientData = GL_FALSE;
-
-   if (r200ValidateClientStorage( ctx, target, 
-				  internalFormat, 
-				  width, height, 
-				  format, type, pixels, 
-				  packing, texObj, texImage)) {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__); 
-   }
-   else {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
-
-      /* Normal path: copy (to cached memory) and eventually upload
-       * via another copy to GART memory and then a blit...  Could
-       * eliminate one copy by going straight to (permanent) GART.
-       *
-       * Note, this will call r200ChooseTextureFormat.
-       */
-      _mesa_store_teximage2d(ctx, target, level, internalFormat,
-			     width, height, border, format, type, pixels,
-			     &ctx->Unpack, texObj, texImage);
-      
-      t->dirty_images[face] |= (1 << level);
-   }
-}
-
-
-static void r200TexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-			     height, format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-static void r200CompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLsizei imageSize, const GLvoid *data,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
-         return;
-      }
-   }
-
-   texImage->IsClientData = GL_FALSE;
-/* can't call this, different parameters. Would never evaluate to true anyway currently
-   if (r200ValidateClientStorage( ctx, target, 
-				  internalFormat,
-				  width, height,
-				  format, type, pixels,
-				  packing, texObj, texImage)) {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__);
-   }
-   else */{
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__);
-
-      /* Normal path: copy (to cached memory) and eventually upload
-       * via another copy to GART memory and then a blit...  Could
-       * eliminate one copy by going straight to (permanent) GART.
-       *
-       * Note, this will call r200ChooseTextureFormat.
-       */
-      _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
-                                 height, border, imageSize, data, texObj, texImage);
-
-      t->dirty_images[face] |= (1 << level);
-   }
+   t->pp_border_color = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
 }
 
-
-static void r200CompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format,
-                                 GLsizei imageSize, const GLvoid *data,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-                            height, format, imageSize, data, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-#if ENABLE_HW_3D_TEXTURE
-static void r200TexImage3D( GLcontext *ctx, GLenum target, GLint level,
-                            GLint internalFormat,
-                            GLint width, GLint height, GLint depth,
-                            GLint border,
-                            GLenum format, GLenum type, const GLvoid *pixels,
-                            const struct gl_pixelstore_attrib *packing,
-                            struct gl_texture_object *texObj,
-                            struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
-         return;
-      }
-   }
-
-   texImage->IsClientData = GL_FALSE;
-
-#if 0
-   if (r200ValidateClientStorage( ctx, target, 
-				  internalFormat, 
-				  width, height, 
-				  format, type, pixels, 
-				  packing, texObj, texImage)) {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using client storage\n", __FUNCTION__); 
-   }
-   else
-#endif
-   {
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, "%s: Using normal storage\n", __FUNCTION__); 
-
-      /* Normal path: copy (to cached memory) and eventually upload
-       * via another copy to GART memory and then a blit...  Could
-       * eliminate one copy by going straight to (permanent) GART.
-       *
-       * Note, this will call r200ChooseTextureFormat.
-       */
-      _mesa_store_teximage3d(ctx, target, level, internalFormat,
-			     width, height, depth, border,
-                             format, type, pixels,
-			     &ctx->Unpack, texObj, texImage);
-      
-      t->dirty_images[0] |= (1 << level);
-   }
-}
-#endif
-
-
-#if ENABLE_HW_3D_TEXTURE
-static void
-r200TexSubImage3D( GLcontext *ctx, GLenum target, GLint level,
-                   GLint xoffset, GLint yoffset, GLint zoffset,
-                   GLsizei width, GLsizei height, GLsizei depth,
-                   GLenum format, GLenum type,
-                   const GLvoid *pixels,
-                   const struct gl_pixelstore_attrib *packing,
-                   struct gl_texture_object *texObj,
-                   struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) r200AllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
-         return;
-      }
-      texObj->DriverData = t;
-   }
-
-   _mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
-                             width, height, depth,
-                             format, type, pixels, packing, texObj, texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-#endif
-
-
-
 static void r200TexEnv( GLcontext *ctx, GLenum target,
 			  GLenum pname, const GLfloat *param )
 {
@@ -983,7 +301,7 @@ static void r200TexEnv( GLcontext *ctx, GLenum target,
       GLubyte c[4];
       GLuint envColor;
       UNCLAMPED_FLOAT_TO_RGBA_CHAN( c, texUnit->EnvColor );
-      envColor = r200PackColor( 4, c[0], c[1], c[2], c[3] );
+      envColor = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
       if ( rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] != envColor ) {
 	 R200_STATECHANGE( rmesa, tf );
 	 rmesa->hw.tf.cmd[TF_TFACTOR_0 + unit] = envColor;
@@ -1002,7 +320,7 @@ static void r200TexEnv( GLcontext *ctx, GLenum target,
        * NOTE: Add a small bias to the bias for conform mipsel.c test.
        */
       bias = *param + .01;
-      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
+      min = driQueryOptionb (&rmesa->radeon.optionCache, "no_neg_lod_bias") ?
 	  0.0 : -16.0;
       bias = CLAMP( bias, min, 16.0 );
       b = (int)(bias * fixed_one) & R200_LOD_BIAS_MASK;
@@ -1039,7 +357,7 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
 				struct gl_texture_object *texObj,
 				GLenum pname, const GLfloat *params )
 {
-   r200TexObjPtr t = (r200TexObjPtr) texObj->DriverData;
+   radeonTexObj* t = radeon_tex_obj(texObj);
 
    if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
       fprintf( stderr, "%s( %s )\n", __FUNCTION__,
@@ -1073,59 +391,46 @@ static void r200TexParameter( GLcontext *ctx, GLenum target,
        * we just have to rely on loading the right subset of mipmap levels
        * to simulate a clamped LOD.
        */
-      driSwapOutTextureObject( (driTextureObject *) t );
+      if (t->mt) {
+         radeon_miptree_unreference(t->mt);
+	 t->mt = 0;
+	 t->validated = GL_FALSE;
+      }
       break;
 
    default:
       return;
    }
-
-   /* Mark this texobj as dirty (one bit per tex unit)
-    */
-   t->dirty_state = TEX_ALL;
 }
 
 
-
-static void r200BindTexture( GLcontext *ctx, GLenum target,
-			       struct gl_texture_object *texObj )
-{
-   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
-	       ctx->Texture.CurrentUnit );
-   }
-
-   if ( (target == GL_TEXTURE_1D)
-	|| (target == GL_TEXTURE_2D) 
-#if ENABLE_HW_3D_TEXTURE
-	|| (target == GL_TEXTURE_3D)
-#endif
-	|| (target == GL_TEXTURE_CUBE_MAP)
-	|| (target == GL_TEXTURE_RECTANGLE_NV) ) {
-      assert( texObj->DriverData != NULL );
-   }
-}
-
-
-static void r200DeleteTexture( GLcontext *ctx,
-				 struct gl_texture_object *texObj )
+static void r200DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( R200_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
-	       _mesa_lookup_enum_by_nr( texObj->Target ) );
+   radeonTexObj* t = radeon_tex_obj(texObj);
+
+   if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+      fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+	      (void *)texObj,
+	      _mesa_lookup_enum_by_nr(texObj->Target));
+   }
+   
+   if (rmesa) {
+      int i;
+      radeon_firevertices(&rmesa->radeon);
+      for ( i = 0 ; i < rmesa->radeon.glCtx->Const.MaxTextureUnits ; i++ ) {
+	 if ( t == rmesa->state.texture.unit[i].texobj ) {
+	    rmesa->state.texture.unit[i].texobj = NULL;
+	    rmesa->hw.tex[i].dirty = GL_FALSE;
+	    rmesa->hw.cube[i].dirty = GL_FALSE;
+	 }
+      }      
    }
-
-   if ( t != NULL ) {
-      if ( rmesa ) {
-         R200_FIREVERTICES( rmesa );
-      }
-
-      driDestroyTextureObject( t );
+   
+   if (t->mt) {
+      radeon_miptree_unreference(t->mt);
+      t->mt = 0;
    }
-   /* Free mipmap images and the texture object itself */
    _mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -1155,46 +460,59 @@ static void r200TexGen( GLcontext *ctx,
  * Called via ctx->Driver.NewTextureObject.
  * Note: this function will be called during context creation to
  * allocate the default texture objects.
- * Note: we could use containment here to 'derive' the driver-specific
- * texture object from the core mesa gl_texture_object.  Not done at this time.
  * Fixup MaxAnisotropy according to user preference.
  */
-static struct gl_texture_object *
-r200NewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
+static struct gl_texture_object *r200NewTextureObject(GLcontext * ctx,
+						      GLuint name,
+						      GLenum target)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_object *obj;
-   obj = _mesa_new_texture_object(ctx, name, target);
-   if (!obj)
-      return NULL;
-   obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
-   r200AllocTexObj( obj );
-   return obj;
+   radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
+
+   if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+     fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+	     t, _mesa_lookup_enum_by_nr(target));
+   }
+
+   _mesa_initialize_texture_object(&t->base, name, target);
+   t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+   /* Initialize hardware state */
+   r200SetTexWrap( t, t->base.WrapS, t->base.WrapT, t->base.WrapR );
+   r200SetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
+   r200SetTexFilter(t, t->base.MinFilter, t->base.MagFilter);
+   r200SetTexBorderColor(t, t->base.BorderColor);
+
+   return &t->base;
 }
 
 
+
 void r200InitTextureFuncs( struct dd_function_table *functions )
 {
    /* Note: we only plug in the functions we implement in the driver
     * since _mesa_init_driver_functions() was already called.
     */
-   functions->ChooseTextureFormat	= r200ChooseTextureFormat;
-   functions->TexImage1D		= r200TexImage1D;
-   functions->TexImage2D		= r200TexImage2D;
+   functions->ChooseTextureFormat	= radeonChooseTextureFormat_mesa;
+   functions->TexImage1D		= radeonTexImage1D;
+   functions->TexImage2D		= radeonTexImage2D;
 #if ENABLE_HW_3D_TEXTURE
-   functions->TexImage3D		= r200TexImage3D;
+   functions->TexImage3D		= radeonTexImage3D;
 #else
    functions->TexImage3D		= _mesa_store_teximage3d;
 #endif
-   functions->TexSubImage1D		= r200TexSubImage1D;
-   functions->TexSubImage2D		= r200TexSubImage2D;
+   functions->TexSubImage1D		= radeonTexSubImage1D;
+   functions->TexSubImage2D		= radeonTexSubImage2D;
 #if ENABLE_HW_3D_TEXTURE
-   functions->TexSubImage3D		= r200TexSubImage3D;
+   functions->TexSubImage3D		= radeonTexSubImage3D;
 #else
    functions->TexSubImage3D		= _mesa_store_texsubimage3d;
 #endif
+   functions->GetTexImage               = radeonGetTexImage;
+   functions->GetCompressedTexImage     = radeonGetCompressedTexImage;
    functions->NewTextureObject		= r200NewTextureObject;
-   functions->BindTexture		= r200BindTexture;
+   //   functions->BindTexture		= r200BindTexture;
    functions->DeleteTexture		= r200DeleteTexture;
    functions->IsTextureResident		= driIsTextureResident;
 
@@ -1202,22 +520,16 @@ void r200InitTextureFuncs( struct dd_function_table *functions )
    functions->TexParameter		= r200TexParameter;
    functions->TexGen			= r200TexGen;
 
-   functions->CompressedTexImage2D	= r200CompressedTexImage2D;
-   functions->CompressedTexSubImage2D	= r200CompressedTexSubImage2D;
+   functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
+   functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
-   driInitTextureFormats();
+   functions->GenerateMipmap = radeonGenerateMipmap;
 
-#if 000
-   /* moved or obsolete code */
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   driInitTextureObjects( ctx, & rmesa->swapped,
-			  DRI_TEXMGR_DO_TEXTURE_1D
-			  | DRI_TEXMGR_DO_TEXTURE_2D );
+   functions->NewTextureImage = radeonNewTextureImage;
+   functions->FreeTexImageData = radeonFreeTexImageData;
+   functions->MapTexture = radeonMapTexture;
+   functions->UnmapTexture = radeonUnmapTexture;
+
+   driInitTextureFormats();
 
-   /* Hack: r200NewTextureObject is not yet installed when the
-    * default textures are created. Therefore set MaxAnisotropy of the
-    * default 2D texture now. */
-   ctx->Shared->Default2D->MaxAnisotropy = driQueryOptionf (&rmesa->optionCache,
-							    "def_max_anisotropy");
-#endif
 }
diff --git a/src/mesa/drivers/dri/r200/r200_tex.h b/src/mesa/drivers/dri/r200/r200_tex.h
index 10ff8e8a660..e122de6e5ed 100644
--- a/src/mesa/drivers/dri/r200/r200_tex.h
+++ b/src/mesa/drivers/dri/r200/r200_tex.h
@@ -35,15 +35,18 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __R200_TEX_H__
 #define __R200_TEX_H__
 
+extern void r200SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv);
+extern void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+			      __DRIdrawable *dPriv);
 extern void r200SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 			     unsigned long long offset, GLint depth,
 			     GLuint pitch);
 
 extern void r200UpdateTextureState( GLcontext *ctx );
 
-extern int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face );
+extern int r200UploadTexImages( r200ContextPtr rmesa, radeonTexObjPtr t, GLuint face );
 
-extern void r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t );
+extern void r200DestroyTexObj( r200ContextPtr rmesa, radeonTexObjPtr t );
 
 extern void r200InitTextureFuncs( struct dd_function_table *functions );
 
diff --git a/src/mesa/drivers/dri/r200/r200_texmem.c b/src/mesa/drivers/dri/r200/r200_texmem.c
deleted file mode 100644
index 3b81ac0c802..00000000000
--- a/src/mesa/drivers/dri/r200/r200_texmem.c
+++ /dev/null
@@ -1,530 +0,0 @@
-/**************************************************************************
-
-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.  
-The Weather Channel, Inc. funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86
-license. This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <[email protected]>
- *   Gareth Hughes <[email protected]>
- *
- */
- 
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/macros.h"
-#include "r200_context.h"
-#include "r200_ioctl.h"
-#include "r200_tex.h"
-#include "radeon_reg.h"
-
-#include <unistd.h>  /* for usleep() */
-
-
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void
-r200DestroyTexObj( r200ContextPtr rmesa, r200TexObjPtr t )
-{
-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, 
-	       (void *)t, (void *)t->base.tObj );
-   }
-
-   if ( rmesa != NULL ) {
-      unsigned   i;
-
-
-      for ( i = 0 ; i < rmesa->glCtx->Const.MaxTextureUnits ; i++ ) {
-	 if ( t == rmesa->state.texture.unit[i].texobj ) {
-	    rmesa->state.texture.unit[i].texobj = NULL;
-	    rmesa->hw.tex[i].dirty = GL_FALSE;
-	    rmesa->hw.cube[i].dirty = GL_FALSE;
-	 }
-      }
-   }
-}
-
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-
-static void r200UploadGARTClientSubImage( r200ContextPtr rmesa,
-					  r200TexObjPtr t, 
-					  struct gl_texture_image *texImage,
-					  GLint hwlevel,
-					  GLint x, GLint y, 
-					  GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   GLuint srcPitch, dstPitch;
-   int blit_format;
-   int srcOffset;
-
-   /*
-    * XXX it appears that we always upload the full image, not a subimage.
-    * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
-    * changed, the src pitch will have to change.
-    */
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = R200_CP_COLOR_FORMAT_CI8;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   case 2:
-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   case 4:
-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
-      srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-      dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-      break;
-   default:
-      return;
-   }
-
-   t->image[0][hwlevel].data = texImage->Data;
-   srcOffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
-
-   assert( srcOffset != ~0 );
-
-   /* Don't currently need to cope with small pitches?
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-
-   r200EmitWait( rmesa, RADEON_WAIT_3D );
-
-   r200EmitBlit( rmesa, blit_format, 
-		 srcPitch,  
-		 srcOffset,   
-		 dstPitch,
-		 t->bufAddr,
-		 x, 
-		 y, 
-		 t->image[0][hwlevel].x + x,
-		 t->image[0][hwlevel].y + y, 
-		 width,
-		 height );
-
-   r200EmitWait( rmesa, RADEON_WAIT_2D );
-}
-
-static void r200UploadRectSubImage( r200ContextPtr rmesa,
-				    r200TexObjPtr t, 
-				    struct gl_texture_image *texImage,
-				    GLint x, GLint y, 
-				    GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   int blit_format, dstPitch, done;
-
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = R200_CP_COLOR_FORMAT_CI8;
-      break;
-   case 2:
-      blit_format = R200_CP_COLOR_FORMAT_RGB565;
-      break;
-   case 4:
-      blit_format = R200_CP_COLOR_FORMAT_ARGB8888;
-      break;
-   default:
-      return;
-   }
-
-   t->image[0][0].data = texImage->Data;
-
-   /* Currently don't need to cope with small pitches.
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-   dstPitch = t->pp_txpitch + 32;
-
-   if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
-      /* In this case, could also use GART texturing.  This is
-       * currently disabled, but has been tested & works.
-       */
-      if ( !t->image_override )
-         t->pp_txoffset = r200GartOffsetFromVirtual( rmesa, texImage->Data );
-      t->pp_txpitch = texImage->RowStride * texFormat->TexelBytes - 32;
-
-      if (R200_DEBUG & DEBUG_TEXTURE)
-	 fprintf(stderr, 
-		 "Using GART texturing for rectangular client texture\n");
-
-      /* Release FB memory allocated for this image:
-       */
-      /* FIXME This may not be correct as driSwapOutTextureObject sets
-       * FIXME dirty_images.  It may be fine, though.
-       */
-      if ( t->base.memBlock ) {
-	 driSwapOutTextureObject( (driTextureObject *) t );
-      }
-   }
-   else if (texImage->IsClientData) {
-      /* Data already in GART memory, with usable pitch.
-       */
-      GLuint srcPitch;
-      srcPitch = texImage->RowStride * texFormat->TexelBytes;
-      r200EmitBlit( rmesa, 
-		    blit_format, 
-		    srcPitch,
-		    r200GartOffsetFromVirtual( rmesa, texImage->Data ),   
-		    dstPitch, t->bufAddr,
-		    0, 0, 
-		    0, 0, 
-		    width, height );
-   }
-   else {
-      /* Data not in GART memory, or bad pitch.
-       */
-      for (done = 0; done < height ; ) {
-	 struct r200_dma_region region;
-	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
-	 int src_pitch;
-	 char *tex;
-
-         src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-	 tex = (char *)texImage->Data + done * src_pitch;
-
-	 memset(&region, 0, sizeof(region));
-	 r200AllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
-
-	 /* Copy texdata to dma:
-	  */
-	 if (0)
-	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
-		    __FUNCTION__, src_pitch, dstPitch);
-
-	 if (src_pitch == dstPitch) {
-	    memcpy( region.address + region.start, tex, lines * src_pitch );
-	 } 
-	 else {
-	    char *buf = region.address + region.start;
-	    int i;
-	    for (i = 0 ; i < lines ; i++) {
-	       memcpy( buf, tex, src_pitch );
-	       buf += dstPitch;
-	       tex += src_pitch;
-	    }
-	 }
-
-	 r200EmitWait( rmesa, RADEON_WAIT_3D );
-
-	 /* Blit to framebuffer
-	  */
-	 r200EmitBlit( rmesa,
-		       blit_format,
-		       dstPitch, GET_START( &region ),
-		       dstPitch | (t->tile_bits >> 16),
-		       t->bufAddr,
-		       0, 0,
-		       0, done,
-		       width, lines );
-	 
-	 r200EmitWait( rmesa, RADEON_WAIT_2D );
-
-	 r200ReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
-	 done += lines;
-      }
-   }
-}
-
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void uploadSubImage( r200ContextPtr rmesa, r200TexObjPtr t, 
-			    GLint hwlevel,
-			    GLint x, GLint y, GLint width, GLint height,
-			    GLuint face )
-{
-   struct gl_texture_image *texImage = NULL;
-   GLuint offset;
-   GLint imageWidth, imageHeight;
-   GLint ret;
-   drm_radeon_texture_t tex;
-   drm_radeon_tex_image_t tmp;
-   const int level = hwlevel + t->base.firstLevel;
-
-   if ( R200_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
-	       __FUNCTION__, (void *)t, (void *)t->base.tObj,
-	       level, width, height, face );
-   }
-
-   ASSERT(face < 6);
-
-   /* Ensure we have a valid texture to upload */
-   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
-      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-      return;
-   }
-
-   texImage = t->base.tObj->Image[face][level];
-
-   if ( !texImage ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
-      return;
-   }
-   if ( !texImage->Data ) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
-      return;
-   }
-
-
-   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-      assert(level == 0);
-      assert(hwlevel == 0);
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
-      r200UploadRectSubImage( rmesa, t, texImage, x, y, width, height );
-      return;
-   }
-   else if (texImage->IsClientData) {
-      if ( R200_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is in GART client storage\n",
-		  __FUNCTION__);
-      r200UploadGARTClientSubImage( rmesa, t, texImage, hwlevel,
-				   x, y, width, height );
-      return;
-   }
-   else if ( R200_DEBUG & DEBUG_TEXTURE )
-      fprintf( stderr, "%s: image data is in normal memory\n",
-	       __FUNCTION__);
-      
-
-   imageWidth = texImage->Width;
-   imageHeight = texImage->Height;
-
-   offset = t->bufAddr + t->base.totalSize / 6 * face;
-
-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      GLint imageX = 0;
-      GLint imageY = 0;
-      GLint blitX = t->image[face][hwlevel].x;
-      GLint blitY = t->image[face][hwlevel].y;
-      GLint blitWidth = t->image[face][hwlevel].width;
-      GLint blitHeight = t->image[face][hwlevel].height;
-      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
-	       imageWidth, imageHeight, imageX, imageY );
-      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
-	       blitWidth, blitHeight, blitX, blitY );
-      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-	       (GLuint)offset, hwlevel, level );
-   }
-
-   t->image[face][hwlevel].data = texImage->Data;
-
-   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-    * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-    * We used to use 1, 2 and 4-byte texels and used to use the texture
-    * width to dictate the blit width - but that won't work for compressed
-    * textures. (Brian)
-    * NOTE: can't do that with texture tiling. (sroland)
-    */
-   tex.offset = offset;
-   tex.image = &tmp;
-   /* copy (x,y,width,height,data) */
-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(tmp) );
-   
-   if (texImage->TexFormat->TexelBytes) {
-      /* use multi-byte upload scheme */
-      tex.height = imageHeight;
-      tex.width = imageWidth;
-      tex.format = t->pp_txformat & R200_TXFORMAT_FORMAT_MASK;
-      if (tex.format == R200_TXFORMAT_ABGR8888) {
-	 /* drm will refuse abgr8888 textures. */
-	 tex.format = R200_TXFORMAT_ARGB8888;
-      }
-      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
-      tex.offset += tmp.x & ~1023;
-      tmp.x = tmp.x % 1024;
-      if (t->tile_bits & R200_TXO_MICRO_TILE) {
-	 /* need something like "tiled coordinates" ? */
-	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
-	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
-	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-      }
-      else {
-	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-      }
-      if ((t->tile_bits & R200_TXO_MACRO_TILE) &&
-	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256) &&
-	 ((!(t->tile_bits & R200_TXO_MICRO_TILE) && (texImage->Height >= 8)) ||
-	    (texImage->Height >= 16))) {
-	 /* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
-	    OR if height is smaller than 8 automatically, but if micro tiling is active
-	    the limit is height 16 instead ? */
-	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-      }
-   }
-   else {
-      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
-         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
-         so the kernel module reads the right amount of data. */
-      tex.format = R200_TXFORMAT_I8; /* any 1-byte texel format */
-      tex.pitch = (BLIT_WIDTH_BYTES / 64);
-      tex.height = (imageHeight + 3) / 4;
-      tex.width = (imageWidth + 3) / 4;
-      switch (t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) {
-      case R200_TXFORMAT_DXT1:
-           tex.width *= 8;
-           break;
-      case R200_TXFORMAT_DXT23:
-      case R200_TXFORMAT_DXT45:
-           tex.width *= 16;
-           break;
-      default:
-          fprintf(stderr, "unknown compressed tex format in uploadSubImage\n");
-      }
-   }
-
-   LOCK_HARDWARE( rmesa );
-   do {
-      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
-                                 &tex, sizeof(drm_radeon_texture_t) );
-      if (ret) {
-	 if (R200_DEBUG & DEBUG_IOCTL)
-	    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
-	 usleep(1);
-      }
-   } while ( ret == -EAGAIN );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
-      fprintf( stderr, "   offset=0x%08x\n",
-	       offset );
-      fprintf( stderr, "   image width=%d height=%d\n",
-	       imageWidth, imageHeight );
-      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
-	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
-	       t->image[face][hwlevel].data );
-      exit( 1 );
-   }
-}
-
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- * 
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int r200UploadTexImages( r200ContextPtr rmesa, r200TexObjPtr t, GLuint face )
-{
-   const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   if ( R200_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
-	       t->base.firstLevel, t->base.lastLevel );
-   }
-
-   if ( !t || t->base.totalSize == 0 || t->image_override )
-      return 0;
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      r200Finish( rmesa->glCtx );
-   }
-
-   LOCK_HARDWARE( rmesa );
-
-   if ( t->base.memBlock == NULL ) {
-      int heap;
-
-      heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
-				 (driTextureObject *) t );
-      if ( heap == -1 ) {
-	 UNLOCK_HARDWARE( rmesa );
-	 return -1;
-      }
-
-      /* Set the base offset of the texture image */
-      t->bufAddr = rmesa->r200Screen->texOffset[heap] 
-	   + t->base.memBlock->ofs;
-      t->pp_txoffset = t->bufAddr;
-       
-      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-	 /* hope it's safe to add that here... */
-	 t->pp_txoffset |= t->tile_bits;
-      }
-
-      /* Mark this texobj as dirty on all units:
-       */
-      t->dirty_state = TEX_ALL;
-   }
-
-   /* Let the world know we've used this memory recently.
-    */
-   driUpdateTextureLRU( (driTextureObject *) t );
-   UNLOCK_HARDWARE( rmesa );
-
-   /* Upload any images that are new */
-   if (t->base.dirty_images[face]) {
-      int i;
-      for ( i = 0 ; i < numLevels ; i++ ) {
-         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
-            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
-			    t->image[face][i].height, face );
-         }
-      }
-      t->base.dirty_images[face] = 0;
-   }
-
-
-   if (R200_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      r200Finish( rmesa->glCtx );
-   }
-
-   return 0;
-}
diff --git a/src/mesa/drivers/dri/r200/r200_texstate.c b/src/mesa/drivers/dri/r200/r200_texstate.c
index 0ad5651cd47..4e53672aee2 100644
--- a/src/mesa/drivers/dri/r200/r200_texstate.c
+++ b/src/mesa/drivers/dri/r200/r200_texstate.c
@@ -37,9 +37,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/texformat.h"
+#include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
 
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
 #include "r200_context.h"
 #include "r200_state.h"
 #include "r200_ioctl.h"
@@ -139,257 +142,6 @@ static const struct tx_table tx_table_le[] =
 #undef _ALPHA
 #undef _INVALID
 
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c pp_txfilter, \c pp_txformat, etc. will be set here
- * too.
- * 
- * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
- */
-static void r200SetTexImages( r200ContextPtr rmesa,
-			      struct gl_texture_object *tObj )
-{
-   r200TexObjPtr t = (r200TexObjPtr)tObj->DriverData;
-   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
-   GLint curOffset, blitWidth;
-   GLint i, texelBytes;
-   GLint numLevels;
-   GLint log2Width, log2Height, log2Depth;
-
-   /* Set the hardware texture format
-    */
-   if ( !t->image_override ) {
-      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
-	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
-								tx_table_be;
-
-         t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK |
-                             R200_TXFORMAT_ALPHA_IN_MAP);
-         t->pp_txfilter &= ~R200_YUV_TO_RGB;
-
-	 t->pp_txformat |= table[ baseImage->TexFormat->MesaFormat ].format;
-	 t->pp_txfilter |= table[ baseImage->TexFormat->MesaFormat ].filter;
-      }
-      else {
-         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
-         return;
-      }
-   }
-
-   texelBytes = baseImage->TexFormat->TexelBytes;
-
-   /* Compute which mipmap levels we really want to send to the hardware.
-    */
-
-   driCalculateTextureFirstLastLevel( (driTextureObject *) t );
-   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
-
-   /* Calculate mipmap offsets and dimensions for blitting (uploading)
-    * The idea is that we lay out the mipmap levels within a block of
-    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-    */
-   curOffset = 0;
-   blitWidth = BLIT_WIDTH_BYTES;
-   t->tile_bits = 0;
-
-   /* figure out if this texture is suitable for tiling. */
-   if (texelBytes) {
-      if (rmesa->texmicrotile  && (tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
-      /* texrect might be able to use micro tiling too in theory? */
-	 (baseImage->Height > 1)) {
-	 /* allow 32 (bytes) x 1 mip (which will use two times the space
-	 the non-tiled version would use) max if base texture is large enough */
-	 if ((numLevels == 1) ||
-	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
-	       (baseImage->Width * texelBytes > 64)) ||
-	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
-	    t->tile_bits |= R200_TXO_MICRO_TILE;
-	 }
-      }
-      if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
-	 /* we can set macro tiling even for small textures, they will be untiled anyway */
-	 t->tile_bits |= R200_TXO_MACRO_TILE;
-      }
-   }
-
-   for (i = 0; i < numLevels; i++) {
-      const struct gl_texture_image *texImage;
-      GLuint size;
-
-      texImage = tObj->Image[0][i + t->base.firstLevel];
-      if ( !texImage )
-	 break;
-
-      /* find image size in bytes */
-      if (texImage->IsCompressed) {
-      /* need to calculate the size AFTER padding even though the texture is
-         submitted without padding.
-         Only handle pot textures currently - don't know if npot is even possible,
-         size calculation would certainly need (trivial) adjustments.
-         Align (and later pad) to 32byte, not sure what that 64byte blit width is
-         good for? */
-         if ((t->pp_txformat & R200_TXFORMAT_FORMAT_MASK) == R200_TXFORMAT_DXT1) {
-            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
-            if ((texImage->Width + 3) < 8) /* width one block */
-               size = texImage->CompressedSize * 4;
-            else if ((texImage->Width + 3) < 16)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-         }
-         else /* DXT3/5, 16 bytes per block */
-            if ((texImage->Width + 3) < 8)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-      }
-      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
-      }
-      else if (t->tile_bits & R200_TXO_MICRO_TILE) {
-	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-	    though the actual offset may be different (if texture is less than
-	    32 bytes width) to the untiled case */
-	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      else {
-	 int w = (texImage->Width * texelBytes + 31) & ~31;
-	 size = w * texImage->Height * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      assert(size > 0);
-
-      /* Align to 32-byte offset.  It is faster to do this unconditionally
-       * (no branch penalty).
-       */
-
-      curOffset = (curOffset + 0x1f) & ~0x1f;
-
-      if (texelBytes) {
-	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
-	 t->image[0][i].y = 0;
-	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
-	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
-      }
-      else {
-         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
-         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
-         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
-         t->image[0][i].height = size / t->image[0][i].width;     
-      }
-
-#if 0
-      /* for debugging only and only  applicable to non-rectangle targets */
-      assert(size % t->image[0][i].width == 0);
-      assert(t->image[0][i].x == 0
-             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
-#endif
-
-      if (0)
-         fprintf(stderr,
-                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-                 i, texImage->Width, texImage->Height,
-                 t->image[0][i].x, t->image[0][i].y,
-                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
-
-      curOffset += size;
-
-   }
-
-   /* Align the total size of texture memory block.
-    */
-   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-   /* Setup remaining cube face blits, if needed */
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      const GLuint faceSize = t->base.totalSize;
-      GLuint face;
-      /* reuse face 0 x/y/width/height - just update the offset when uploading */
-      for (face = 1; face < 6; face++) {
-         for (i = 0; i < numLevels; i++) {
-            t->image[face][i].x =  t->image[0][i].x;
-            t->image[face][i].y =  t->image[0][i].y;
-            t->image[face][i].width  = t->image[0][i].width;
-            t->image[face][i].height = t->image[0][i].height;
-         }
-      }
-      t->base.totalSize = 6 * faceSize; /* total texmem needed */
-   }
-
-
-   /* Hardware state:
-    */
-   t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (numLevels - 1) << R200_MAX_MIP_LEVEL_SHIFT;
-
-   t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
-		       R200_TXFORMAT_HEIGHT_MASK |
-                       R200_TXFORMAT_CUBIC_MAP_ENABLE |
-                       R200_TXFORMAT_F5_WIDTH_MASK |
-                       R200_TXFORMAT_F5_HEIGHT_MASK);
-   t->pp_txformat |= ((log2Width << R200_TXFORMAT_WIDTH_SHIFT) |
-		      (log2Height << R200_TXFORMAT_HEIGHT_SHIFT));
-
-   t->pp_txformat_x &= ~(R200_DEPTH_LOG2_MASK | R200_TEXCOORD_MASK);
-   if (tObj->Target == GL_TEXTURE_3D) {
-      t->pp_txformat_x |= (log2Depth << R200_DEPTH_LOG2_SHIFT);
-      t->pp_txformat_x |= R200_TEXCOORD_VOLUME;
-   }
-   else if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      ASSERT(log2Width == log2Height);
-      t->pp_txformat |= ((log2Width << R200_TXFORMAT_F5_WIDTH_SHIFT) |
-                         (log2Height << R200_TXFORMAT_F5_HEIGHT_SHIFT) |
-/* don't think we need this bit, if it exists at all - fglrx does not set it */
-                         (R200_TXFORMAT_CUBIC_MAP_ENABLE));
-      t->pp_txformat_x |= R200_TEXCOORD_CUBIC_ENV;
-      t->pp_cubic_faces = ((log2Width << R200_FACE_WIDTH_1_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_1_SHIFT) |
-                           (log2Width << R200_FACE_WIDTH_2_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_2_SHIFT) |
-                           (log2Width << R200_FACE_WIDTH_3_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_3_SHIFT) |
-                           (log2Width << R200_FACE_WIDTH_4_SHIFT) |
-                           (log2Height << R200_FACE_HEIGHT_4_SHIFT));
-   }
-   else {
-      /* If we don't in fact send enough texture coordinates, q will be 1,
-       * making TEXCOORD_PROJ act like TEXCOORD_NONPROJ (Right?)
-       */
-      t->pp_txformat_x |= R200_TEXCOORD_PROJ;
-   }
-
-   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
-                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
-
-   /* Only need to round to nearest 32 for textures, but the blitter
-    * requires 64-byte aligned pitches, and we may/may not need the
-    * blitter.   NPOT only!
-    */
-   if ( !t->image_override ) {
-      if (baseImage->IsCompressed)
-         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-      else
-         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
-      t->pp_txpitch -= 32;
-   }
-
-   t->dirty_state = TEX_ALL;
-
-   /* FYI: r200UploadTexImages( rmesa, t ) used to be called here */
-}
-
-
-
 /* ================================================================
  * Texture combine functions
  */
@@ -981,20 +733,19 @@ void r200SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 {
 	r200ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
-	    _mesa_lookup_texture(rmesa->glCtx, texname);
-	r200TexObjPtr t;
+	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
 
 	if (!tObj)
 		return;
 
-	t = (r200TexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
 
-	t->pp_txoffset = offset;
+	t->bo = NULL;
+	t->override_offset = offset;
 	t->pp_txpitch = pitch - 32;
 
 	switch (depth) {
@@ -1014,6 +765,125 @@ void r200SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	}
 }
 
+void r200SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+		       __DRIdrawable *dPriv)
+{
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	r200ContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+
+	type = GL_BGRA;
+	format = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate;
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* back & depth buffer are useless free them right away */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[0];
+	if (rb->bo == NULL) {
+		/* Failed to BO for the buffer */
+		return;
+	}
+	
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->tile_bits = 0;
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = tx_table_le[MESA_FORMAT_RGB888].format;
+		else
+			t->pp_txformat = tx_table_le[MESA_FORMAT_ARGB8888].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_ARGB8888].filter;
+		break;
+	case 3:
+	default:
+		t->pp_txformat = tx_table_le[MESA_FORMAT_RGB888].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_RGB888].filter;
+		break;
+	case 2:
+		t->pp_txformat = tx_table_le[MESA_FORMAT_RGB565].format;
+		t->pp_txfilter |= tx_table_le[MESA_FORMAT_RGB565].filter;
+		break;
+	}
+        t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		   | ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
+        t->pp_txformat |= R200_TXFORMAT_NON_POWER2;
+	t->pp_txpitch = pitch_val;
+        t->pp_txpitch -= 32;
+
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
+}
+
+
+void r200SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+        r200SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}
+
+
 #define REF_COLOR 1
 #define REF_ALPHA 2
 
@@ -1207,12 +1077,41 @@ static GLboolean r200UpdateAllTexEnv( GLcontext *ctx )
                                 R200_VOLUME_FILTER_MASK)
 
 
+static void disable_tex_obj_state( r200ContextPtr rmesa, 
+				   int unit )
+{
+   
+   R200_STATECHANGE( rmesa, vtx );
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
+
+   if (rmesa->radeon.TclFallback & (R200_TCL_FALLBACK_TEXGEN_0<<unit)) {
+      TCL_FALLBACK( rmesa->radeon.glCtx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+   }
+
+   /* Actually want to keep all units less than max active texture
+    * enabled, right?  Fix this for >2 texunits.
+    */
+
+   {
+      GLuint tmp = rmesa->TexGenEnabled;
+
+      rmesa->TexGenEnabled &= ~(R200_TEXGEN_TEXMAT_0_ENABLE<<unit);
+      rmesa->TexGenEnabled &= ~(R200_TEXMAT_0_ENABLE<<unit);
+      rmesa->TexGenNeedNormals[unit] = GL_FALSE;
+      rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit);
+
+      if (tmp != rmesa->TexGenEnabled) {
+	 rmesa->recheck_texgen[unit] = GL_TRUE;
+	 rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+      }
+   }
+}
 static void import_tex_obj_state( r200ContextPtr rmesa,
 				  int unit,
-				  r200TexObjPtr texobj )
+				  radeonTexObjPtr texobj )
 {
 /* do not use RADEON_DB_STATE to avoid stale texture caches */
-   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+   GLuint *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
 
    R200_STATECHANGE( rmesa, tex[unit] );
 
@@ -1225,36 +1124,21 @@ static void import_tex_obj_state( r200ContextPtr rmesa,
    cmd[TEX_PP_TXSIZE] = texobj->pp_txsize; /* NPOT only! */
    cmd[TEX_PP_TXPITCH] = texobj->pp_txpitch; /* NPOT only! */
    cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
-   if (rmesa->r200Screen->drmSupportsFragShader) {
-      cmd[TEX_PP_TXOFFSET_NEWDRM] = texobj->pp_txoffset;
-   }
-   else {
-      cmd[TEX_PP_TXOFFSET_OLDDRM] = texobj->pp_txoffset;
-   }
 
-   if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
-      GLuint bytesPerFace = texobj->base.totalSize / 6;
-      ASSERT(texobj->base.totalSize % 6 == 0);
+   if (texobj->base.Target == GL_TEXTURE_CUBE_MAP) {
+      GLuint *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
 
       R200_STATECHANGE( rmesa, cube[unit] );
       cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
-      if (rmesa->r200Screen->drmSupportsFragShader) {
+      if (rmesa->radeon.radeonScreen->drmSupportsFragShader) {
 	 /* that value is submitted twice. could change cube atom
 	    to not include that command when new drm is used */
 	 cmd[TEX_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
       }
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F1] = texobj->pp_txoffset + 1 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F2] = texobj->pp_txoffset + 2 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F3] = texobj->pp_txoffset + 3 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F4] = texobj->pp_txoffset + 4 * bytesPerFace;
-      cube_cmd[CUBE_PP_CUBIC_OFFSET_F5] = texobj->pp_txoffset + 5 * bytesPerFace;
    }
 
-   texobj->dirty_state &= ~(1<<unit);
 }
 
-
 static void set_texgen_matrix( r200ContextPtr rmesa, 
 			       GLuint unit,
 			       const GLfloat *s_plane,
@@ -1377,7 +1261,6 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
    } else {
       tgcm |= R200_TEXGEN_COMP_T << (unit * 4);
    }
-
    if (texUnit->TexGenEnabled & R_BIT) {
       if (texUnit->GenR.Mode != mode)
 	 mixed_fallback = GL_TRUE;
@@ -1517,52 +1400,6 @@ static GLboolean r200_validate_texgen( GLcontext *ctx, GLuint unit )
    return GL_TRUE;
 }
 
-
-static void disable_tex( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-
-   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit)) {
-      /* Texture unit disabled */
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
-
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
-	 rmesa->state.texture.unit[unit].texobj = NULL;
-      }
-
-      R200_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~(R200_TEX_0_ENABLE << unit);
-	 
-      R200_STATECHANGE( rmesa, vtx );
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
-	 
-      if (rmesa->TclFallback & (R200_TCL_FALLBACK_TEXGEN_0<<unit)) {
-	 TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
-      }
-
-      /* Actually want to keep all units less than max active texture
-       * enabled, right?  Fix this for >2 texunits.
-       */
-
-      {
-	 GLuint tmp = rmesa->TexGenEnabled;
-
-	 rmesa->TexGenEnabled &= ~(R200_TEXGEN_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenEnabled &= ~(R200_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenNeedNormals[unit] = GL_FALSE;
-	 rmesa->TexGenCompSel &= ~(R200_OUTPUT_TEX_0 << unit);
-
-	 if (tmp != rmesa->TexGenEnabled) {
-	    rmesa->recheck_texgen[unit] = GL_TRUE;
-	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
-	 }
-      }
-   }
-}
-
 void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
@@ -1579,237 +1416,169 @@ void set_re_cntl_d3d( GLcontext *ctx, int unit, GLboolean use_d3d )
    }
 }
 
-static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
+/**
+ * Compute the cached hardware register values for the given texture object.
+ *
+ * \param rmesa Context pointer
+ * \param t the r300 texture object
+ */
+static void setup_hardware_state(r200ContextPtr rmesa, radeonTexObj *t)
 {
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
-   }
-
-   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
-
-   if ( t->base.dirty_images[0] ) {
-      R200_FIREVERTICES( rmesa );
-      r200SetTexImages( rmesa, tObj );
-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock && !t->image_override ) 
-	 return GL_FALSE;
+   int firstlevel = t->mt ? t->mt->firstLevel : 0;
+   const struct gl_texture_image *firstImage = t->base.Image[0][firstlevel];
+   GLint log2Width, log2Height, log2Depth, texelBytes;
+   
+   if ( t->bo ) {
+       return;
    }
 
-   set_re_cntl_d3d( ctx, unit, GL_FALSE );
+   log2Width  = firstImage->WidthLog2;
+   log2Height = firstImage->HeightLog2;
+   log2Depth  = firstImage->DepthLog2;
+   texelBytes = firstImage->TexFormat->TexelBytes;
 
-   return GL_TRUE;
-}
 
-#if ENABLE_HW_3D_TEXTURE
-static GLboolean enable_tex_3d( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   /* Need to load the 3d images associated with this unit.
-    */
-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
+   if (!t->image_override) {
+      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+	 const struct tx_table *table = _mesa_little_endian() ? tx_table_le :
+	    tx_table_be;
+	 
+	 t->pp_txformat &= ~(R200_TXFORMAT_FORMAT_MASK |
+			     R200_TXFORMAT_ALPHA_IN_MAP);
+	 t->pp_txfilter &= ~R200_YUV_TO_RGB;
+	 
+	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
+	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
+      } else {
+	 _mesa_problem(NULL, "unexpected texture format in %s",
+		       __FUNCTION__);
+	 return;
+      }
    }
+   
+   t->pp_txfilter &= ~R200_MAX_MIP_LEVEL_MASK;
+   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << R200_MAX_MIP_LEVEL_SHIFT;
+	
+   t->pp_txformat &= ~(R200_TXFORMAT_WIDTH_MASK |
+		       R200_TXFORMAT_HEIGHT_MASK |
+		       R200_TXFORMAT_CUBIC_MAP_ENABLE |
+		       R200_TXFORMAT_F5_WIDTH_MASK |
+		       R200_TXFORMAT_F5_HEIGHT_MASK);
+   t->pp_txformat |= ((log2Width << R200_TXFORMAT_WIDTH_SHIFT) |
+		      (log2Height << R200_TXFORMAT_HEIGHT_SHIFT));
+   
+   t->tile_bits = 0;
+   
+   t->pp_txformat_x &= ~(R200_DEPTH_LOG2_MASK | R200_TEXCOORD_MASK);
+   if (t->base.Target == GL_TEXTURE_3D) {
+      t->pp_txformat_x |= (log2Depth << R200_DEPTH_LOG2_SHIFT);
+      t->pp_txformat_x |= R200_TEXCOORD_VOLUME;
 
-   ASSERT(tObj->Target == GL_TEXTURE_3D);
-
-   /* R100 & R200 do not support mipmaps for 3D textures.
-    */
-   if ( (tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR) ) {
-      return GL_FALSE;
    }
-
-   if ( t->base.dirty_images[0] ) {
-      R200_FIREVERTICES( rmesa );
-      r200SetTexImages( rmesa, tObj );
-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock ) 
-	 return GL_FALSE;
+   else if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
+      ASSERT(log2Width == log2Height);
+      t->pp_txformat |= ((log2Width << R200_TXFORMAT_F5_WIDTH_SHIFT) |
+			 (log2Height << R200_TXFORMAT_F5_HEIGHT_SHIFT) |
+			 /* don't think we need this bit, if it exists at all - fglrx does not set it */
+			 (R200_TXFORMAT_CUBIC_MAP_ENABLE));
+      t->pp_txformat_x |= R200_TEXCOORD_CUBIC_ENV;
+      t->pp_cubic_faces = ((log2Width << R200_FACE_WIDTH_1_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_1_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_2_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_2_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_3_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_3_SHIFT) |
+                           (log2Width << R200_FACE_WIDTH_4_SHIFT) |
+                           (log2Height << R200_FACE_HEIGHT_4_SHIFT));
    }
-
-   set_re_cntl_d3d( ctx, unit, GL_TRUE );
-
-   return GL_TRUE;
-}
-#endif
-
-static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-   GLuint face;
-
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & R200_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~R200_TXFORMAT_NON_POWER2;
-      for (face = 0; face < 6; face++)
-         t->base.dirty_images[face] = ~0;
+   else {
+      /* If we don't in fact send enough texture coordinates, q will be 1,
+       * making TEXCOORD_PROJ act like TEXCOORD_NONPROJ (Right?)
+       */
+      t->pp_txformat_x |= R200_TEXCOORD_PROJ;
    }
 
-   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+   t->pp_txsize = (((firstImage->Width - 1) << R200_PP_TX_WIDTHMASK_SHIFT)
+		   | ((firstImage->Height - 1) << R200_PP_TX_HEIGHTMASK_SHIFT));
 
-   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
-        t->base.dirty_images[2] || t->base.dirty_images[3] ||
-        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
-      /* flush */
-      R200_FIREVERTICES( rmesa );
-      /* layout memory space, once for all faces */
-      r200SetTexImages( rmesa, tObj );
-   }
-
-   /* upload (per face) */
-   for (face = 0; face < 6; face++) {
-      if (t->base.dirty_images[face]) {
-         r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, face );
-      }
-   }
-      
-   if ( !t->base.memBlock ) {
-      /* texmem alloc failed, use s/w fallback */
-      return GL_FALSE;
+   if ( !t->image_override ) {
+      if (firstImage->IsCompressed)
+         t->pp_txpitch = (firstImage->Width + 63) & ~(63);
+      else
+         t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
+      t->pp_txpitch -= 32;
    }
 
-   set_re_cntl_d3d( ctx, unit, GL_TRUE );
-
-   return GL_TRUE;
-}
-
-static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
-{
-   r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   if (!(t->pp_txformat & R200_TXFORMAT_NON_POWER2)) {
+   if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
       t->pp_txformat |= R200_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
-   }
-
-   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
-
-   if ( t->base.dirty_images[0] ) {
-      R200_FIREVERTICES( rmesa );
-      r200SetTexImages( rmesa, tObj );
-      r200UploadTexImages( rmesa, (r200TexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock &&
-           !t->image_override &&
-           !rmesa->prefer_gart_client_texturing ) 
-	 return GL_FALSE;
    }
 
-   set_re_cntl_d3d( ctx, unit, GL_FALSE );
-
-   return GL_TRUE;
 }
 
-
-static GLboolean update_tex_common( GLcontext *ctx, int unit )
+static GLboolean r200_validate_texture(GLcontext *ctx, struct gl_texture_object *texObj, int unit)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   r200TexObjPtr t = (r200TexObjPtr) tObj->DriverData;
-
-   /* Fallback if there's a texture border */
-   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 )
-       return GL_FALSE;
-
-   /* Update state if this is a different texture object to last
-    * time.
-    */
-   if ( rmesa->state.texture.unit[unit].texobj != t ) {
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
+   radeonTexObj *t = radeon_tex_obj(texObj);
 
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
-	     ~(1UL << unit);
-      }
-
-      rmesa->state.texture.unit[unit].texobj = t;
-      t->base.bound |= (1UL << unit);
-      t->dirty_state |= 1<<unit;
-      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
-   }
-
-
-   /* Newly enabled?
-    */
-   if ( 1|| !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (R200_TEX_0_ENABLE<<unit))) {
-      R200_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
-
-      R200_STATECHANGE( rmesa, vtx );
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
-      rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] |= 4 << (unit * 3);
+   if (!radeon_validate_texture_miptree(ctx, texObj))
+      return GL_FALSE;
 
-      rmesa->recheck_texgen[unit] = GL_TRUE;
-   }
+   r200_validate_texgen(ctx, unit);
+   /* Configure the hardware registers (more precisely, the cached version
+    * of the hardware registers). */
+   setup_hardware_state(rmesa, t);
+
+   if (texObj->Target == GL_TEXTURE_RECTANGLE_NV ||
+       texObj->Target == GL_TEXTURE_2D ||
+       texObj->Target == GL_TEXTURE_1D)
+      set_re_cntl_d3d( ctx, unit, GL_FALSE );
+   else
+      set_re_cntl_d3d( ctx, unit, GL_TRUE );
+   R200_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_0_ENABLE << unit;
+   
+   R200_STATECHANGE( rmesa, vtx );
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] &= ~(7 << (unit * 3));
+   rmesa->hw.vtx.cmd[VTX_TCL_OUTPUT_VTXFMT_1] |= 4 << (unit * 3);
 
-   if (t->dirty_state & (1<<unit)) {
-      import_tex_obj_state( rmesa, unit, t );
-   }
+   rmesa->recheck_texgen[unit] = GL_TRUE;
+   import_tex_obj_state( rmesa, unit, t );
 
    if (rmesa->recheck_texgen[unit]) {
       GLboolean fallback = !r200_validate_texgen( ctx, unit );
       TCL_FALLBACK( ctx, (R200_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
       rmesa->recheck_texgen[unit] = 0;
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
    }
 
-   FALLBACK( rmesa, R200_FALLBACK_BORDER_MODE, t->border_fallback );
-   return !t->border_fallback;
-}
+   t->validated = GL_TRUE;
 
+   FALLBACK( rmesa, RADEON_FALLBACK_BORDER_MODE, t->border_fallback );
 
+   return !t->border_fallback;
+}
 
-static GLboolean r200UpdateTextureUnit( GLcontext *ctx, int unit )
+static GLboolean r200UpdateTextureUnit(GLcontext *ctx, int unit)
 {
    r200ContextPtr rmesa = R200_CONTEXT(ctx);
    GLuint unitneeded = rmesa->state.texture.unit[unit].unitneeded;
 
-   if ( unitneeded & (TEXTURE_RECT_BIT) ) {
-      return (enable_tex_rect( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( unitneeded & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
-      return (enable_tex_2d( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-#if ENABLE_HW_3D_TEXTURE
-   else if ( unitneeded & (TEXTURE_3D_BIT) ) {
-      return (enable_tex_3d( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-#endif
-   else if ( unitneeded & (TEXTURE_CUBE_BIT) ) {
-      return (enable_tex_cube( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( unitneeded ) {
-      return GL_FALSE;
-   }
-   else {
-      disable_tex( ctx, unit );
-      return GL_TRUE;
+   if (!unitneeded) {
+      /* disable the unit */
+     disable_tex_obj_state(rmesa, unit);
+     return GL_TRUE;
    }
+
+   if (!r200_validate_texture(ctx, ctx->Texture.Unit[unit]._Current, unit)) {
+    _mesa_warning(ctx,
+		  "failed to validate texture for unit %d.\n",
+		  unit);
+    rmesa->state.texture.unit[unit].texobj = NULL;
+    return GL_FALSE;
+  }
+
+   rmesa->state.texture.unit[unit].texobj = radeon_tex_obj(ctx->Texture.Unit[unit]._Current);
+  return GL_TRUE;
 }
 
 
@@ -1850,11 +1619,11 @@ void r200UpdateTextureState( GLcontext *ctx )
 
    FALLBACK( rmesa, R200_FALLBACK_TEXTURE, !ok );
 
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       r200ChooseVertexState( ctx );
 
 
-   if (rmesa->r200Screen->chip_family == CHIP_FAMILY_R200) {
+   if (rmesa->radeon.radeonScreen->chip_family == CHIP_FAMILY_R200) {
 
       /*
        * T0 hang workaround -------------
@@ -1867,7 +1636,7 @@ void r200UpdateTextureState( GLcontext *ctx )
 	 R200_STATECHANGE(rmesa, tex[1]);
 	 rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= R200_TEX_1_ENABLE;
 	 if (!(rmesa->hw.cst.cmd[CST_PP_CNTL_X] & R200_PPX_TEX_1_ENABLE))
-	    rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
+	   rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
 	 rmesa->hw.tex[1].cmd[TEX_PP_TXFORMAT] |= R200_TXFORMAT_LOOKUP_DISABLE;
       }
       else if (!ctx->ATIFragmentShader._Enabled) {
diff --git a/src/mesa/drivers/dri/r200/r200_vertprog.c b/src/mesa/drivers/dri/r200/r200_vertprog.c
index 4ce93b51453..620f29b5c6e 100644
--- a/src/mesa/drivers/dri/r200/r200_vertprog.c
+++ b/src/mesa/drivers/dri/r200/r200_vertprog.c
@@ -1110,9 +1110,9 @@ void r200SetupVertexProg( GLcontext *ctx ) {
    }
    /* could optimize setting up vertex progs away for non-tcl hw */
    fallback = !(vp->native && r200VertexProgUpdateParams(ctx, vp) &&
-      rmesa->r200Screen->drmSupportsVertexProgram);
+      rmesa->radeon.radeonScreen->drmSupportsVertexProgram);
    TCL_FALLBACK(ctx, R200_TCL_FALLBACK_VERTEX_PROGRAM, fallback);
-   if (rmesa->TclFallback) return;
+   if (rmesa->radeon.TclFallback) return;
 
    R200_STATECHANGE( rmesa, vap );
    /* FIXME: fglrx sets R200_VAP_SINGLE_BUF_STATE_ENABLE too. Do we need it?
diff --git a/src/mesa/drivers/dri/r300/.gitignore b/src/mesa/drivers/dri/r300/.gitignore
index 3689a6a78e5..2f9cd1a987b 100644
--- a/src/mesa/drivers/dri/r300/.gitignore
+++ b/src/mesa/drivers/dri/r300/.gitignore
@@ -1,4 +1,15 @@
+radeon_bocs_wrapper.h
+radeon_bo_legacy.[ch]
 radeon_chipset.h
+radeon_cmdbuf.h
+radeon_common.[ch]
+radeon_common_context.[ch]
+radeon_cs_legacy.[ch]
+radeon_dma.[ch]
+radeon_fbo.c
+radeon_lock.[ch]
+radeon_mipmap_tree.[ch]
 radeon_screen.[ch]
-radeon_span.h
+radeon_span.[ch]
+radeon_texture.[ch]
 server
diff --git a/src/mesa/drivers/dri/r300/Makefile b/src/mesa/drivers/dri/r300/Makefile
index 6ca934204f3..a77209074ae 100644
--- a/src/mesa/drivers/dri/r300/Makefile
+++ b/src/mesa/drivers/dri/r300/Makefile
@@ -3,6 +3,8 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
+CFLAGS += $(RADEON_CFLAGS)
+
 LIBNAME = r300_dri.so
 
 MINIGLX_SOURCES = server/radeon_dri.c
@@ -11,6 +13,10 @@ ifeq ($(USING_EGL), 1)
 EGL_SOURCES = server/radeon_egl.c
 endif
 
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
 COMMON_SOURCES = \
 	../../common/driverfuncs.c \
 	../common/mm.c \
@@ -20,20 +26,26 @@ COMMON_SOURCES = \
 	../common/xmlconfig.c \
 	../common/dri_util.c
 
+RADEON_COMMON_SOURCES = \
+	radeon_texture.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_dma.c \
+	radeon_lock.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_mipmap_tree.c \
+	radeon_span.c \
+	radeon_fbo.c
+
 DRIVER_SOURCES = \
 		 radeon_screen.c \
-		 radeon_context.c \
-		 radeon_ioctl.c \
-		 radeon_lock.c \
-		 radeon_span.c \
-		 radeon_state.c \
-		 r300_mem.c \
 		 r300_context.c \
+		 r300_draw.c \
 		 r300_ioctl.c \
 		 r300_cmdbuf.c \
 		 r300_state.c \
 		 r300_render.c \
-		 r300_texmem.c \
 		 r300_tex.c \
 		 r300_texstate.c \
 		 radeon_program.c \
@@ -41,6 +53,7 @@ DRIVER_SOURCES = \
 		 radeon_program_pair.c \
 		 radeon_nqssadce.c \
 		 r300_vertprog.c \
+		 r300_fragprog_common.c \
 		 r300_fragprog.c \
 		 r300_fragprog_swizzle.c \
 		 r300_fragprog_emit.c \
@@ -49,12 +62,16 @@ DRIVER_SOURCES = \
 		 r300_shader.c \
 		 r300_emit.c \
 		 r300_swtcl.c \
-		 $(EGL_SOURCES)
+		 $(RADEON_COMMON_SOURCES) \
+		 $(EGL_SOURCES) \
+		 $(CS_SOURCES)
 
 C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
 
 DRIVER_DEFINES = -DCOMPILE_R300 -DR200_MERGED=0 \
-	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300
+	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R300 \
+#	-DRADEON_BO_TRACK \
+	-Wall
 
 SYMLINKS = \
 	server/radeon_dri.c \
@@ -68,7 +85,30 @@ COMMON_SYMLINKS = \
 	radeon_chipset.h \
 	radeon_screen.c \
 	radeon_screen.h \
-	radeon_span.h
+	radeon_span.h \
+	radeon_span.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_bo_legacy.h \
+	radeon_cs_legacy.h \
+	radeon_bocs_wrapper.h \
+	radeon_lock.c \
+	radeon_lock.h \
+	radeon_common.c \
+	radeon_common.h \
+	radeon_common_context.c \
+	radeon_common_context.h \
+	radeon_cmdbuf.h \
+	radeon_dma.c \
+	radeon_dma.h \
+	radeon_mipmap_tree.c \
+	radeon_mipmap_tree.h \
+	radeon_texture.c \
+	radeon_texture.h \
+	radeon_fbo.c \
+	$(CS_SOURCES)
+
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
 
 ##### TARGETS #####
 
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.c b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
index f4472756f10..af535037d06 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.c
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.c
@@ -44,245 +44,438 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "radeon_drm.h"
 
-#include "radeon_ioctl.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "radeon_reg.h"
 #include "r300_reg.h"
 #include "r300_cmdbuf.h"
 #include "r300_emit.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_state.h"
+#include "radeon_reg.h"
 
-// Set this to 1 for extremely verbose debugging of command buffers
-#define DEBUG_CMDBUF		0
-
-/**
- * Send the current command buffer via ioctl to the hardware.
+/** # of dwords reserved for additional instructions that may need to be written
+ * during flushing.
  */
-int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller)
+#define SPACE_FOR_FLUSHING	4
+
+static unsigned packet0_count(r300ContextPtr r300, uint32_t *pkt)
 {
-	int ret;
-	int i;
-	drm_radeon_cmd_buffer_t cmd;
-	int start;
-
-	if (r300->radeon.lost_context) {
-		start = 0;
-		r300->radeon.lost_context = GL_FALSE;
-	} else
-		start = r300->cmdbuf.count_reemit;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "%s from %s - %i cliprects\n",
-			__FUNCTION__, caller, r300->radeon.numClipRects);
-
-		if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_VERBOSE)
-			for (i = start; i < r300->cmdbuf.count_used; ++i)
-				fprintf(stderr, "%d: %08x\n", i,
-					r300->cmdbuf.cmd_buf[i]);
-	}
+    if (r300->radeon.radeonScreen->kernel_mm) {
+        return ((((*pkt) >> 16) & 0x3FFF) + 1);
+    } else {
+        drm_r300_cmd_header_t *t = (drm_r300_cmd_header_t*)pkt;
+        return t->packet0.count;
+    }
+}
 
-	cmd.buf = (char *)(r300->cmdbuf.cmd_buf + start);
-	cmd.bufsz = (r300->cmdbuf.count_used - start) * 4;
+#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
+#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
 
-	if (r300->radeon.state.scissor.enabled) {
-		cmd.nbox = r300->radeon.state.scissor.numClipRects;
-		cmd.boxes =
-		    (drm_clip_rect_t *) r300->radeon.state.scissor.pClipRects;
-	} else {
-		cmd.nbox = r300->radeon.numClipRects;
-		cmd.boxes = (drm_clip_rect_t *) r300->radeon.pClipRects;
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	drm_r300_cmd_header_t cmd;
+	uint32_t addr, ndw, i;
+
+	if (!r300->radeon.radeonScreen->kernel_mm) {
+		uint32_t dwords;
+		dwords = (*atom->check) (ctx, atom);
+		BEGIN_BATCH_NO_AUTOSTATE(dwords);
+		OUT_BATCH_TABLE(atom->cmd, dwords);
+		END_BATCH();
+		return;
 	}
 
-	ret = drmCommandWrite(r300->radeon.dri.fd,
-			      DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "Syncing in %s (from %s)\n\n",
-			__FUNCTION__, caller);
-		radeonWaitForIdleLocked(&r300->radeon);
+	cmd.u = atom->cmd[0];
+	addr = (cmd.vpu.adrhi << 8) | cmd.vpu.adrlo;
+	ndw = cmd.vpu.count * 4;
+	if (ndw) {
+
+		if (r300->vap_flush_needed) {
+			BEGIN_BATCH_NO_AUTOSTATE(15 + ndw);
+
+			/* flush processing vertices */
+			OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0);
+			OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+			OUT_BATCH_REGVAL(RADEON_WAIT_UNTIL, RADEON_WAIT_3D_IDLECLEAN);
+			OUT_BATCH_REGVAL(R300_SC_SCREENDOOR, 0xffffff);
+			OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+			r300->vap_flush_needed = GL_FALSE;
+		} else {
+			BEGIN_BATCH_NO_AUTOSTATE(5 + ndw);
+		}
+		OUT_BATCH_REGVAL(R300_VAP_PVS_VECTOR_INDX_REG, addr);
+		OUT_BATCH(CP_PACKET0(R300_VAP_PVS_UPLOAD_DATA, ndw-1) | RADEON_ONE_REG_WR);
+		for (i = 0; i < ndw; i++) {
+			OUT_BATCH(atom->cmd[i+1]);
+		}
+		OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+		END_BATCH();
 	}
-
-	r300->dma.nr_released_bufs = 0;
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
-
-	return ret;
 }
 
-int r300FlushCmdBuf(r300ContextPtr r300, const char *caller)
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	int ret;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	drm_r300_cmd_header_t cmd;
+	uint32_t addr, ndw, i, sz;
+	int type, clamp, stride;
+
+	if (!r300->radeon.radeonScreen->kernel_mm) {
+		uint32_t dwords;
+		dwords = (*atom->check) (ctx, atom);
+		BEGIN_BATCH_NO_AUTOSTATE(dwords);
+		OUT_BATCH_TABLE(atom->cmd, dwords);
+		END_BATCH();
+		return;
+	}
 
-	LOCK_HARDWARE(&r300->radeon);
+	cmd.u = atom->cmd[0];
+	sz = cmd.r500fp.count;
+	addr = ((cmd.r500fp.adrhi_flags & 1) << 8) | cmd.r500fp.adrlo;
+	type = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_TYPE);
+	clamp = !!(cmd.r500fp.adrhi_flags & R500FP_CONSTANT_CLAMP);
 
-	ret = r300FlushCmdBufLocked(r300, caller);
+	addr |= (type << 16);
+	addr |= (clamp << 17);
 
-	UNLOCK_HARDWARE(&r300->radeon);
+	stride = type ? 4 : 6;
 
-	if (ret) {
-		fprintf(stderr, "drmRadeonCmdBuffer: %d\n", ret);
-		_mesa_exit(ret);
-	}
+	ndw = sz * stride;
+	if (ndw) {
 
-	return ret;
+		BEGIN_BATCH_NO_AUTOSTATE(3 + ndw);
+		OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_INDEX, 0));
+		OUT_BATCH(addr);
+		OUT_BATCH(CP_PACKET0(R500_GA_US_VECTOR_DATA, ndw-1) | RADEON_ONE_REG_WR);
+		for (i = 0; i < ndw; i++) {
+			OUT_BATCH(atom->cmd[i+1]);
+		}
+		END_BATCH();
+	}
 }
 
-static void r300PrintStateAtom(r300ContextPtr r300, struct r300_state_atom *state)
+static void emit_tex_offsets(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	int i, j, reg;
-	int dwords = (*state->check) (r300, state);
-	drm_r300_cmd_header_t cmd;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	int numtmus = packet0_count(r300, r300->hw.tex.offset.cmd);
+	int notexture = 0;
 
-	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords,
-		state->cmd_size);
-
-	if (RADEON_DEBUG & DEBUG_VERBOSE) {
-		for (i = 0; i < dwords;) {
-			cmd = *((drm_r300_cmd_header_t *) &state->cmd[i]);
-			reg = (cmd.packet0.reghi << 8) | cmd.packet0.reglo;
-			fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
-					state->name, i, reg, cmd.packet0.count);
-			++i;
-			for (j = 0; j < cmd.packet0.count; j++) {
-				fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
-					state->name, i, reg, state->cmd[i]);
-				reg += 4;
-				++i;
-			}
+	if (numtmus) {
+		int i;
+
+		for(i = 0; i < numtmus; ++i) {
+		    radeonTexObj *t = r300->hw.textures[i];
+
+		    if (!t)
+			notexture = 1;
+		}
+
+		if (r300->radeon.radeonScreen->kernel_mm && notexture) {
+			return;
+		}
+		for(i = 0; i < numtmus; ++i) {
+		    radeonTexObj *t = r300->hw.textures[i];
+		    if (t && !t->image_override) {
+                BEGIN_BATCH_NO_AUTOSTATE(4);
+                OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+			    OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+					    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                END_BATCH();
+		    } else if (!t) {
+                /* Texture unit hasn't a texture bound nothings to do */
+		    } else { /* override cases */
+			    if (t->bo) {
+                    BEGIN_BATCH_NO_AUTOSTATE(4);
+                    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+				    OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+						    RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+                    END_BATCH();
+			    } else if (!r300->radeon.radeonScreen->kernel_mm) {
+                    BEGIN_BATCH_NO_AUTOSTATE(2);
+                    OUT_BATCH_REGSEQ(R300_TX_OFFSET_0 + (i * 4), 1);
+				    OUT_BATCH(t->override_offset);
+                    END_BATCH();
+			    } else {
+                    /* Texture unit hasn't a texture bound nothings to do */
+                }
+		    }
 		}
 	}
 }
 
-/**
- * Emit all atoms with a dirty field equal to dirty.
- *
- * The caller must have ensured that there is enough space in the command
- * buffer.
- */
-static INLINE void r300EmitAtoms(r300ContextPtr r300, GLboolean dirty)
+void r300_emit_scissor(GLcontext *ctx)
 {
-	struct r300_state_atom *atom;
-	uint32_t *dest;
-	int dwords;
-
-	dest = r300->cmdbuf.cmd_buf + r300->cmdbuf.count_used;
-
-	/* Emit WAIT */
-	*dest = cmdwait(R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit cache flush */
-	*dest = cmdpacket0(R300_TX_INVALTAGS, 1);
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	*dest = R300_TX_FLUSH;
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit END3D */
-	*dest = cmdpacify();
-	dest++;
-	r300->cmdbuf.count_used++;
-
-	/* Emit actual atoms */
-
-	foreach(atom, &r300->hw.atomlist) {
-		if ((atom->dirty || r300->hw.all_dirty) == dirty) {
-			dwords = (*atom->check) (r300, atom);
-			if (dwords) {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					r300PrintStateAtom(r300, atom);
-				}
-				memcpy(dest, atom->cmd, dwords * 4);
-				dest += dwords;
-				r300->cmdbuf.count_used += dwords;
-				atom->dirty = GL_FALSE;
-			} else {
-				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
-					fprintf(stderr, "  skip state %s\n",
-						atom->name);
-				}
-			}
-		}
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    unsigned x1, y1, x2, y2;
+	struct radeon_renderbuffer *rrb;
+
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        return;
+    }
+	rrb = radeon_get_colorbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return;
 	}
+    if (r300->radeon.state.scissor.enabled) {
+        x1 = r300->radeon.state.scissor.rect.x1;
+        y1 = r300->radeon.state.scissor.rect.y1;
+        x2 = r300->radeon.state.scissor.rect.x2 - 1;
+        y2 = r300->radeon.state.scissor.rect.y2 - 1;
+    } else {
+        x1 = 0;
+        y1 = 0;
+        x2 = rrb->base.Width - 1;
+        y2 = rrb->base.Height - 1;
+    }
+    if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
+        x1 += R300_SCISSORS_OFFSET;
+        y1 += R300_SCISSORS_OFFSET;
+        x2 += R300_SCISSORS_OFFSET;
+        y2 += R300_SCISSORS_OFFSET;
+    }
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+    OUT_BATCH((x1 << R300_SCISSORS_X_SHIFT)|(y1 << R300_SCISSORS_Y_SHIFT));
+    OUT_BATCH((x2 << R300_SCISSORS_X_SHIFT)|(y2 << R300_SCISSORS_Y_SHIFT));
+    END_BATCH();
 }
 
-/**
- * Copy dirty hardware state atoms into the command buffer.
- *
- * We also copy out clean state if we're at the start of a buffer. That makes
- * it easy to recover from lost contexts.
- */
-void r300EmitState(r300ContextPtr r300)
+static void emit_cb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
 {
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_PRIMS))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (r300->cmdbuf.count_used && !r300->hw.is_dirty
-	    && !r300->hw.all_dirty)
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb;
+	uint32_t cbpitch;
+	uint32_t offset = r300->radeon.state.color.draw_offset;
+	uint32_t dw = 6;
+    int i;
+
+	rrb = radeon_get_colorbuffer(&r300->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
 		return;
+	}
+
+        if (RADEON_DEBUG & DEBUG_STATE)
+           fprintf(stderr,"rrb is %p %d %dx%d\n", rrb, offset, rrb->base.Width, rrb->base.Height);
+	cbpitch = (rrb->pitch / rrb->cpp);
+	if (rrb->cpp == 4)
+		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+	else switch (rrb->base._ActualFormat) {
+	case GL_RGB5:
+		cbpitch |= R300_COLOR_FORMAT_RGB565;
+		break;
+	case GL_RGBA4:
+		cbpitch |= R300_COLOR_FORMAT_ARGB4444;
+		break;
+	case GL_RGB5_A1:
+		cbpitch |= R300_COLOR_FORMAT_ARGB1555;
+		break;
+	}
 
-	/* To avoid going across the entire set of states multiple times, just check
-	 * for enough space for the case of emitting all state, and inline the
-	 * r300AllocCmdBuf code here without all the checks.
-	 */
-	r300EnsureCmdBufSpace(r300, r300->hw.max_state_size, __FUNCTION__);
+	if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+		cbpitch |= R300_COLOR_TILE_ENABLE;
+
+    	if (r300->radeon.radeonScreen->kernel_mm)
+		dw += 2;
+	BEGIN_BATCH_NO_AUTOSTATE(dw);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+	OUT_BATCH_RELOC(offset, rrb->bo, offset, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	OUT_BATCH_REGSEQ(R300_RB3D_COLORPITCH0, 1);
+    	if (!r300->radeon.radeonScreen->kernel_mm)
+		OUT_BATCH(cbpitch);
+	else
+		OUT_BATCH_RELOC(cbpitch, rrb->bo, cbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	END_BATCH();
+    if (r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH(0);
+            OUT_BATCH(((rrb->base.Width - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->base.Height - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((0 << R300_CLIPRECT_X_SHIFT) | (0 << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((rrb->base.Width - 1) << R300_CLIPRECT_X_SHIFT) | ((rrb->base.Height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
+            END_BATCH();
+        } else {
+            BEGIN_BATCH_NO_AUTOSTATE(3);
+            OUT_BATCH_REGSEQ(R300_SC_SCISSORS_TL, 2);
+            OUT_BATCH((R300_SCISSORS_OFFSET << R300_SCISSORS_X_SHIFT) |
+                    (R300_SCISSORS_OFFSET << R300_SCISSORS_Y_SHIFT));
+            OUT_BATCH(((rrb->base.Width + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_X_SHIFT) |
+                    ((rrb->base.Height + R300_SCISSORS_OFFSET - 1) << R300_SCISSORS_Y_SHIFT));
+            END_BATCH();
+            BEGIN_BATCH_NO_AUTOSTATE(16);
+            for (i = 0; i < 4; i++) {
+                OUT_BATCH_REGSEQ(R300_SC_CLIPRECT_TL_0 + (i * 8), 2);
+                OUT_BATCH((R300_SCISSORS_OFFSET << R300_CLIPRECT_X_SHIFT) | (R300_SCISSORS_OFFSET << R300_CLIPRECT_Y_SHIFT));
+                OUT_BATCH(((R300_SCISSORS_OFFSET + rrb->base.Width - 1) << R300_CLIPRECT_X_SHIFT) |
+                          ((R300_SCISSORS_OFFSET + rrb->base.Height - 1) << R300_CLIPRECT_Y_SHIFT));
+            }
+            OUT_BATCH_REGSEQ(R300_SC_CLIP_RULE, 1);
+            OUT_BATCH(0xAAAA);
+            OUT_BATCH_REGSEQ(R300_SC_SCREENDOOR, 1);
+            OUT_BATCH(0xffffff);
+            END_BATCH();
+        }
+    }
+}
 
-	if (!r300->cmdbuf.count_used) {
-		if (RADEON_DEBUG & DEBUG_STATE)
-			fprintf(stderr, "Begin reemit state\n");
+static void emit_zb_offset(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb;
+	uint32_t zbpitch;
+	uint32_t dw;
+
+	rrb = radeon_get_depthbuffer(&r300->radeon);
+	if (!rrb)
+		return;
 
-		r300EmitAtoms(r300, GL_FALSE);
-		r300->cmdbuf.count_reemit = r300->cmdbuf.count_used;
+	zbpitch = (rrb->pitch / rrb->cpp);
+	if (!r300->radeon.radeonScreen->kernel_mm) {
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+	        zbpitch |= R300_DEPTHMACROTILE_ENABLE;
+	   }
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+	        zbpitch |= R300_DEPTHMICROTILE_TILED;
+	    }
 	}
 
-	if (RADEON_DEBUG & DEBUG_STATE)
-		fprintf(stderr, "Begin dirty state\n");
+	dw = 6;
+    	if (r300->radeon.radeonScreen->kernel_mm)
+		dw += 2;
+	BEGIN_BATCH_NO_AUTOSTATE(dw);
+	OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+	OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	OUT_BATCH_REGSEQ(R300_ZB_DEPTHPITCH, 1);
+    	if (!r300->radeon.radeonScreen->kernel_mm)
+	    OUT_BATCH(zbpitch);
+	else
+	    OUT_BATCH_RELOC(cbpitch, rrb->bo, zbpitch, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+	END_BATCH();
+}
 
-	r300EmitAtoms(r300, GL_TRUE);
+static void emit_gb_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        BEGIN_BATCH_NO_AUTOSTATE(4);
+        OUT_BATCH(atom->cmd[0]);
+        OUT_BATCH(atom->cmd[1]);
+        OUT_BATCH(atom->cmd[2]);
+        OUT_BATCH(atom->cmd[3]);
+        END_BATCH();
+    }
+}
 
-	assert(r300->cmdbuf.count_used < r300->cmdbuf.size);
+static void emit_threshold_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+        OUT_BATCH(atom->cmd[0]);
+        OUT_BATCH(atom->cmd[1]);
+        OUT_BATCH(atom->cmd[2]);
+        END_BATCH();
+    }
+}
 
-	r300->hw.is_dirty = GL_FALSE;
-	r300->hw.all_dirty = GL_FALSE;
+static void emit_shade_misc(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+
+    if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+        BEGIN_BATCH_NO_AUTOSTATE(2);
+        OUT_BATCH(atom->cmd[0]);
+        OUT_BATCH(atom->cmd[1]);
+        END_BATCH();
+    }
 }
 
-#define packet0_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->packet0.count)
-#define vpu_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->vpu.count)
-#define r500fp_count(ptr) (((drm_r300_cmd_header_t*)(ptr))->r500fp.count)
+static void emit_zstencil_format(GLcontext *ctx, struct radeon_state_atom * atom)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&r300->radeon);
+	struct radeon_renderbuffer *rrb;
+	uint32_t format = 0;
+
+	rrb = radeon_get_depthbuffer(&r300->radeon);
+	if (!rrb)
+	  format = 0;
+	else {
+	  if (rrb->cpp == 2)
+	    format = R300_DEPTHFORMAT_16BIT_INT_Z;
+	  else if (rrb->cpp == 4)
+	    format = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
+	}
+
+	OUT_BATCH(atom->cmd[0]);
+	atom->cmd[1] &= ~0xf;
+	atom->cmd[1] |= format;
+	OUT_BATCH(atom->cmd[1]);
+	OUT_BATCH(atom->cmd[2]);
+	OUT_BATCH(atom->cmd[3]);
+	OUT_BATCH(atom->cmd[4]);
+}
 
-static int check_always(r300ContextPtr r300, struct r300_state_atom *atom)
+static int check_always(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	return atom->cmd_size;
 }
 
-static int check_variable(r300ContextPtr r300, struct r300_state_atom *atom)
+static int check_variable(GLcontext *ctx, struct radeon_state_atom *atom)
 {
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int cnt;
-	cnt = packet0_count(atom->cmd);
+	if (atom->cmd[0] == CP_PACKET2) {
+		return 0;
+	}
+	cnt = packet0_count(r300, atom->cmd);
 	return cnt ? cnt + 1 : 0;
 }
 
-static int check_vpu(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
+
 	cnt = vpu_count(atom->cmd);
 	return cnt ? (cnt * 4) + 1 : 0;
 }
 
-static int check_r500fp(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
+
 	cnt = r500fp_count(atom->cmd);
 	return cnt ? (cnt * 6) + 1 : 0;
 }
 
-static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom)
 {
 	int cnt;
+
 	cnt = r500fp_count(atom->cmd);
 	return cnt ? (cnt * 4) + 1 : 0;
 }
@@ -295,8 +488,8 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
       r300->hw.ATOM.idx = (IDX);					\
       r300->hw.ATOM.check = check_##CHK;				\
       r300->hw.ATOM.dirty = GL_FALSE;					\
-      r300->hw.max_state_size += (SZ);					\
-      insert_at_tail(&r300->hw.atomlist, &r300->hw.ATOM);		\
+      r300->radeon.hw.max_state_size += (SZ);					\
+      insert_at_tail(&r300->radeon.hw.atomlist, &r300->hw.ATOM);		\
    } while (0)
 /**
  * Allocate memory for the command buffer and initialize the state atom
@@ -304,18 +497,16 @@ static int check_r500fp_const(r300ContextPtr r300, struct r300_state_atom *atom)
  */
 void r300InitCmdBuf(r300ContextPtr r300)
 {
-	int size, mtu;
-	int has_tcl = 1;
+	int mtu;
+	int has_tcl;
 	int is_r500 = 0;
-	int i;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
 		is_r500 = 1;
 
-	r300->hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
+	r300->radeon.hw.max_state_size = 2 + 2;	/* reserve extra space for WAIT_IDLE and tex cache flush */
 
 	mtu = r300->radeon.glCtx->Const.MaxTextureUnits;
 	if (RADEON_DEBUG & DEBUG_TEXTURE) {
@@ -323,232 +514,248 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	}
 
 	/* Setup the atom linked list */
-	make_empty_list(&r300->hw.atomlist);
-	r300->hw.atomlist.name = "atom-list";
+	make_empty_list(&r300->radeon.hw.atomlist);
+	r300->radeon.hw.atomlist.name = "atom-list";
 
 	/* Initialize state atoms */
 	ALLOC_STATE(vpt, always, R300_VPT_CMDSIZE, 0);
-	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(R300_SE_VPORT_XSCALE, 6);
+	r300->hw.vpt.cmd[R300_VPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VPORT_XSCALE, 6);
 	ALLOC_STATE(vap_cntl, always, R300_VAP_CNTL_SIZE, 0);
-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(R300_VAP_PVS_STATE_FLUSH_REG, 1);
+	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_STATE_FLUSH_REG, 1);
 	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_FLUSH_1] = 0;
-	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(R300_VAP_CNTL, 1);
-	if (is_r500) {
+	r300->hw.vap_cntl.cmd[R300_VAP_CNTL_CMD] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL, 1);
+	if (is_r500 && !r300->radeon.radeonScreen->kernel_mm) {
 	    ALLOC_STATE(vap_index_offset, always, 2, 0);
-	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(R500_VAP_INDEX_OFFSET, 1);
+	    r300->hw.vap_index_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_VAP_INDEX_OFFSET, 1);
 	    r300->hw.vap_index_offset.cmd[1] = 0;
 	}
 	ALLOC_STATE(vte, always, 3, 0);
-	r300->hw.vte.cmd[0] = cmdpacket0(R300_SE_VTE_CNTL, 2);
+	r300->hw.vte.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SE_VTE_CNTL, 2);
 	ALLOC_STATE(vap_vf_max_vtx_indx, always, 3, 0);
-	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(R300_VAP_VF_MAX_VTX_INDX, 2);
+	r300->hw.vap_vf_max_vtx_indx.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VF_MAX_VTX_INDX, 2);
 	ALLOC_STATE(vap_cntl_status, always, 2, 0);
-	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(R300_VAP_CNTL_STATUS, 1);
+	r300->hw.vap_cntl_status.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CNTL_STATUS, 1);
 	ALLOC_STATE(vir[0], variable, R300_VIR_CMDSIZE, 0);
 	r300->hw.vir[0].cmd[R300_VIR_CMD_0] =
-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_0, 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_0, 1);
 	ALLOC_STATE(vir[1], variable, R300_VIR_CMDSIZE, 1);
 	r300->hw.vir[1].cmd[R300_VIR_CMD_0] =
-	    cmdpacket0(R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PROG_STREAM_CNTL_EXT_0, 1);
 	ALLOC_STATE(vic, always, R300_VIC_CMDSIZE, 0);
-	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(R300_VAP_VTX_STATE_CNTL, 2);
+	r300->hw.vic.cmd[R300_VIC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_VTX_STATE_CNTL, 2);
 	ALLOC_STATE(vap_psc_sgn_norm_cntl, always, 2, 0);
-	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
+	r300->hw.vap_psc_sgn_norm_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PSC_SGN_NORM_CNTL, SGN_NORM_ZERO_CLAMP_MINUS_ONE);
 
 	if (has_tcl) {
 		ALLOC_STATE(vap_clip_cntl, always, 2, 0);
-		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(R300_VAP_CLIP_CNTL, 1);
+		r300->hw.vap_clip_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_CLIP_CNTL, 1);
 		ALLOC_STATE(vap_clip, always, 5, 0);
-		r300->hw.vap_clip.cmd[0] = cmdpacket0(R300_VAP_GB_VERT_CLIP_ADJ, 4);
+		r300->hw.vap_clip.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_VAP_GB_VERT_CLIP_ADJ, 4);
 		ALLOC_STATE(vap_pvs_vtx_timeout_reg, always, 2, 0);
-		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(VAP_PVS_VTX_TIMEOUT_REG, 1);
+		r300->hw.vap_pvs_vtx_timeout_reg.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, VAP_PVS_VTX_TIMEOUT_REG, 1);
 	}
 
 	ALLOC_STATE(vof, always, R300_VOF_CMDSIZE, 0);
 	r300->hw.vof.cmd[R300_VOF_CMD_0] =
-	    cmdpacket0(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_OUTPUT_VTX_FMT_0, 2);
 
 	if (has_tcl) {
 		ALLOC_STATE(pvs, always, R300_PVS_CMDSIZE, 0);
 		r300->hw.pvs.cmd[R300_PVS_CMD_0] =
-		    cmdpacket0(R300_VAP_PVS_CODE_CNTL_0, 3);
+		    cmdpacket0(r300->radeon.radeonScreen, R300_VAP_PVS_CODE_CNTL_0, 3);
 	}
 
 	ALLOC_STATE(gb_enable, always, 2, 0);
-	r300->hw.gb_enable.cmd[0] = cmdpacket0(R300_GB_ENABLE, 1);
+	r300->hw.gb_enable.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_ENABLE, 1);
 	ALLOC_STATE(gb_misc, always, R300_GB_MISC_CMDSIZE, 0);
-	r300->hw.gb_misc.cmd[0] = cmdpacket0(R300_GB_MSPOS0, 5);
+	r300->hw.gb_misc.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GB_MSPOS0, 3);
+	r300->hw.gb_misc.emit = emit_gb_misc;
+	ALLOC_STATE(gb_misc2, always, R300_GB_MISC2_CMDSIZE, 0);
+    r300->hw.gb_misc2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x401C, 2);
 	ALLOC_STATE(txe, always, R300_TXE_CMDSIZE, 0);
-	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(R300_TX_ENABLE, 1);
+	r300->hw.txe.cmd[R300_TXE_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_ENABLE, 1);
 	ALLOC_STATE(ga_point_s0, always, 5, 0);
-	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(R300_GA_POINT_S0, 4);
+	r300->hw.ga_point_s0.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_S0, 4);
 	ALLOC_STATE(ga_triangle_stipple, always, 2, 0);
-	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(R300_GA_TRIANGLE_STIPPLE, 1);
+	r300->hw.ga_triangle_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_TRIANGLE_STIPPLE, 1);
 	ALLOC_STATE(ps, always, R300_PS_CMDSIZE, 0);
-	r300->hw.ps.cmd[0] = cmdpacket0(R300_GA_POINT_SIZE, 1);
+	r300->hw.ps.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_SIZE, 1);
 	ALLOC_STATE(ga_point_minmax, always, 4, 0);
-	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(R300_GA_POINT_MINMAX, 3);
+	r300->hw.ga_point_minmax.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POINT_MINMAX, 3);
 	ALLOC_STATE(lcntl, always, 2, 0);
-	r300->hw.lcntl.cmd[0] = cmdpacket0(R300_GA_LINE_CNTL, 1);
+	r300->hw.lcntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_CNTL, 1);
 	ALLOC_STATE(ga_line_stipple, always, 4, 0);
-	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(R300_GA_LINE_STIPPLE_VALUE, 3);
-	ALLOC_STATE(shade, always, 5, 0);
-	r300->hw.shade.cmd[0] = cmdpacket0(R300_GA_ENHANCE, 4);
+	r300->hw.ga_line_stipple.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_LINE_STIPPLE_VALUE, 3);
+	ALLOC_STATE(shade, always, 2, 0);
+	r300->hw.shade.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_ENHANCE, 1);
+	r300->hw.shade.emit = emit_shade_misc;
+	ALLOC_STATE(shade2, always, 4, 0);
+	r300->hw.shade2.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, 0x4278, 3);
 	ALLOC_STATE(polygon_mode, always, 4, 0);
-	r300->hw.polygon_mode.cmd[0] = cmdpacket0(R300_GA_POLY_MODE, 3);
+	r300->hw.polygon_mode.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_POLY_MODE, 3);
 	ALLOC_STATE(fogp, always, 3, 0);
-	r300->hw.fogp.cmd[0] = cmdpacket0(R300_GA_FOG_SCALE, 2);
+	r300->hw.fogp.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_GA_FOG_SCALE, 2);
 	ALLOC_STATE(zbias_cntl, always, 2, 0);
-	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(R300_SU_TEX_WRAP, 1);
+	r300->hw.zbias_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_TEX_WRAP, 1);
 	ALLOC_STATE(zbs, always, R300_ZBS_CMDSIZE, 0);
 	r300->hw.zbs.cmd[R300_ZBS_CMD_0] =
-	    cmdpacket0(R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_FRONT_SCALE, 4);
 	ALLOC_STATE(occlusion_cntl, always, 2, 0);
-	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(R300_SU_POLY_OFFSET_ENABLE, 1);
+	r300->hw.occlusion_cntl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_POLY_OFFSET_ENABLE, 1);
 	ALLOC_STATE(cul, always, R300_CUL_CMDSIZE, 0);
-	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(R300_SU_CULL_MODE, 1);
+	r300->hw.cul.cmd[R300_CUL_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_CULL_MODE, 1);
 	ALLOC_STATE(su_depth_scale, always, 3, 0);
-	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(R300_SU_DEPTH_SCALE, 2);
+	r300->hw.su_depth_scale.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SU_DEPTH_SCALE, 2);
 	ALLOC_STATE(rc, always, R300_RC_CMDSIZE, 0);
-	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(R300_RS_COUNT, 2);
+	r300->hw.rc.cmd[R300_RC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_COUNT, 2);
 	if (is_r500) {
-		ALLOC_STATE(ri, always, R500_RI_CMDSIZE, 0);
-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R500_RS_IP_0, 16);
-		for (i = 0; i < 8; i++) {
-			r300->hw.ri.cmd[R300_RI_CMD_0 + i +1] =
-			  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-                          (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-                          (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT);
-		}
+		ALLOC_STATE(ri, variable, R500_RI_CMDSIZE, 0);
+		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, 16);
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, 1);
+		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, 1);
 	} else {
-		ALLOC_STATE(ri, always, R300_RI_CMDSIZE, 0);
-		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(R300_RS_IP_0, 8);
+		ALLOC_STATE(ri, variable, R300_RI_CMDSIZE, 0);
+		r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, 8);
 		ALLOC_STATE(rr, variable, R300_RR_CMDSIZE, 0);
-		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, 1);
+		r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, 1);
 	}
 	ALLOC_STATE(sc_hyperz, always, 3, 0);
-	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(R300_SC_HYPERZ, 2);
+	r300->hw.sc_hyperz.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_HYPERZ, 2);
 	ALLOC_STATE(sc_screendoor, always, 2, 0);
-	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(R300_SC_SCREENDOOR, 1);
+	r300->hw.sc_screendoor.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_SC_SCREENDOOR, 1);
 	ALLOC_STATE(us_out_fmt, always, 6, 0);
-	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(R300_US_OUT_FMT, 5);
+	r300->hw.us_out_fmt.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_OUT_FMT, 5);
 
 	if (is_r500) {
 		ALLOC_STATE(fp, always, R500_FP_CMDSIZE, 0);
-		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(R500_US_CONFIG, 2);
+		r300->hw.fp.cmd[R500_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CONFIG, 2);
 		r300->hw.fp.cmd[R500_FP_CNTL] = R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO;
-		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(R500_US_CODE_ADDR, 3);
-		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(R500_US_FC_CTRL, 1);
+		r300->hw.fp.cmd[R500_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R500_US_CODE_ADDR, 3);
+		r300->hw.fp.cmd[R500_FP_CMD_2] = cmdpacket0(r300->radeon.radeonScreen, R500_US_FC_CTRL, 1);
 		r300->hw.fp.cmd[R500_FP_FC_CNTL] = 0; /* FIXME when we add flow control */
 
 		ALLOC_STATE(r500fp, r500fp, R500_FPI_CMDSIZE, 0);
-		r300->hw.r500fp.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 0, 0);
+		r300->hw.r500fp.cmd[R300_FPI_CMD_0] =
+			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 0, 0);
+		r300->hw.r500fp.emit = emit_r500fp;
 		ALLOC_STATE(r500fp_const, r500fp_const, R500_FPP_CMDSIZE, 0);
-		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] = cmdr500fp(0, 0, 1, 0);
+		r300->hw.r500fp_const.cmd[R300_FPI_CMD_0] =
+			cmdr500fp(r300->radeon.radeonScreen, 0, 0, 1, 0);
+		r300->hw.r500fp_const.emit = emit_r500fp;
 	} else {
 		ALLOC_STATE(fp, always, R300_FP_CMDSIZE, 0);
-		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(R300_US_CONFIG, 3);
-		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(R300_US_CODE_ADDR_0, 4);
+		r300->hw.fp.cmd[R300_FP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CONFIG, 3);
+		r300->hw.fp.cmd[R300_FP_CMD_1] = cmdpacket0(r300->radeon.radeonScreen, R300_US_CODE_ADDR_0, 4);
+
 		ALLOC_STATE(fpt, variable, R300_FPT_CMDSIZE, 0);
-		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(R300_US_TEX_INST_0, 0);
+		r300->hw.fpt.cmd[R300_FPT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_TEX_INST_0, 0);
 
 		ALLOC_STATE(fpi[0], variable, R300_FPI_CMDSIZE, 0);
-		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, 1);
+		r300->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, 1);
 		ALLOC_STATE(fpi[1], variable, R300_FPI_CMDSIZE, 1);
-		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, 1);
+		r300->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, 1);
 		ALLOC_STATE(fpi[2], variable, R300_FPI_CMDSIZE, 2);
-		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, 1);
+		r300->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, 1);
 		ALLOC_STATE(fpi[3], variable, R300_FPI_CMDSIZE, 3);
-		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, 1);
+		r300->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, 1);
 		ALLOC_STATE(fpp, variable, R300_FPP_CMDSIZE, 0);
-		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, 0);
+		r300->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_PFS_PARAM_0_X, 0);
 	}
 	ALLOC_STATE(fogs, always, R300_FOGS_CMDSIZE, 0);
-	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(R300_FG_FOG_BLEND, 1);
+	r300->hw.fogs.cmd[R300_FOGS_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_BLEND, 1);
 	ALLOC_STATE(fogc, always, R300_FOGC_CMDSIZE, 0);
-	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(R300_FG_FOG_COLOR_R, 3);
+	r300->hw.fogc.cmd[R300_FOGC_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_FOG_COLOR_R, 3);
 	ALLOC_STATE(at, always, R300_AT_CMDSIZE, 0);
-	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(R300_FG_ALPHA_FUNC, 2);
+	r300->hw.at.cmd[R300_AT_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_ALPHA_FUNC, 2);
 	ALLOC_STATE(fg_depth_src, always, 2, 0);
-	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(R300_FG_DEPTH_SRC, 1);
+	r300->hw.fg_depth_src.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_FG_DEPTH_SRC, 1);
 	ALLOC_STATE(rb3d_cctl, always, 2, 0);
-	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(R300_RB3D_CCTL, 1);
+	r300->hw.rb3d_cctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CCTL, 1);
 	ALLOC_STATE(bld, always, R300_BLD_CMDSIZE, 0);
-	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(R300_RB3D_CBLEND, 2);
+	r300->hw.bld.cmd[R300_BLD_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_CBLEND, 2);
 	ALLOC_STATE(cmk, always, R300_CMK_CMDSIZE, 0);
-	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(RB3D_COLOR_CHANNEL_MASK, 1);
+	r300->hw.cmk.cmd[R300_CMK_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, RB3D_COLOR_CHANNEL_MASK, 1);
 	if (is_r500) {
 		ALLOC_STATE(blend_color, always, 3, 0);
-		r300->hw.blend_color.cmd[0] = cmdpacket0(R500_RB3D_CONSTANT_COLOR_AR, 2);
+		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_CONSTANT_COLOR_AR, 2);
 	} else {
 		ALLOC_STATE(blend_color, always, 2, 0);
-		r300->hw.blend_color.cmd[0] = cmdpacket0(R300_RB3D_BLEND_COLOR, 1);
+		r300->hw.blend_color.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_BLEND_COLOR, 1);
 	}
 	ALLOC_STATE(rop, always, 2, 0);
-	r300->hw.rop.cmd[0] = cmdpacket0(R300_RB3D_ROPCNTL, 1);
+	r300->hw.rop.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_ROPCNTL, 1);
 	ALLOC_STATE(cb, always, R300_CB_CMDSIZE, 0);
-	r300->hw.cb.cmd[R300_CB_CMD_0] = cmdpacket0(R300_RB3D_COLOROFFSET0, 1);
-	r300->hw.cb.cmd[R300_CB_CMD_1] = cmdpacket0(R300_RB3D_COLORPITCH0, 1);
+	r300->hw.cb.emit = &emit_cb_offset;
 	ALLOC_STATE(rb3d_dither_ctl, always, 10, 0);
-	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(R300_RB3D_DITHER_CTL, 9);
+	r300->hw.rb3d_dither_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_DITHER_CTL, 9);
 	ALLOC_STATE(rb3d_aaresolve_ctl, always, 2, 0);
-	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(R300_RB3D_AARESOLVE_CTL, 1);
-	ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+	r300->hw.rb3d_aaresolve_ctl.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_RB3D_AARESOLVE_CTL, 1);
+    ALLOC_STATE(rb3d_discard_src_pixel_lte_threshold, always, 3, 0);
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 2);
+	r300->hw.rb3d_discard_src_pixel_lte_threshold.emit = emit_threshold_misc;
 	ALLOC_STATE(zs, always, R300_ZS_CMDSIZE, 0);
 	r300->hw.zs.cmd[R300_ZS_CMD_0] =
-	    cmdpacket0(R300_ZB_CNTL, 3);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_CNTL, 3);
+
 	ALLOC_STATE(zstencil_format, always, 5, 0);
 	r300->hw.zstencil_format.cmd[0] =
-	    cmdpacket0(R300_ZB_FORMAT, 4);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_ZB_FORMAT, 4);
+	r300->hw.zstencil_format.emit = emit_zstencil_format;
+
 	ALLOC_STATE(zb, always, R300_ZB_CMDSIZE, 0);
-	r300->hw.zb.cmd[R300_ZB_CMD_0] = cmdpacket0(R300_ZB_DEPTHOFFSET, 2);
+	r300->hw.zb.emit = emit_zb_offset;
 	ALLOC_STATE(zb_depthclearvalue, always, 2, 0);
-	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(R300_ZB_DEPTHCLEARVALUE, 1);
-	ALLOC_STATE(unk4F30, always, 3, 0);
-	r300->hw.unk4F30.cmd[0] = cmdpacket0(0x4F30, 2);
+	r300->hw.zb_depthclearvalue.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_DEPTHCLEARVALUE, 1);
+	ALLOC_STATE(zb_zmask, always, 3, 0);
+	r300->hw.zb_zmask.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_ZMASK_OFFSET, 2);
 	ALLOC_STATE(zb_hiz_offset, always, 2, 0);
-	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(R300_ZB_HIZ_OFFSET, 1);
+	r300->hw.zb_hiz_offset.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_OFFSET, 1);
 	ALLOC_STATE(zb_hiz_pitch, always, 2, 0);
-	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(R300_ZB_HIZ_PITCH, 1);
+	r300->hw.zb_hiz_pitch.cmd[0] = cmdpacket0(r300->radeon.radeonScreen, R300_ZB_HIZ_PITCH, 1);
 
 	/* VPU only on TCL */
 	if (has_tcl) {
    	        int i;
 		ALLOC_STATE(vpi, vpu, R300_VPI_CMDSIZE, 0);
-		r300->hw.vpi.cmd[R300_VPI_CMD_0] =
-		    cmdvpu(R300_PVS_CODE_START, 0);
+		r300->hw.vpi.cmd[0] =
+		    cmdvpu(r300->radeon.radeonScreen, R300_PVS_CODE_START, 0);
+		r300->hw.vpi.emit = emit_vpu;
 
 		if (is_r500) {
 		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
-			cmdvpu(R500_PVS_CONST_START, 0);
+		    r300->hw.vpp.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R500_PVS_CONST_START, 0);
+		    r300->hw.vpp.emit = emit_vpu;
 
 		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
-			cmdvpu(R500_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R500_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.emit = emit_vpu;
 
 			for (i = 0; i < 6; i++) {
-				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
-					cmdvpu(R500_PVS_UCP_START + i, 1);
+			  ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
+			  r300->hw.vpucp[i].cmd[0] =
+				  cmdvpu(r300->radeon.radeonScreen,
+                           R500_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].emit = emit_vpu;
 			}
 		} else {
 		    ALLOC_STATE(vpp, vpu, R300_VPP_CMDSIZE, 0);
-		    r300->hw.vpp.cmd[R300_VPP_CMD_0] =
-			cmdvpu(R300_PVS_CONST_START, 0);
+		    r300->hw.vpp.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R300_PVS_CONST_START, 0);
+		    r300->hw.vpp.emit = emit_vpu;
 
 		    ALLOC_STATE(vps, vpu, R300_VPS_CMDSIZE, 0);
-		    r300->hw.vps.cmd[R300_VPS_CMD_0] =
-			cmdvpu(R300_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.cmd[0] =
+			cmdvpu(r300->radeon.radeonScreen, R300_POINT_VPORT_SCALE_OFFSET, 1);
+		    r300->hw.vps.emit = emit_vpu;
 
 			for (i = 0; i < 6; i++) {
 				ALLOC_STATE(vpucp[i], vpu, R300_VPUCP_CMDSIZE, 0);
-				r300->hw.vpucp[i].cmd[R300_VPUCP_CMD_0] =
-					cmdvpu(R300_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].cmd[0] =
+					cmdvpu(r300->radeon.radeonScreen,
+					       R300_PVS_UCP_START + i, 1);
+				r300->hw.vpucp[i].emit = emit_vpu;
 			}
 		}
 	}
@@ -556,130 +763,37 @@ void r300InitCmdBuf(r300ContextPtr r300)
 	/* Textures */
 	ALLOC_STATE(tex.filter, variable, mtu + 1, 0);
 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER0_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 0);
 
 	ALLOC_STATE(tex.filter_1, variable, mtu + 1, 0);
 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER1_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, 0);
 
 	ALLOC_STATE(tex.size, variable, mtu + 1, 0);
-	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_SIZE_0, 0);
+	r300->hw.tex.size.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, 0);
 
 	ALLOC_STATE(tex.format, variable, mtu + 1, 0);
 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, 0);
 
 	ALLOC_STATE(tex.pitch, variable, mtu + 1, 0);
-	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(R300_TX_FORMAT2_0, 0);
+	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, 0);
 
-	ALLOC_STATE(tex.offset, variable, mtu + 1, 0);
+	ALLOC_STATE(tex.offset, variable, 1, 0);
 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_OFFSET_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, 0);
+	r300->hw.tex.offset.emit = &emit_tex_offsets;
 
 	ALLOC_STATE(tex.chroma_key, variable, mtu + 1, 0);
 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_CHROMA_KEY_0, 0);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, 0);
 
 	ALLOC_STATE(tex.border_color, variable, mtu + 1, 0);
 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_BORDER_COLOR_0, 0);
-
-	r300->hw.is_dirty = GL_TRUE;
-	r300->hw.all_dirty = GL_TRUE;
-
-	/* Initialize command buffer */
-	size =
-	    256 * driQueryOptioni(&r300->radeon.optionCache,
-				  "command_buffer_size");
-	if (size < 2 * r300->hw.max_state_size) {
-		size = 2 * r300->hw.max_state_size + 65535;
-	}
-	if (size > 64 * 256)
-		size = 64 * 256;
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA)) {
-		fprintf(stderr, "sizeof(drm_r300_cmd_header_t)=%zd\n",
-			sizeof(drm_r300_cmd_header_t));
-		fprintf(stderr, "sizeof(drm_radeon_cmd_buffer_t)=%zd\n",
-			sizeof(drm_radeon_cmd_buffer_t));
-		fprintf(stderr,
-			"Allocating %d bytes command buffer (max state is %d bytes)\n",
-			size * 4, r300->hw.max_state_size * 4);
-	}
-
-	r300->cmdbuf.size = size;
-	r300->cmdbuf.cmd_buf = (uint32_t *) CALLOC(size * 4);
-	r300->cmdbuf.count_used = 0;
-	r300->cmdbuf.count_reemit = 0;
-}
-
-/**
- * Destroy the command buffer and state atoms.
- */
-void r300DestroyCmdBuf(r300ContextPtr r300)
-{
-	struct r300_state_atom *atom;
-
-	FREE(r300->cmdbuf.cmd_buf);
-
-	foreach(atom, &r300->hw.atomlist) {
-		FREE(atom->cmd);
-	}
-}
-
-void r300EmitBlit(r300ContextPtr rmesa,
-		  GLuint color_fmt,
-		  GLuint src_pitch,
-		  GLuint src_offset,
-		  GLuint dst_pitch,
-		  GLuint dst_offset,
-		  GLint srcx, GLint srcy,
-		  GLint dstx, GLint dsty, GLuint w, GLuint h)
-{
-	drm_r300_cmd_header_t *cmd;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr,
-			"%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-			__FUNCTION__, src_pitch, src_offset, srcx, srcy,
-			dst_pitch, dst_offset, dstx, dsty, w, h);
-
-	assert((src_pitch & 63) == 0);
-	assert((dst_pitch & 63) == 0);
-	assert((src_offset & 1023) == 0);
-	assert((dst_offset & 1023) == 0);
-	assert(w < (1 << 16));
-	assert(h < (1 << 16));
-
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 8, __FUNCTION__);
-
-	cmd[0].header.cmd_type = R300_CMD_PACKET3;
-	cmd[0].header.pad0 = R300_CMD_PACKET3_RAW;
-	cmd[1].u = R300_CP_CMD_BITBLT_MULTI | (5 << 16);
-	cmd[2].u = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-		    RADEON_GMC_BRUSH_NONE |
-		    (color_fmt << 8) |
-		    RADEON_GMC_SRC_DATATYPE_COLOR |
-		    RADEON_ROP3_S |
-		    RADEON_DP_SRC_SOURCE_MEMORY |
-		    RADEON_GMC_CLR_CMP_CNTL_DIS | RADEON_GMC_WR_MSK_DIS);
-
-	cmd[3].u = ((src_pitch / 64) << 22) | (src_offset >> 10);
-	cmd[4].u = ((dst_pitch / 64) << 22) | (dst_offset >> 10);
-	cmd[5].u = (srcx << 16) | srcy;
-	cmd[6].u = (dstx << 16) | dsty;	/* dst */
-	cmd[7].u = (w << 16) | h;
-}
-
-void r300EmitWait(r300ContextPtr rmesa, GLuint flags)
-{
-	drm_r300_cmd_header_t *cmd;
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, 0);
 
-	assert(!(flags & ~(R300_WAIT_2D | R300_WAIT_3D)));
+	r300->radeon.hw.is_dirty = GL_TRUE;
+	r300->radeon.hw.all_dirty = GL_TRUE;
 
-	cmd = (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].u = 0;
-	cmd[0].wait.cmd_type = R300_CMD_WAIT;
-	cmd[0].wait.flags = flags;
+	rcommonInitCmdBuf(&r300->radeon);
 }
diff --git a/src/mesa/drivers/dri/r300/r300_cmdbuf.h b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
index a8eaa580bd9..53bcc0eeb49 100644
--- a/src/mesa/drivers/dri/r300/r300_cmdbuf.h
+++ b/src/mesa/drivers/dri/r300/r300_cmdbuf.h
@@ -38,79 +38,14 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "r300_context.h"
 
-extern int r300FlushCmdBufLocked(r300ContextPtr r300, const char *caller);
-extern int r300FlushCmdBuf(r300ContextPtr r300, const char *caller);
-
-extern void r300EmitState(r300ContextPtr r300);
-
 extern void r300InitCmdBuf(r300ContextPtr r300);
-extern void r300DestroyCmdBuf(r300ContextPtr r300);
-
-/**
- * Make sure that enough space is available in the command buffer
- * by flushing if necessary.
- *
- * \param dwords The number of dwords we need to be free on the command buffer
- */
-static INLINE void r300EnsureCmdBufSpace(r300ContextPtr r300,
-					     int dwords, const char *caller)
-{
-	assert(dwords < r300->cmdbuf.size);
-
-	if (r300->cmdbuf.count_used + dwords > r300->cmdbuf.size)
-		r300FlushCmdBuf(r300, caller);
-}
-
-/**
- * Allocate the given number of dwords in the command buffer and return
- * a pointer to the allocated area.
- * When necessary, these functions cause a flush. r300AllocCmdBuf() also
- * causes state reemission after a flush. This is necessary to ensure
- * correct hardware state after an unlock.
- */
-static INLINE uint32_t *r300RawAllocCmdBuf(r300ContextPtr r300,
-					       int dwords, const char *caller)
-{
-	uint32_t *ptr;
-
-	r300EnsureCmdBufSpace(r300, dwords, caller);
-
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
-	return ptr;
-}
-
-static INLINE uint32_t *r300AllocCmdBuf(r300ContextPtr r300,
-					    int dwords, const char *caller)
-{
-	uint32_t *ptr;
-
-	r300EnsureCmdBufSpace(r300, dwords, caller);
-
-	if (!r300->cmdbuf.count_used) {
-		if (RADEON_DEBUG & DEBUG_IOCTL)
-			fprintf(stderr,
-				"Reemit state after flush (from %s)\n", caller);
-		r300EmitState(r300);
-	}
-
-	ptr = &r300->cmdbuf.cmd_buf[r300->cmdbuf.count_used];
-	r300->cmdbuf.count_used += dwords;
-	return ptr;
-}
+void r300_emit_scissor(GLcontext *ctx);
 
-extern void r300EmitBlit(r300ContextPtr rmesa,
-			 GLuint color_fmt,
-			 GLuint src_pitch,
-			 GLuint src_offset,
-			 GLuint dst_pitch,
-			 GLuint dst_offset,
-			 GLint srcx, GLint srcy,
-			 GLint dstx, GLint dsty, GLuint w, GLuint h);
+void emit_vpu(GLcontext *ctx, struct radeon_state_atom * atom);
+int check_vpu(GLcontext *ctx, struct radeon_state_atom *atom);
 
-extern void r300EmitWait(r300ContextPtr rmesa, GLuint flags);
-extern void r300EmitLOAD_VBPNTR(r300ContextPtr rmesa, int start);
-extern void r300EmitVertexShader(r300ContextPtr rmesa);
-extern void r300EmitPixelShader(r300ContextPtr rmesa);
+void emit_r500fp(GLcontext *ctx, struct radeon_state_atom * atom);
+int check_r500fp(GLcontext *ctx, struct radeon_state_atom *atom);
+int check_r500fp_const(GLcontext *ctx, struct radeon_state_atom *atom);
 
 #endif				/* __R300_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index 7d6705058fe..6f3aab986d2 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -43,8 +43,8 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/matrix.h"
 #include "main/extensions.h"
 #include "main/state.h"
-#include "main/texobj.h"
 #include "main/bufferobj.h"
+#include "main/texobj.h"
 
 #include "swrast/swrast.h"
 #include "swrast_setup/swrast_setup.h"
@@ -56,42 +56,40 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "drivers/common/driverfuncs.h"
 
-#include "radeon_ioctl.h"
-#include "radeon_span.h"
 #include "r300_context.h"
+#include "radeon_context.h"
+#include "radeon_span.h"
 #include "r300_cmdbuf.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
+#include "r300_render.h"
 #include "r300_swtcl.h"
+#include "radeon_bocs_wrapper.h"
 
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
 
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h"		/* for symbolic values of enum-type options */
 
-/* hw_tcl_on derives from future_hw_tcl_on when its safe to change it. */
-int future_hw_tcl_on = 1;
-int hw_tcl_on = 1;
-
 #define need_GL_VERSION_2_0
 #define need_GL_ARB_point_parameters
 #define need_GL_ARB_vertex_program
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
 #define need_GL_EXT_blend_minmax
+#define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_gpu_program_parameters
 #define need_GL_EXT_secondary_color
 #define need_GL_EXT_stencil_two_side
 #define need_GL_ATI_separate_stencil
 #define need_GL_NV_vertex_program
+
 #include "extension_helper.h"
 
+
 const struct dri_extension card_extensions[] = {
   /* *INDENT-OFF* */
   {"GL_ARB_depth_texture",		NULL},
@@ -112,6 +110,7 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
   {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
   {"GL_EXT_blend_subtract",		NULL},
+  {"GL_EXT_packed_depth_stencil",	NULL},
   {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
   {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
   {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
@@ -125,6 +124,8 @@ const struct dri_extension card_extensions[] = {
   {"GL_EXT_texture_lod_bias",		NULL},
   {"GL_EXT_texture_mirror_clamp",	NULL},
   {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_EXT_texture_sRGB",		NULL},
+  {"GL_EXT_vertex_array_bgra",		NULL},
   {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
   {"GL_ATI_texture_env_combine3",	NULL},
   {"GL_ATI_texture_mirror_once",	NULL},
@@ -139,6 +140,11 @@ const struct dri_extension card_extensions[] = {
 };
 
 
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
 /**
  * The GL 2.0 functions are needed to make display lists work with
  * functions added by GL_ATI_separate_stencil.
@@ -147,16 +153,8 @@ const struct dri_extension gl_20_extension[] = {
   {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
 };
 
-
-extern struct tnl_pipeline_stage _r300_render_stage;
-extern const struct tnl_pipeline_stage _r300_tcl_stage;
-
 static const struct tnl_pipeline_stage *r300_pipeline[] = {
 
-	/* Try and go straight to t&l
-	 */
-	&_r300_tcl_stage,
-
 	/* Catch any t&l fallbacks
 	 */
 	&_tnl_vertex_transform_stage,
@@ -165,6 +163,7 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	&_tnl_fog_coordinate_stage,
 	&_tnl_texgen_stage,
 	&_tnl_texture_transform_stage,
+	&_tnl_point_attenuation_stage,
 	&_tnl_vertex_program_stage,
 
 	/* Try again to go to tcl?
@@ -184,6 +183,192 @@ static const struct tnl_pipeline_stage *r300_pipeline[] = {
 	0,
 };
 
+static void r300_get_lock(radeonContextPtr rmesa)
+{
+	drm_radeon_sarea_t *sarea = rmesa->sarea;
+
+	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+		sarea->ctx_owner = rmesa->dri.hwContext;
+		if (!rmesa->radeonScreen->kernel_mm)
+			radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
+	}
+}
+
+static void r300_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+    /* please flush pipe do all pending work */
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_SCREENDOOR, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_SCREENDOOR, 1));
+    radeon_cs_write_dword(cs, 0x00FFFFFF);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_SC_HYPERZ, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_US_CONFIG, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_ZB_CNTL, 1));
+    radeon_cs_write_dword(cs, 0x0);
+    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen, R300_WAIT_3D));
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_RB3D_DSTCACHE_CTLSTAT, 1));
+    radeon_cs_write_dword(cs, R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+    radeon_cs_write_dword(cs, cmdpacket0(rmesa->radeonScreen,
+                                  R300_ZB_ZCACHE_CTLSTAT, 1));
+    radeon_cs_write_dword(cs, R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE);
+    radeon_cs_write_dword(cs, cmdwait(rmesa->radeonScreen,
+                               R300_WAIT_3D | R300_WAIT_3D_CLEAN));
+}
+
+static void r300_vtbl_pre_emit_atoms(radeonContextPtr radeon)
+{
+	r300ContextPtr r300 = (r300ContextPtr)radeon;
+	BATCH_LOCALS(radeon);
+
+	r300->vap_flush_needed = GL_TRUE;
+
+	cp_wait(radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH_REGVAL(R300_TX_INVALTAGS, R300_TX_FLUSH);
+	END_BATCH();
+	end_3d(radeon);
+}
+
+static void r300_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	if (mode)
+		r300->radeon.Fallback |= bit;
+	else
+		r300->radeon.Fallback &= ~bit;
+}
+
+static void r300_init_vtbl(radeonContextPtr radeon)
+{
+	radeon->vtbl.get_lock = r300_get_lock;
+	radeon->vtbl.update_viewport_offset = r300UpdateViewportOffset;
+	radeon->vtbl.emit_cs_header = r300_vtbl_emit_cs_header;
+	radeon->vtbl.swtcl_flush = r300_swtcl_flush;
+	radeon->vtbl.pre_emit_atoms = r300_vtbl_pre_emit_atoms;
+	radeon->vtbl.fallback = r300_fallback;
+}
+
+static void r300InitConstValues(GLcontext *ctx, radeonScreenPtr screen)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits = MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+	ctx->Const.MaxTextureLodBias = 16.0;
+
+	if (screen->chip_family >= CHIP_FAMILY_RV515) {
+		ctx->Const.MaxTextureLevels = 13;
+		ctx->Const.MaxCubeTextureLevels = 13;
+		ctx->Const.MaxTextureRectSize = 4096;
+	}
+	else {
+		ctx->Const.MaxTextureLevels = 12;
+		ctx->Const.MaxCubeTextureLevels = 12;
+		ctx->Const.MaxTextureRectSize = 2048;
+	}
+
+	ctx->Const.MinPointSize = 1.0;
+	ctx->Const.MinPointSizeAA = 1.0;
+	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
+	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
+
+	ctx->Const.MinLineWidth = 1.0;
+	ctx->Const.MinLineWidthAA = 1.0;
+	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
+	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
+
+	ctx->Const.MaxDrawBuffers = 1;
+
+	/* currently bogus data */
+	if (r300->options.hw_tcl_enabled) {
+		ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+		ctx->Const.VertexProgram.MaxNativeInstructions =
+		  VSF_MAX_FRAGMENT_LENGTH / 4;
+		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+		ctx->Const.VertexProgram.MaxTemps = 32;
+		ctx->Const.VertexProgram.MaxNativeTemps =
+		  /*VSF_MAX_FRAGMENT_TEMPS */ 32;
+		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+	}
+
+	if (screen->chip_family >= CHIP_FAMILY_RV515) {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R500_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+		ctx->Const.FragmentProgram.MaxNativeParameters = R500_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R500_PFS_MAX_INST;
+		ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+	} else {
+		ctx->Const.FragmentProgram.MaxNativeTemps = R300_PFS_NUM_TEMP_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+		ctx->Const.FragmentProgram.MaxNativeParameters = R300_PFS_NUM_CONST_REGS;
+		ctx->Const.FragmentProgram.MaxNativeAluInstructions = R300_PFS_MAX_ALU_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexInstructions = R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeInstructions = R300_PFS_MAX_ALU_INST + R300_PFS_MAX_TEX_INST;
+		ctx->Const.FragmentProgram.MaxNativeTexIndirections = R300_PFS_MAX_TEX_INDIRECT;
+		ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;
+	}
+}
+
+static void r300ParseOptions(r300ContextPtr r300, radeonScreenPtr screen)
+{
+	struct r300_options options = { 0 };
+
+	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r300");
+
+	r300->radeon.initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache, "def_max_anisotropy");
+
+	options.stencil_two_side_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_stencil_two_side");
+	options.s3tc_force_enabled = driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable");
+	options.s3tc_force_disabled = driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc");
+
+	if (!(screen->chip_flags & RADEON_CHIPSET_TCL) || driQueryOptioni(&r300->radeon.optionCache, "tcl_mode") == DRI_CONF_TCL_SW)
+		options.hw_tcl_enabled = 0;
+	else
+		options.hw_tcl_enabled = 1;
+
+	options.conformance_mode = !driQueryOptionb(&r300->radeon.optionCache, "disable_lowimpact_fallback");
+
+	r300->options = options;
+}
+
+static void r300InitGLExtensions(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+	if (r300->radeon.radeonScreen->kernel_mm)
+		driInitExtensions(ctx, mm_extensions, GL_FALSE);
+
+	if (r300->options.stencil_two_side_disabled)
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+	if (ctx->Mesa_DXTn && !r300->options.s3tc_force_enabled) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else if (r300->options.s3tc_force_disabled) {
+		_mesa_disable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+}
+
 /* Create the device specific rendering context.
  */
 GLboolean r300CreateContext(const __GLcontextModes * glVisual,
@@ -195,42 +380,25 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	struct dd_function_table functions;
 	r300ContextPtr r300;
 	GLcontext *ctx;
-	int tcl_mode, i;
 
 	assert(glVisual);
 	assert(driContextPriv);
 	assert(screen);
 
-	/* Allocate the R300 context */
 	r300 = (r300ContextPtr) CALLOC(sizeof(*r300));
 	if (!r300)
 		return GL_FALSE;
 
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-		hw_tcl_on = future_hw_tcl_on = 0;
+	r300ParseOptions(r300, screen);
 
-	/* Parse configuration files.
-	 * Do this here so that initialMaxAnisotropy is set before we create
-	 * the default textures.
-	 */
-	driParseConfigFiles(&r300->radeon.optionCache, &screen->optionCache,
-			    screen->driScreen->myNum, "r300");
-	r300->initialMaxAnisotropy = driQueryOptionf(&r300->radeon.optionCache,
-						     "def_max_anisotropy");
+	r300_init_vtbl(&r300->radeon);
 
-	/* Init default driver functions then plug in our R300-specific functions
-	 * (the texture functions are especially important)
-	 */
 	_mesa_init_driver_functions(&functions);
 	r300InitIoctlFuncs(&functions);
 	r300InitStateFuncs(&functions);
 	r300InitTextureFuncs(&functions);
 	r300InitShaderFuncs(&functions);
 
-#ifdef USER_BUFFERS
-	r300_mem_init(r300);
-#endif
-
 	if (!radeonInitContext(&r300->radeon, &functions,
 			       glVisual, driContextPriv,
 			       sharedContextPrivate)) {
@@ -238,94 +406,15 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 		return GL_FALSE;
 	}
 
-	/* Init r300 context data */
-	r300->dma.buf0_address =
-	    r300->radeon.radeonScreen->buffers->list[0].address;
-
-	(void)memset(r300->texture_heaps, 0, sizeof(r300->texture_heaps));
-	make_empty_list(&r300->swapped);
-
-	r300->nr_heaps = 1 /* screen->numTexHeaps */ ;
-	assert(r300->nr_heaps < RADEON_NR_TEX_HEAPS);
-	for (i = 0; i < r300->nr_heaps; i++) {
-		/* *INDENT-OFF* */
-		r300->texture_heaps[i] = driCreateTextureHeap(i, r300,
-							       screen->
-							       texSize[i], 12,
-							       RADEON_NR_TEX_REGIONS,
-							       (drmTextureRegionPtr)
-							       r300->radeon.sarea->
-							       tex_list[i],
-							       &r300->radeon.sarea->
-							       tex_age[i],
-							       &r300->swapped,
-							       sizeof
-							       (r300TexObj),
-							       (destroy_texture_object_t
-								*)
-							       r300DestroyTexObj);
-		/* *INDENT-ON* */
-	}
-	r300->texture_depth = driQueryOptioni(&r300->radeon.optionCache,
-					      "texture_depth");
-	if (r300->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
-		r300->texture_depth = (screen->cpp == 4) ?
-		    DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
-
-	/* Set the maximum texture size small enough that we can guarentee that
-	 * all texture units can bind a maximal texture and have them both in
-	 * texturable memory at once.
-	 */
-
 	ctx = r300->radeon.glCtx;
 
-	ctx->Const.MaxTextureImageUnits =
-	    driQueryOptioni(&r300->radeon.optionCache, "texture_image_units");
-	ctx->Const.MaxTextureCoordUnits =
-	    driQueryOptioni(&r300->radeon.optionCache, "texture_coord_units");
-	ctx->Const.MaxTextureUnits =
-	    MIN2(ctx->Const.MaxTextureImageUnits,
-		 ctx->Const.MaxTextureCoordUnits);
-	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
-	ctx->Const.MaxTextureLodBias = 16.0;
+	r300->fallback = 0;
+	if (r300->options.hw_tcl_enabled)
+		ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
 
-	if (screen->chip_family >= CHIP_FAMILY_RV515)
-	    ctx->Const.MaxTextureLevels = 13;
-	else
-	    ctx->Const.MaxTextureLevels = 12;
-
-        driCalculateMaxTextureLevels( r300->texture_heaps,
-                                      r300->nr_heaps,
-                                      & ctx->Const,
-                                      4,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      MIN2(ctx->Const.MaxTextureLevels,
-                                           MAX_3D_TEXTURE_LEVELS) - 1,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      ctx->Const.MaxTextureLevels - 1,
-                                      GL_FALSE,
-                                      2 );
-
-	ctx->Const.MinPointSize = 1.0;
-	ctx->Const.MinPointSizeAA = 1.0;
-	ctx->Const.MaxPointSize = R300_POINTSIZE_MAX;
-	ctx->Const.MaxPointSizeAA = R300_POINTSIZE_MAX;
-
-	ctx->Const.MinLineWidth = 1.0;
-	ctx->Const.MinLineWidthAA = 1.0;
-	ctx->Const.MaxLineWidth = R300_LINESIZE_MAX;
-	ctx->Const.MaxLineWidthAA = R300_LINESIZE_MAX;
-
-#ifdef USER_BUFFERS
-	/* Needs further modifications */
-#if 0
-	ctx->Const.MaxArrayLockSize =
-	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
-#endif
-#endif
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-	ctx->Const.MaxDrawBuffers = 1;
+	r300InitConstValues(ctx, screen);
 
 	_mesa_set_mvp_with_dp4( ctx, GL_TRUE );
 
@@ -336,16 +425,12 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	_tnl_CreateContext(ctx);
 	_swsetup_CreateContext(ctx);
 	_swsetup_Wakeup(ctx);
-	_ae_create_context(ctx);
 
 	/* Install the customized pipeline:
 	 */
 	_tnl_destroy_pipeline(ctx);
 	_tnl_install_pipeline(ctx, r300_pipeline);
-
-	/* Try and keep materials and vertices separate:
-	 */
-/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
 
 	/* Configure swrast and TNL to match hardware characteristics:
 	 */
@@ -354,226 +439,25 @@ GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 	_tnl_allow_pixel_fog(ctx, GL_FALSE);
 	_tnl_allow_vertex_fog(ctx, GL_TRUE);
 
-	/* currently bogus data */
-	if (screen->chip_flags & RADEON_CHIPSET_TCL) {
-	        ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeInstructions =
-		  VSF_MAX_FRAGMENT_LENGTH / 4;
-		ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
-		ctx->Const.VertexProgram.MaxTemps = 32;
-		ctx->Const.VertexProgram.MaxNativeTemps =
-		  /*VSF_MAX_FRAGMENT_TEMPS */ 32;
-		ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
-		ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+	if (r300->options.hw_tcl_enabled) {
+		r300InitDraw(ctx);
+	} else {
+		r300InitSwtcl(ctx);
 	}
 
-	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
-	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
-	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeInstructions =
-	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
-	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
-	    PFS_MAX_TEX_INDIRECT;
-	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
-	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
-	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
-
-	driInitExtensions(ctx, card_extensions, GL_TRUE);
-
-	if (driQueryOptionb
-	    (&r300->radeon.optionCache, "disable_stencil_two_side"))
-		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
-
-	if (r300->radeon.glCtx->Mesa_DXTn
-	    && !driQueryOptionb(&r300->radeon.optionCache, "disable_s3tc")) {
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-		_mesa_enable_extension(ctx, "GL_S3_s3tc");
-	} else
-	    if (driQueryOptionb(&r300->radeon.optionCache, "force_s3tc_enable"))
-	{
-		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
-	}
-
-	r300->disable_lowimpact_fallback =
-	    driQueryOptionb(&r300->radeon.optionCache,
-			    "disable_lowimpact_fallback");
-
-	radeonInitSpanFuncs(ctx);
+	radeon_fbo_init(&r300->radeon);
+	radeonInitSpanFuncs( ctx );
 	r300InitCmdBuf(r300);
 	r300InitState(r300);
-	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
-	        r300InitSwtcl(ctx);
-
-	TNL_CONTEXT(ctx)->Driver.RunPipeline = _tnl_run_pipeline;
+	r300InitShaderFunctions(r300);
 
-	tcl_mode = driQueryOptioni(&r300->radeon.optionCache, "tcl_mode");
-	if (driQueryOptionb(&r300->radeon.optionCache, "no_rast")) {
-		fprintf(stderr, "disabling 3D acceleration\n");
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_DISABLE, 1);
-#endif
-	}
-	if (tcl_mode == DRI_CONF_TCL_SW ||
-	    !(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
-		if (r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
-			r300->radeon.radeonScreen->chip_flags &=
-			    ~RADEON_CHIPSET_TCL;
-			fprintf(stderr, "Disabling HW TCL support\n");
-		}
-		TCL_FALLBACK(r300->radeon.glCtx,
-			     RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+	if (screen->chip_family == CHIP_FAMILY_RS600 ||	screen->chip_family == CHIP_FAMILY_RS690 ||
+		screen->chip_family == CHIP_FAMILY_RS740) {
+		r300->radeon.texture_row_align = 64;
 	}
 
-	return GL_TRUE;
-}
-
-static void r300FreeGartAllocations(r300ContextPtr r300)
-{
-	int i, ret, tries = 0, done_age, in_use = 0;
-	drm_radeon_mem_free_t memfree;
+	r300InitGLExtensions(ctx);
 
-	memfree.region = RADEON_MEM_REGION_GART;
-
-#ifdef USER_BUFFERS
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
-
-		/* check whether this buffer is still in use */
-		if (r300->rmm->u_list[i].pending) {
-			in_use++;
-		}
-	}
-	/* Cannot flush/lock if no context exists. */
-	if (in_use)
-		r300FlushCmdBuf(r300, __FUNCTION__);
-
-	done_age = radeonGetAge((radeonContextPtr) r300);
-
-	for (i = r300->rmm->u_last; i > 0; i--) {
-		if (r300->rmm->u_list[i].ptr == NULL) {
-			continue;
-		}
-
-		/* check whether this buffer is still in use */
-		if (!r300->rmm->u_list[i].pending) {
-			continue;
-		}
-
-		assert(r300->rmm->u_list[i].h_pending == 0);
-
-		tries = 0;
-		while (r300->rmm->u_list[i].age > done_age && tries++ < 1000) {
-			usleep(10);
-			done_age = radeonGetAge((radeonContextPtr) r300);
-		}
-		if (tries >= 1000) {
-			WARN_ONCE("Failed to idle region!");
-		}
-
-		memfree.region_offset = (char *)r300->rmm->u_list[i].ptr -
-		    (char *)r300->radeon.radeonScreen->gartTextures.map;
-
-		ret = drmCommandWrite(r300->radeon.radeonScreen->driScreen->fd,
-				      DRM_RADEON_FREE, &memfree,
-				      sizeof(memfree));
-		if (ret) {
-			fprintf(stderr, "Failed to free at %p\nret = %s\n",
-				r300->rmm->u_list[i].ptr, strerror(-ret));
-		} else {
-			if (i == r300->rmm->u_last)
-				r300->rmm->u_last--;
-
-			r300->rmm->u_list[i].pending = 0;
-			r300->rmm->u_list[i].ptr = NULL;
-		}
-	}
-	r300->rmm->u_head = i;
-#endif				/* USER_BUFFERS */
+	return GL_TRUE;
 }
 
-/* Destroy the device specific context.
- */
-void r300DestroyContext(__DRIcontextPrivate * driContextPriv)
-{
-	GET_CURRENT_CONTEXT(ctx);
-	r300ContextPtr r300 = (r300ContextPtr) driContextPriv->driverPrivate;
-	radeonContextPtr radeon = (radeonContextPtr) r300;
-	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
-	int i;
-
-	if (RADEON_DEBUG & DEBUG_DRI) {
-		fprintf(stderr, "Destroying context !\n");
-	}
-
-	/* check if we're deleting the currently bound context */
-	if (&r300->radeon == current) {
-		radeonFlush(r300->radeon.glCtx);
-		_mesa_make_current(NULL, NULL, NULL);
-	}
-
-	/* Free r300 context resources */
-	assert(r300);		/* should never be null */
-
-	if (r300) {
-		GLboolean release_texture_heaps;
-
-		release_texture_heaps =
-		    (r300->radeon.glCtx->Shared->RefCount == 1);
-		_swsetup_DestroyContext(r300->radeon.glCtx);
-		_tnl_DestroyContext(r300->radeon.glCtx);
-		_vbo_DestroyContext(r300->radeon.glCtx);
-		_swrast_DestroyContext(r300->radeon.glCtx);
-
-		if (r300->dma.current.buf) {
-			r300ReleaseDmaRegion(r300, &r300->dma.current,
-					     __FUNCTION__);
-#ifndef USER_BUFFERS
-			r300FlushCmdBuf(r300, __FUNCTION__);
-#endif
-		}
-		r300FreeGartAllocations(r300);
-		r300DestroyCmdBuf(r300);
-
-		if (radeon->state.scissor.pClipRects) {
-			FREE(radeon->state.scissor.pClipRects);
-			radeon->state.scissor.pClipRects = NULL;
-		}
-
-		if (release_texture_heaps) {
-			/* This share group is about to go away, free our private
-			 * texture object data.
-			 */
-			int i;
-
-			for (i = 0; i < r300->nr_heaps; i++) {
-				driDestroyTextureHeap(r300->texture_heaps[i]);
-				r300->texture_heaps[i] = NULL;
-			}
-
-			assert(is_empty_list(&r300->swapped));
-		}
-
-                /* Drop texture object references from current hardware state */
-		for (i = 0; i < 8; i++) {
-			_mesa_reference_texobj(&r300->state.texture.unit[i].texobj, NULL);
-		}
-
-		radeonCleanupContext(&r300->radeon);
-
-#ifdef USER_BUFFERS
-		/* the memory manager might be accessed when Mesa frees the shared
-		 * state, so don't destroy it earlier
-		 */
-		r300_mem_destroy(r300);
-#endif
-
-		/* free the option cache */
-		driDestroyOptionCache(&r300->radeon.optionCache);
-
-		FREE(r300);
-	}
-}
diff --git a/src/mesa/drivers/dri/r300/r300_context.h b/src/mesa/drivers/dri/r300/r300_context.h
index 96a3205f1a3..f7af7d4e574 100644
--- a/src/mesa/drivers/dri/r300/r300_context.h
+++ b/src/mesa/drivers/dri/r300/r300_context.h
@@ -37,26 +37,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __R300_CONTEXT_H__
 #define __R300_CONTEXT_H__
 
-#include "tnl/t_vertex.h"
 #include "drm.h"
 #include "radeon_drm.h"
 #include "dri_util.h"
-#include "texmem.h"
+#include "radeon_common.h"
 
-#include "main/macros.h"
 #include "main/mtypes.h"
-#include "main/colormac.h"
-
-#define USER_BUFFERS
+#include "shader/prog_instruction.h"
 
 struct r300_context;
 typedef struct r300_context r300ContextRec;
 typedef struct r300_context *r300ContextPtr;
 
-#include "radeon_lock.h"
-#include "main/mm.h"
 
-/* From http://gcc.gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+/* From http://gcc. gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
    I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
    with other compilers ... GLUE!
 */
@@ -73,180 +67,14 @@ typedef struct r300_context *r300ContextPtr;
 	}
 
 #include "r300_vertprog.h"
-#include "r500_fragprog.h"
-
-/**
- * This function takes a float and packs it into a uint32_t
- */
-static INLINE uint32_t r300PackFloat32(float fl)
-{
-	union {
-		float fl;
-		uint32_t u;
-	} u;
-
-	u.fl = fl;
-	return u.u;
-}
-
-/* This is probably wrong for some values, I need to test this
- * some more.  Range checking would be a good idea also..
- *
- * But it works for most things.  I'll fix it later if someone
- * else with a better clue doesn't
- */
-static INLINE uint32_t r300PackFloat24(float f)
-{
-	float mantissa;
-	int exponent;
-	uint32_t float24 = 0;
-
-	if (f == 0.0)
-		return 0;
-
-	mantissa = frexpf(f, &exponent);
-
-	/* Handle -ve */
-	if (mantissa < 0) {
-		float24 |= (1 << 23);
-		mantissa = mantissa * -1.0;
-	}
-	/* Handle exponent, bias of 63 */
-	exponent += 62;
-	float24 |= (exponent << 16);
-	/* Kill 7 LSB of mantissa */
-	float24 |= (r300PackFloat32(mantissa) & 0x7FFFFF) >> 7;
-
-	return float24;
-}
-
-/************ DMA BUFFERS **************/
-
-/* Need refcounting on dma buffers:
- */
-struct r300_dma_buffer {
-	int refcount;		/**< the number of retained regions in buf */
-	drmBufPtr buf;
-	int id;
-};
-#undef GET_START
-#ifdef USER_BUFFERS
-#define GET_START(rvb) (r300GartOffsetFromVirtual(rmesa, (rvb)->address+(rvb)->start))
-#else
-#define GET_START(rvb) (rmesa->radeon.radeonScreen->gart_buffer_offset +		\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-#endif
-/* A retained region, eg vertices for indexed vertices.
- */
-struct r300_dma_region {
-	struct r300_dma_buffer *buf;
-	char *address;		/* == buf->address */
-	int start, end, ptr;	/* offsets from start of buf */
-
-	int aos_offset;		/* address in GART memory */
-	int aos_stride;		/* distance between elements, in dwords */
-	int aos_size;		/* number of components (1-4) */
-};
-
-struct r300_dma {
-	/* Active dma region.  Allocations for vertices and retained
-	 * regions come from here.  Also used for emitting random vertices,
-	 * these may be flushed by calling flush_current();
-	 */
-	struct r300_dma_region current;
 
-	void (*flush) (r300ContextPtr);
-
-	char *buf0_address;	/* start of buf[0], for index calcs */
-
-	/* Number of "in-flight" DMA buffers, i.e. the number of buffers
-	 * for which a DISCARD command is currently queued in the command buffer.
-	 */
-	GLuint nr_released_bufs;
-};
-
-       /* Texture related */
-
-typedef struct r300_tex_obj r300TexObj, *r300TexObjPtr;
-
-/* Maximum number of mipmap levels supported by any supported GPU
- */
-#define R300_MAX_TEXTURE_LEVELS 13
-
-/* Texture object in locally shared texture space.
- */
-struct r300_tex_obj {
-	driTextureObject base;
-
-	GLuint bufAddr;		/* Offset to start of locally
-				   shared texture block */
-
-	drm_radeon_tex_image_t image[6][R300_MAX_TEXTURE_LEVELS];
-	/* Six, for the cube faces */
-
-	GLboolean image_override;	/* Image overridden by GLX_EXT_tfp */
-
-	GLuint pitch;		/* this isn't sent to hardware just used in calculations */
-	/* hardware register values */
-	/* Note that R200 has 8 registers per texture and R300 only 7 */
-	GLuint filter;
-	GLuint filter_1;
-	GLuint pitch_reg;
-	GLuint size;		/* npot only */
-	GLuint format;
-	GLuint offset;		/* Image location in the card's address space.
-				   All cube faces follow. */
-	GLuint unknown4;
-	GLuint unknown5;
-	/* end hardware registers */
-
-	/* registers computed by r200 code - keep them here to
-	   compare against what is actually written.
-
-	   to be removed later.. */
-	GLuint pp_border_color;
-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
-	GLuint format_x;
-
-	GLboolean border_fallback;
-
-	GLuint tile_bits;	/* hw texture tile bits used on this texture */
-};
-
-struct r300_texture_env_state {
-	struct gl_texture_object *texobj;
-	GLenum format;
-	GLenum envMode;
-};
 
 /* The blit width for texture uploads
  */
 #define R300_BLIT_WIDTH_BYTES 1024
 #define R300_MAX_TEXTURE_UNITS 8
 
-struct r300_texture_state {
-	struct r300_texture_env_state unit[R300_MAX_TEXTURE_UNITS];
-	int tc_count;		/* number of incoming texture coordinates from VAP */
-};
 
-/**
- * A block of hardware state.
- *
- * When check returns non-zero, the returned number of dwords must be
- * copied verbatim into the command buffer in order to update a state atom
- * when it is dirty.
- */
-struct r300_state_atom {
-	struct r300_state_atom *next, *prev;
-	const char *name;	/* for debug */
-	int cmd_size;		/* maximum size in dwords */
-	GLuint idx;		/* index in an array (e.g. textures) */
-	uint32_t *cmd;
-	GLboolean dirty;
-
-	int (*check) (r300ContextPtr, struct r300_state_atom * atom);
-};
 
 #define R300_VPT_CMD_0		0
 #define R300_VPT_XSCALE		1
@@ -288,9 +116,11 @@ struct r300_state_atom {
 #define R300_GB_MISC_MSPOS_0		1
 #define R300_GB_MISC_MSPOS_1		2
 #define R300_GB_MISC_TILE_CONFIG	3
-#define R300_GB_MISC_SELECT		4
-#define R300_GB_MISC_AA_CONFIG		5
-#define R300_GB_MISC_CMDSIZE		6
+#define R300_GB_MISC_CMDSIZE		4
+#define R300_GB_MISC2_CMD_0		    0
+#define R300_GB_MISC2_SELECT		1
+#define R300_GB_MISC2_AA_CONFIG		2
+#define R300_GB_MISC2_CMDSIZE		3
 
 #define R300_TXE_CMD_0		0
 #define R300_TXE_ENABLE		1
@@ -463,124 +293,100 @@ struct r300_state_atom {
  * Cache for hardware register state.
  */
 struct r300_hw_state {
-	struct r300_state_atom atomlist;
-
-	GLboolean is_dirty;
-	GLboolean all_dirty;
-	int max_state_size;	/* in dwords */
-
-	struct r300_state_atom vpt;	/* viewport (1D98) */
-	struct r300_state_atom vap_cntl;
-        struct r300_state_atom vap_index_offset; /* 0x208c r5xx only */
-	struct r300_state_atom vof;	/* VAP output format register 0x2090 */
-	struct r300_state_atom vte;	/* (20B0) */
-	struct r300_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
-	struct r300_state_atom vap_cntl_status;
-	struct r300_state_atom vir[2];	/* vap input route (2150/21E0) */
-	struct r300_state_atom vic;	/* vap input control (2180) */
-	struct r300_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
-	struct r300_state_atom vap_clip_cntl;
-	struct r300_state_atom vap_clip;
-	struct r300_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
-	struct r300_state_atom pvs;	/* pvs_cntl (22D0) */
-	struct r300_state_atom gb_enable;	/* (4008) */
-	struct r300_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
-	struct r300_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
-	struct r300_state_atom ga_triangle_stipple;	/* (4214) */
-	struct r300_state_atom ps;	/* pointsize (421C) */
-	struct r300_state_atom ga_point_minmax;	/* (4230) */
-	struct r300_state_atom lcntl;	/* line control */
-	struct r300_state_atom ga_line_stipple;	/* (4260) */
-	struct r300_state_atom shade;
-	struct r300_state_atom polygon_mode;
-	struct r300_state_atom fogp;	/* fog parameters (4294) */
-	struct r300_state_atom ga_soft_reset;	/* (429C) */
-	struct r300_state_atom zbias_cntl;
-	struct r300_state_atom zbs;	/* zbias (42A4) */
-	struct r300_state_atom occlusion_cntl;
-	struct r300_state_atom cul;	/* cull cntl (42B8) */
-	struct r300_state_atom su_depth_scale;	/* (42C0) */
-	struct r300_state_atom rc;	/* rs control (4300) */
-	struct r300_state_atom ri;	/* rs interpolators (4310) */
-	struct r300_state_atom rr;	/* rs route (4330) */
-	struct r300_state_atom sc_hyperz;	/* (43A4) */
-	struct r300_state_atom sc_screendoor;	/* (43E8) */
-	struct r300_state_atom fp;	/* fragment program cntl + nodes (4600) */
-	struct r300_state_atom fpt;	/* texi - (4620) */
-	struct r300_state_atom us_out_fmt;	/* (46A4) */
-	struct r300_state_atom r500fp;	/* r500 fp instructions */
-	struct r300_state_atom r500fp_const;	/* r500 fp constants */
-	struct r300_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
-	struct r300_state_atom fogs;	/* fog state (4BC0) */
-	struct r300_state_atom fogc;	/* fog color (4BC8) */
-	struct r300_state_atom at;	/* alpha test (4BD4) */
-	struct r300_state_atom fg_depth_src;	/* (4BD8) */
-	struct r300_state_atom fpp;	/* 0x4C00 and following */
-	struct r300_state_atom rb3d_cctl;	/* (4E00) */
-	struct r300_state_atom bld;	/* blending (4E04) */
-	struct r300_state_atom cmk;	/* colormask (4E0C) */
-	struct r300_state_atom blend_color;	/* constant blend color */
-	struct r300_state_atom rop;	/* ropcntl */
-	struct r300_state_atom cb;	/* colorbuffer (4E28) */
-	struct r300_state_atom rb3d_dither_ctl;	/* (4E50) */
-	struct r300_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
-	struct r300_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
-	struct r300_state_atom zs;	/* zstencil control (4F00) */
-	struct r300_state_atom zstencil_format;
-	struct r300_state_atom zb;	/* z buffer (4F20) */
-	struct r300_state_atom zb_depthclearvalue;	/* (4F28) */
-	struct r300_state_atom unk4F30;	/* (4F30) */
-	struct r300_state_atom zb_hiz_offset;	/* (4F44) */
-	struct r300_state_atom zb_hiz_pitch;	/* (4F54) */
-
-	struct r300_state_atom vpi;	/* vp instructions */
-	struct r300_state_atom vpp;	/* vp parameters */
-	struct r300_state_atom vps;	/* vertex point size (?) */
-	struct r300_state_atom vpucp[6];	/* vp user clip plane - 6 */
+	struct radeon_state_atom vpt;	/* viewport (1D98) */
+	struct radeon_state_atom vap_cntl;
+	struct radeon_state_atom vap_index_offset; /* 0x208c r5xx only */
+	struct radeon_state_atom vof;	/* VAP output format register 0x2090 */
+	struct radeon_state_atom vte;	/* (20B0) */
+	struct radeon_state_atom vap_vf_max_vtx_indx;	/* Maximum Vertex Indx Clamp (2134) */
+	struct radeon_state_atom vap_cntl_status;
+	struct radeon_state_atom vir[2];	/* vap input route (2150/21E0) */
+	struct radeon_state_atom vic;	/* vap input control (2180) */
+	struct radeon_state_atom vap_psc_sgn_norm_cntl; /* Programmable Stream Control Signed Normalize Control (21DC) */
+	struct radeon_state_atom vap_clip_cntl;
+	struct radeon_state_atom vap_clip;
+	struct radeon_state_atom vap_pvs_vtx_timeout_reg;	/* Vertex timeout register (2288) */
+	struct radeon_state_atom pvs;	/* pvs_cntl (22D0) */
+	struct radeon_state_atom gb_enable;	/* (4008) */
+	struct radeon_state_atom gb_misc;	/* Multisampling position shifts ? (4010) */
+	struct radeon_state_atom gb_misc2;	/* Multisampling position shifts ? (4010) */
+	struct radeon_state_atom ga_point_s0;	/* S Texture Coordinate of Vertex 0 for Point texture stuffing (LLC) (4200) */
+	struct radeon_state_atom ga_triangle_stipple;	/* (4214) */
+	struct radeon_state_atom ps;	/* pointsize (421C) */
+	struct radeon_state_atom ga_point_minmax;	/* (4230) */
+	struct radeon_state_atom lcntl;	/* line control */
+	struct radeon_state_atom ga_line_stipple;	/* (4260) */
+	struct radeon_state_atom shade;
+	struct radeon_state_atom shade2;
+	struct radeon_state_atom polygon_mode;
+	struct radeon_state_atom fogp;	/* fog parameters (4294) */
+	struct radeon_state_atom ga_soft_reset;	/* (429C) */
+	struct radeon_state_atom zbias_cntl;
+	struct radeon_state_atom zbs;	/* zbias (42A4) */
+	struct radeon_state_atom occlusion_cntl;
+	struct radeon_state_atom cul;	/* cull cntl (42B8) */
+	struct radeon_state_atom su_depth_scale;	/* (42C0) */
+	struct radeon_state_atom rc;	/* rs control (4300) */
+	struct radeon_state_atom ri;	/* rs interpolators (4310) */
+	struct radeon_state_atom rr;	/* rs route (4330) */
+	struct radeon_state_atom sc_hyperz;	/* (43A4) */
+	struct radeon_state_atom sc_screendoor;	/* (43E8) */
+	struct radeon_state_atom fp;	/* fragment program cntl + nodes (4600) */
+	struct radeon_state_atom fpt;	/* texi - (4620) */
+	struct radeon_state_atom us_out_fmt;	/* (46A4) */
+	struct radeon_state_atom r500fp;	/* r500 fp instructions */
+	struct radeon_state_atom r500fp_const;	/* r500 fp constants */
+	struct radeon_state_atom fpi[4];	/* fp instructions (46C0/47C0/48C0/49C0) */
+	struct radeon_state_atom fogs;	/* fog state (4BC0) */
+	struct radeon_state_atom fogc;	/* fog color (4BC8) */
+	struct radeon_state_atom at;	/* alpha test (4BD4) */
+	struct radeon_state_atom fg_depth_src;	/* (4BD8) */
+	struct radeon_state_atom fpp;	/* 0x4C00 and following */
+	struct radeon_state_atom rb3d_cctl;	/* (4E00) */
+	struct radeon_state_atom bld;	/* blending (4E04) */
+	struct radeon_state_atom cmk;	/* colormask (4E0C) */
+	struct radeon_state_atom blend_color;	/* constant blend color */
+	struct radeon_state_atom rop;	/* ropcntl */
+	struct radeon_state_atom cb;	/* colorbuffer (4E28) */
+	struct radeon_state_atom rb3d_dither_ctl;	/* (4E50) */
+	struct radeon_state_atom rb3d_aaresolve_ctl;	/* (4E88) */
+	struct radeon_state_atom rb3d_discard_src_pixel_lte_threshold;	/* (4E88) I saw it only written on RV350 hardware..  */
+	struct radeon_state_atom zs;	/* zstencil control (4F00) */
+	struct radeon_state_atom zstencil_format;
+	struct radeon_state_atom zb;	/* z buffer (4F20) */
+	struct radeon_state_atom zb_depthclearvalue;	/* (4F28) */
+	struct radeon_state_atom zb_zmask;	/* (4F30) */
+	struct radeon_state_atom zb_hiz_offset;	/* (4F44) */
+	struct radeon_state_atom zb_hiz_pitch;	/* (4F54) */
+
+	struct radeon_state_atom vpi;	/* vp instructions */
+	struct radeon_state_atom vpp;	/* vp parameters */
+	struct radeon_state_atom vps;	/* vertex point size (?) */
+	struct radeon_state_atom vpucp[6];	/* vp user clip plane - 6 */
 	/* 8 texture units */
 	/* the state is grouped by function and not by
 	   texture unit. This makes single unit updates
 	   really awkward - we are much better off
 	   updating the whole thing at once */
 	struct {
-		struct r300_state_atom filter;
-		struct r300_state_atom filter_1;
-		struct r300_state_atom size;
-		struct r300_state_atom format;
-		struct r300_state_atom pitch;
-		struct r300_state_atom offset;
-		struct r300_state_atom chroma_key;
-		struct r300_state_atom border_color;
+		struct radeon_state_atom filter;
+		struct radeon_state_atom filter_1;
+		struct radeon_state_atom size;
+		struct radeon_state_atom format;
+		struct radeon_state_atom pitch;
+		struct radeon_state_atom offset;
+		struct radeon_state_atom chroma_key;
+		struct radeon_state_atom border_color;
 	} tex;
-	struct r300_state_atom txe;	/* tex enable (4104) */
-};
+	struct radeon_state_atom txe;	/* tex enable (4104) */
 
-/**
- * This structure holds the command buffer while it is being constructed.
- *
- * The first batch of commands in the buffer is always the state that needs
- * to be re-emitted when the context is lost. This batch can be skipped
- * otherwise.
- */
-struct r300_cmdbuf {
-	int size;		/* DWORDs allocated for buffer */
-	uint32_t *cmd_buf;
-	int count_used;		/* DWORDs filled so far */
-	int count_reemit;	/* size of re-emission batch */
+	radeonTexObj *textures[R300_MAX_TEXTURE_UNITS];
 };
 
 /**
  * State cache
  */
 
-struct r300_depthbuffer_state {
-	GLfloat scale;
-};
-
-struct r300_stencilbuffer_state {
-	GLboolean hw_stencil;
-};
-
 /* Vertex shader state */
 
 /* Perhaps more if we store programs in vmem? */
@@ -593,73 +399,60 @@ struct r300_stencilbuffer_state {
 #define STATE_R300_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
 #define STATE_R300_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
 
-struct r300_vertex_shader_fragment {
-	int length;
-	union {
-		GLuint d[VSF_MAX_FRAGMENT_LENGTH];
-		float f[VSF_MAX_FRAGMENT_LENGTH];
-		GLuint i[VSF_MAX_FRAGMENT_LENGTH];
-	} body;
-};
-
-struct r300_vertex_shader_state {
-	struct r300_vertex_shader_fragment program;
-};
-
-extern int hw_tcl_on;
-
 #define COLOR_IS_RGBA
 #define TAG(x) r300##x
 #include "tnl_dd/t_dd_vertex.h"
 #undef TAG
 
-//#define CURRENT_VERTEX_SHADER(ctx) (ctx->VertexProgram._Current)
-#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->selected_vp)
-
-/* Should but doesnt work */
-//#define CURRENT_VERTEX_SHADER(ctx) (R300_CONTEXT(ctx)->curr_vp)
-
-/* r300_vertex_shader_state and r300_vertex_program should probably be merged together someday.
- * Keeping them them seperate for now should ensure fixed pipeline keeps functioning properly.
- */
-
-struct r300_vertex_program_key {
-	GLuint InputsRead;
-	GLuint OutputsWritten;
-	GLuint OutputsAdded;
-};
-
 struct r300_vertex_program {
+	struct gl_vertex_program *Base;
 	struct r300_vertex_program *next;
-	struct r300_vertex_program_key key;
-	int translated;
 
-	struct r300_vertex_shader_fragment program;
+	struct r300_vertex_program_key {
+		GLuint FpReads;
+		GLuint FogAttr;
+		GLuint WPosAttr;
+	} key;
+	
+	struct r300_vertex_shader_hw_code {
+		int length;
+		union {
+			GLuint d[VSF_MAX_FRAGMENT_LENGTH];
+			float f[VSF_MAX_FRAGMENT_LENGTH];
+		} body;
+	} hw_code;
+
+	GLboolean translated;
+	GLboolean error;
 
 	int pos_end;
 	int num_temporaries;	/* Number of temp vars used by program */
-	int wpos_idx;
 	int inputs[VERT_ATTRIB_MAX];
 	int outputs[VERT_RESULT_MAX];
-	int native;
-	int ref_count;
-	int use_ref_count;
 };
 
 struct r300_vertex_program_cont {
-	struct gl_vertex_program mesa_program;	/* Must be first */
-	struct r300_vertex_shader_fragment params;
+	/* This is the unmodified vertex program mesa provided us with.
+	 * We need to keep it unchanged because we may need to create another
+	 * hw specific vertex program based on this.
+	 */
+	struct gl_vertex_program mesa_program;
+	/* This is the list of hw specific vertex programs derived from mesa_program */
 	struct r300_vertex_program *progs;
 };
 
-#define PFS_MAX_ALU_INST	64
-#define PFS_MAX_TEX_INST	64
-#define PFS_MAX_TEX_INDIRECT 4
-#define PFS_NUM_TEMP_REGS	32
-#define PFS_NUM_CONST_REGS	16
+#define R300_PFS_MAX_ALU_INST	64
+#define R300_PFS_MAX_TEX_INST	32
+#define R300_PFS_MAX_TEX_INDIRECT 4
+#define R300_PFS_NUM_TEMP_REGS	32
+#define R300_PFS_NUM_CONST_REGS	32
 
-struct r300_pfs_compile_state;
+#define R500_PFS_MAX_INST 512
+#define R500_PFS_NUM_TEMP_REGS 128
+#define R500_PFS_NUM_CONST_REGS 256
 
+struct r300_pfs_compile_state;
+struct r500_pfs_compile_state;
 
 /**
  * Stores state that influences the compilation of a fragment program.
@@ -702,7 +495,7 @@ struct r300_fragment_program_node {
 struct r300_fragment_program_code {
 	struct {
 		int length; /**< total # of texture instructions used */
-		GLuint inst[PFS_MAX_TEX_INST];
+		GLuint inst[R300_PFS_MAX_TEX_INST];
 	} tex;
 
 	struct {
@@ -712,7 +505,7 @@ struct r300_fragment_program_code {
 			GLuint inst1;
 			GLuint inst2;
 			GLuint inst3;
-		} inst[PFS_MAX_ALU_INST];
+		} inst[R300_PFS_MAX_ALU_INST];
 	} alu;
 
 	struct r300_fragment_program_node node[4];
@@ -723,53 +516,12 @@ struct r300_fragment_program_code {
 	 * Remember which program register a given hardware constant
 	 * belongs to.
 	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
+	struct prog_src_register constant[R300_PFS_NUM_CONST_REGS];
 	int const_nr;
 
 	int max_temp_idx;
 };
 
-/**
- * Store everything about a fragment program that is needed
- * to render with that program.
- */
-struct r300_fragment_program {
-	struct gl_fragment_program mesa_program;
-
-	GLboolean translated;
-	GLboolean error;
-
-	struct r300_fragment_program_external_state state;
-	struct r300_fragment_program_code code;
-
-	GLboolean WritesDepth;
-	GLuint optimization;
-};
-
-struct r500_pfs_compile_state;
-
-struct r500_fragment_program_external_state {
-	struct {
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is:
-		 *  0 - GL_LUMINANCE
-		 *  1 - GL_INTENSITY
-		 *  2 - GL_ALPHA
-		 * depending on the depth texture mode.
-		 */
-		GLuint depth_texture_mode : 2;
-
-		/**
-		 * If the sampler is used as a shadow sampler,
-		 * this field is (texture_compare_func - GL_NEVER).
-		 * [e.g. if compare function is GL_LEQUAL, this field is 3]
-		 *
-		 * Otherwise, this field is 0.
-		 */
-		GLuint texture_compare_func : 3;
-	} unit[16];
-};
 
 struct r500_fragment_program_code {
 	struct {
@@ -779,7 +531,7 @@ struct r500_fragment_program_code {
 		GLuint inst3;
 		GLuint inst4;
 		GLuint inst5;
-	} inst[512];
+	} inst[R500_PFS_MAX_INST];
 
 	int inst_offset;
 	int inst_end;
@@ -788,94 +540,63 @@ struct r500_fragment_program_code {
 	 * Remember which program register a given hardware constant
 	 * belongs to.
 	 */
-	struct prog_src_register constant[PFS_NUM_CONST_REGS];
+	struct prog_src_register constant[R500_PFS_NUM_CONST_REGS];
 	int const_nr;
 
 	int max_temp_idx;
 };
 
-struct r500_fragment_program {
-	struct gl_fragment_program mesa_program;
+/**
+* Store everything about a fragment program that is needed
+* to render with that program.
+*/
+struct r300_fragment_program {
+	struct gl_program *Base;
 
-	GLcontext *ctx;
 	GLboolean translated;
 	GLboolean error;
 
-	struct r500_fragment_program_external_state state;
-	struct r500_fragment_program_code code;
+	struct r300_fragment_program_external_state state;
+	union rX00_fragment_program_code {
+		struct r300_fragment_program_code r300;
+		struct r500_fragment_program_code r500;
+	} code;
 
 	GLboolean writes_depth;
-
 	GLuint optimization;
-};
-
-#define R300_MAX_AOS_ARRAYS		16
-
-#define REG_COORDS	0
-#define REG_COLOR0	1
-#define REG_TEX0	2
-
-struct r300_state {
-	struct r300_depthbuffer_state depth;
-	struct r300_texture_state texture;
-	int sw_tcl_inputs[VERT_ATTRIB_MAX];
-	struct r300_vertex_shader_state vertex_shader;
-	struct r300_dma_region aos[R300_MAX_AOS_ARRAYS];
-	int aos_count;
 
-	GLuint *Elts;
-	struct r300_dma_region elt_dma;
+	struct r300_fragment_program *next;
 
-	struct r300_dma_region swtcl_dma;
-	DECLARE_RENDERINPUTS(render_inputs_bitset);	/* actual render inputs that R300 was configured for.
-							   They are the same as tnl->render_inputs for fixed pipeline */
+	/* attribute that we are sending the WPOS in */
+	gl_frag_attrib wpos_attr;
+	/* attribute that we are sending the fog coordinate in */
+	gl_frag_attrib fog_attr;
+};
 
-	struct r300_stencilbuffer_state stencil;
+struct r300_fragment_program_cont {
+	/* This is the unmodified fragment program mesa provided us with.
+	 * We need to keep it unchanged because we may need to create another
+	 * hw specific fragment program based on this.
+	 */
+	struct gl_fragment_program Base;
+	/* This is the list of hw specific fragment programs derived from Base */
+	struct r300_fragment_program *progs;
+};
 
+struct r300_fragment_program_compiler {
+	r300ContextPtr r300;
+	struct r300_fragment_program *fp;
+	union rX00_fragment_program_code *code;
+	struct gl_program *program;
 };
 
-#define R300_FALLBACK_NONE 0
-#define R300_FALLBACK_TCL 1
-#define R300_FALLBACK_RAST 2
+#define R300_MAX_AOS_ARRAYS		16
+
 
 /* r300_swtcl.c
  */
 struct r300_swtcl_info {
-   GLuint RenderIndex;
-
-   /**
-    * Size of a hardware vertex.  This is calculated when \c ::vertex_attrs is
-    * installed in the Mesa state vector.
-    */
-   GLuint vertex_size;
-
-   /**
-    * Attributes instructing the Mesa TCL pipeline where / how to put vertex
-    * data in the hardware buffer.
-    */
-   struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
-
-   /**
-    * Number of elements of \c ::vertex_attrs that are actually used.
-    */
-   GLuint vertex_attr_count;
-
-   /**
-    * Cached pointer to the buffer where Mesa will store vertex data.
-    */
-   GLubyte *verts;
-
-   /* Fallback rasterization functions
-    */
-  //   r200_point_func draw_point;
-  //   r200_line_func draw_line;
-  //   r200_tri_func draw_tri;
-
-   GLuint hw_primitive;
-   GLenum render_primitive;
-   GLuint numverts;
-
-   /**
+  /*
     * Offset of the 4UB color data within a hardware (swtcl) vertex.
     */
    GLuint coloroffset;
@@ -884,15 +605,44 @@ struct r300_swtcl_info {
     * Offset of the 3UB specular color data within a hardware (swtcl) vertex.
     */
    GLuint specoffset;
+};
 
-   /**
-    * Should Mesa project vertex data or will the hardware do it?
-    */
-   GLboolean needproj;
+struct r300_vtable {
+	void (* SetupRSUnit)(GLcontext *ctx);
+	void (* SetupFragmentShaderTextures)(GLcontext *ctx, int *tmu_mappings);
+	GLboolean (* BuildFragmentProgramHwCode)(struct r300_fragment_program_compiler *compiler);
+	void (* FragmentProgramDump)(union rX00_fragment_program_code *code);
+	void (* SetupPixelShader)(GLcontext *ctx);
+};
 
-   struct r300_dma_region indexed_verts;
+struct r300_vertex_buffer {
+	struct vertex_attribute {
+		/* generic */
+		GLubyte element;
+		GLvoid *data;
+		GLboolean free_needed;
+		GLuint stride;
+		GLuint dwords;
+		GLubyte size; /* number of components */
+
+		/* hw specific */
+		uint32_t data_type:4;
+		uint32_t dst_loc:5;
+		uint32_t _signed:1;
+		uint32_t normalize:1;
+		uint32_t swizzle:12;
+		uint32_t write_mask:4;
+	} attribs[VERT_ATTRIB_MAX];
+
+	GLubyte num_attribs;
 };
 
+struct r300_index_buffer {
+	GLvoid *ptr;
+	GLboolean is_32bit;
+	GLboolean free_needed;
+	GLuint count;
+};
 
 /**
  * \brief R300 context structure.
@@ -900,46 +650,34 @@ struct r300_swtcl_info {
 struct r300_context {
 	struct radeon_context radeon;	/* parent class, must be first */
 
+	struct r300_vtable vtbl;
+
 	struct r300_hw_state hw;
-	struct r300_cmdbuf cmdbuf;
-	struct r300_state state;
-	struct gl_vertex_program *curr_vp;
+
 	struct r300_vertex_program *selected_vp;
+	struct r300_fragment_program *selected_fp;
 
 	/* Vertex buffers
 	 */
-	struct r300_dma dma;
-	GLboolean save_on_next_unlock;
-	GLuint NewGLState;
-
-	/* Texture object bookkeeping
-	 */
-	unsigned nr_heaps;
-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
-	driTextureObject swapped;
-	int texture_depth;
-	float initialMaxAnisotropy;
-
-	/* Clientdata textures;
-	 */
-	GLuint prefer_gart_client_texturing;
-
-#ifdef USER_BUFFERS
-	struct r300_memory_manager *rmm;
-#endif
-
 	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
 	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
 
-	GLboolean disable_lowimpact_fallback;
-
-	DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+	struct r300_options {
+		uint32_t conformance_mode:1;
+		uint32_t hw_tcl_enabled:1;
+		uint32_t s3tc_force_enabled:1;
+		uint32_t s3tc_force_disabled:1;
+		uint32_t stencil_two_side_disabled:1;
+	} options;
+	
 	struct r300_swtcl_info swtcl;
-};
+	struct r300_vertex_buffer vbuf;
+	struct r300_index_buffer ind_buf;
+	GLboolean vap_flush_needed;
 
-struct r300_buffer_object {
-	struct gl_buffer_object mesa_obj;
-	int id;
+	uint32_t fallback;
+
+	DECLARE_RENDERINPUTS(render_inputs_bitset);
 };
 
 #define R300_CONTEXT(ctx)		((r300ContextPtr)(ctx->DriverCtx))
@@ -949,15 +687,13 @@ extern GLboolean r300CreateContext(const __GLcontextModes * glVisual,
 				   __DRIcontextPrivate * driContextPriv,
 				   void *sharedContextPrivate);
 
-extern void r300SelectVertexShader(r300ContextPtr r300);
 extern void r300InitShaderFuncs(struct dd_function_table *functions);
-extern int r300VertexProgUpdateParams(GLcontext * ctx,
-				      struct r300_vertex_program_cont *vp,
-				      float *dst);
-
-#define RADEON_D_CAPTURE 0
-#define RADEON_D_PLAYBACK 1
-#define RADEON_D_PLAYBACK_RAW 2
-#define RADEON_D_T 3
+
+extern void r300InitShaderFunctions(r300ContextPtr r300);
+
+extern void r300InitDraw(GLcontext *ctx);
+
+#define r300PackFloat32 radeonPackFloat32
+#define r300PackFloat24 radeonPackFloat24
 
 #endif				/* __R300_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_draw.c b/src/mesa/drivers/dri/r300/r300_draw.c
new file mode 100644
index 00000000000..9769ff53994
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_draw.c
@@ -0,0 +1,501 @@
+/**************************************************************************
+ *
+ * Copyright 2009 Maciej Cencora
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL THE AUTHOR(S) AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include <stdlib.h>
+
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/state.h"
+#include "main/api_validate.h"
+#include "main/enums.h"
+
+#include "r300_reg.h"
+#include "r300_context.h"
+#include "r300_emit.h"
+#include "r300_render.h"
+#include "r300_state.h"
+#include "r300_tex.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "vbo/vbo_context.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+
+static void r300FixupIndexBuffer(GLcontext *ctx, const struct _mesa_index_buffer *mesa_ind_buf, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_index_buffer *ind_buf = &r300->ind_buf;
+	GLvoid *src_ptr;
+
+	if (!mesa_ind_buf) {
+		ind_buf->ptr = NULL;
+		return;
+	}
+
+	ind_buf->count = mesa_ind_buf->count;
+	if (mesa_ind_buf->obj->Name && !mesa_ind_buf->obj->Pointer) {
+		bo[*nr_bo] = mesa_ind_buf->obj;
+		(*nr_bo)++;
+		ctx->Driver.MapBuffer(ctx, GL_ELEMENT_ARRAY_BUFFER, GL_READ_ONLY_ARB, mesa_ind_buf->obj);
+		assert(mesa_ind_buf->obj->Pointer != NULL);
+	}
+	src_ptr = ADD_POINTERS(mesa_ind_buf->obj->Pointer, mesa_ind_buf->ptr);
+
+	if (mesa_ind_buf->type == GL_UNSIGNED_BYTE) {
+		GLubyte *in = (GLubyte *)src_ptr;
+		GLuint *out = _mesa_malloc(sizeof(GLushort) * ((mesa_ind_buf->count + 1) & ~1));
+		int i;
+
+		ind_buf->ptr = out;
+
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = in[i] | in[i + 1] << 16;
+		}
+
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+
+		ind_buf->free_needed = GL_TRUE;
+		ind_buf->is_32bit = GL_FALSE;
+	} else if (mesa_ind_buf->type == GL_UNSIGNED_SHORT) {
+#if MESA_BIG_ENDIAN
+		GLushort *in = (GLushort *)src_ptr;
+		GLuint *out = _mesa_malloc(sizeof(GLushort) *
+					   ((mesa_ind_buf->count + 1) & ~1));
+		int i;
+
+		ind_buf->ptr = out;
+
+		for (i = 0; i + 1 < mesa_ind_buf->count; i += 2) {
+			*out++ = in[i] | in[i + 1] << 16;
+		}
+
+		if (i < mesa_ind_buf->count) {
+			*out++ = in[i];
+		}
+
+		ind_buf->free_needed = GL_TRUE;
+#else
+		ind_buf->ptr = src_ptr;
+		ind_buf->free_needed = GL_FALSE;
+#endif
+		ind_buf->is_32bit = GL_FALSE;
+	} else {
+		ind_buf->ptr = src_ptr;
+		ind_buf->free_needed = GL_FALSE;
+		ind_buf->is_32bit = GL_TRUE;
+	}
+}
+
+static int getTypeSize(GLenum type)
+{
+	switch (type) {
+		case GL_DOUBLE:
+			return sizeof(GLdouble);
+		case GL_FLOAT:
+			return sizeof(GLfloat);
+		case GL_INT:
+			return sizeof(GLint);
+		case GL_UNSIGNED_INT:
+			return sizeof(GLuint);
+		case GL_SHORT:
+			return sizeof(GLshort);
+		case GL_UNSIGNED_SHORT:
+			return sizeof(GLushort);
+		case GL_BYTE:
+			return sizeof(GLbyte);
+		case GL_UNSIGNED_BYTE:
+			return sizeof(GLubyte);
+		default:
+			assert(0);
+			return 0;
+	}
+}
+
+#define CONVERT( TYPE, MACRO ) do {		\
+	GLuint i, j, sz;				\
+	sz = input->Size;				\
+	if (input->Normalized) {			\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = MACRO(*in);		\
+				in++;				\
+			}					\
+			src_ptr += stride;			\
+		}						\
+	} else {					\
+		for (i = 0; i < count; i++) {		\
+			const TYPE *in = (TYPE *)src_ptr;		\
+			for (j = 0; j < sz; j++) {		\
+				*dst_ptr++ = (GLfloat)(*in);		\
+				in++;				\
+			}					\
+			src_ptr += stride;			\
+		}						\
+	}						\
+} while (0)
+
+static void r300TranslateAttrib(GLcontext *ctx, GLuint attr, int count, const struct gl_client_array *input, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	struct vertex_attribute r300_attr;
+	const void *src_ptr;
+	GLenum type;
+	GLuint stride;
+
+	if (input->BufferObj->Name) {
+		if (!input->BufferObj->Pointer) {
+			bo[*nr_bo] = input->BufferObj;
+			(*nr_bo)++;
+			ctx->Driver.MapBuffer(ctx, GL_ARRAY_BUFFER, GL_READ_ONLY_ARB, input->BufferObj);
+			assert(input->BufferObj->Pointer != NULL);
+		}
+
+		src_ptr = ADD_POINTERS(input->BufferObj->Pointer, input->Ptr);
+	} else
+		src_ptr = input->Ptr;
+
+	stride = (input->StrideB == 0) ? getTypeSize(input->Type) * input->Size : input->StrideB;
+
+	if (input->Type == GL_DOUBLE || input->Type == GL_UNSIGNED_INT || input->Type == GL_INT ||
+#if MESA_BIG_ENDIAN
+	    getTypeSize(input->Type) != 4 ||
+#endif
+	    stride < 4) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS) {
+			fprintf(stderr, "%s: Converting vertex attributes, attribute data format %x,", __FUNCTION__, input->Type);
+			fprintf(stderr, "stride %d, components %d\n", stride, input->Size);
+		}
+
+		GLfloat *dst_ptr, *tmp;
+
+		/* Convert value for first element only */
+		if (input->StrideB == 0)
+			count = 1;
+
+		tmp = dst_ptr = _mesa_malloc(sizeof(GLfloat) * input->Size * count);
+
+		switch (input->Type) {
+			case GL_DOUBLE:
+				CONVERT(GLdouble, (GLfloat));
+				break;
+			case GL_UNSIGNED_INT:
+				CONVERT(GLuint, UINT_TO_FLOAT);
+				break;
+			case GL_INT:
+				CONVERT(GLint, INT_TO_FLOAT);
+				break;
+			case GL_UNSIGNED_SHORT:
+				CONVERT(GLushort, USHORT_TO_FLOAT);
+				break;
+			case GL_SHORT:
+				CONVERT(GLshort, SHORT_TO_FLOAT);
+				break;
+			case GL_UNSIGNED_BYTE:
+				assert(input->Format != GL_BGRA);
+				CONVERT(GLubyte, UBYTE_TO_FLOAT);
+				break;
+			case GL_BYTE:
+				CONVERT(GLbyte, BYTE_TO_FLOAT);
+				break;
+			default:
+				assert(0);
+				break;
+		}
+
+		type = GL_FLOAT;
+		r300_attr.free_needed = GL_TRUE;
+		r300_attr.data = tmp;
+		if (input->StrideB == 0) {
+			r300_attr.stride = 0;
+		} else {
+			r300_attr.stride = sizeof(GLfloat) * input->Size;
+		}
+		r300_attr.dwords = input->Size;
+	} else {
+		type = input->Type;
+		r300_attr.free_needed = GL_FALSE;
+		r300_attr.data = (GLvoid *)src_ptr;
+		r300_attr.stride = input->StrideB;
+		r300_attr.dwords = (getTypeSize(type) * input->Size  + 3)/ 4;
+	}
+
+	r300_attr.size = input->Size;
+	r300_attr.element = attr;
+	r300_attr.dst_loc = vbuf->num_attribs;
+
+	switch (type) {
+		case GL_FLOAT:
+			switch (input->Size) {
+				case 1: r300_attr.data_type = R300_DATA_TYPE_FLOAT_1; break;
+				case 2: r300_attr.data_type = R300_DATA_TYPE_FLOAT_2; break;
+				case 3: r300_attr.data_type = R300_DATA_TYPE_FLOAT_3; break;
+				case 4: r300_attr.data_type = R300_DATA_TYPE_FLOAT_4; break;
+			}
+			r300_attr._signed = 0;
+			r300_attr.normalize = 0;
+			break;
+		case GL_SHORT:
+			r300_attr._signed = 1;
+			r300_attr.normalize = input->Normalized;
+			switch (input->Size) {
+				case 1:
+				case 2:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_2;
+					break;
+				case 3:
+				case 4:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_4;
+					break;
+			}
+			break;
+		case GL_BYTE:
+			r300_attr._signed = 1;
+			r300_attr.normalize = input->Normalized;
+			r300_attr.data_type = R300_DATA_TYPE_BYTE;
+			break;
+		case GL_UNSIGNED_SHORT:
+			r300_attr._signed = 0;
+			r300_attr.normalize = input->Normalized;
+			switch (input->Size) {
+				case 1:
+				case 2:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_2;
+					break;
+				case 3:
+				case 4:
+					r300_attr.data_type = R300_DATA_TYPE_SHORT_4;
+					break;
+			}
+			break;
+		case GL_UNSIGNED_BYTE:
+			r300_attr._signed = 0;
+			r300_attr.normalize = input->Normalized;
+			if (input->Format == GL_BGRA)
+				r300_attr.data_type = R300_DATA_TYPE_D3DCOLOR;
+			else
+				r300_attr.data_type = R300_DATA_TYPE_BYTE;
+			break;
+
+		default:
+		case GL_DOUBLE:
+		case GL_INT:
+		case GL_UNSIGNED_INT:
+			assert(0);
+			break;
+	}
+
+	switch (input->Size) {
+		case 4:
+			r300_attr.swizzle = SWIZZLE_XYZW;
+			break;
+		case 3:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+			break;
+		case 2:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
+			break;
+		case 1:
+			r300_attr.swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+			break;
+	}
+
+	r300_attr.write_mask = MASK_XYZW;
+
+	vbuf->attribs[vbuf->num_attribs] = r300_attr;
+	++vbuf->num_attribs;
+}
+
+static void r300SetVertexFormat(GLcontext *ctx, const struct gl_client_array *arrays[], int count, struct gl_buffer_object **bo, GLuint *nr_bo)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+
+	{
+		int i, tmp;
+
+		tmp = r300->selected_vp->Base->Base.InputsRead;
+		i = 0;
+		vbuf->num_attribs = 0;
+		while (tmp) {
+			/* find first enabled bit */
+			while (!(tmp & 1)) {
+				tmp >>= 1;
+				++i;
+			}
+
+			r300TranslateAttrib(ctx, i, count, arrays[i], bo, nr_bo);
+
+			tmp >>= 1;
+			++i;
+		}
+	}
+
+	r300SwitchFallback(ctx, R300_FALLBACK_AOS_LIMIT, vbuf->num_attribs > R300_MAX_AOS_ARRAYS);
+	if (r300->fallback)
+		return;
+
+	{
+		int i;
+
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			rcommon_emit_vector(ctx, &r300->radeon.tcl.aos[i],
+						vbuf->attribs[i].data, vbuf->attribs[i].dwords,
+						vbuf->attribs[i].stride, count);
+		}
+
+		r300->radeon.tcl.aos_count = vbuf->num_attribs;
+	}
+}
+
+static void r300FreeData(GLcontext *ctx, struct gl_buffer_object **bo, GLuint nr_bo)
+{
+	{
+		struct r300_vertex_buffer *vbuf = &R300_CONTEXT(ctx)->vbuf;
+		int i;
+
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			if (vbuf->attribs[i].free_needed)
+				_mesa_free(vbuf->attribs[i].data);
+		}
+	}
+
+	{
+		struct r300_index_buffer *ind_buf = &R300_CONTEXT(ctx)->ind_buf;
+		if (ind_buf->free_needed)
+			_mesa_free(ind_buf->ptr);
+	}
+
+	{
+		int i;
+
+		for (i = 0; i < nr_bo; ++i) {
+			ctx->Driver.UnmapBuffer(ctx, 0, bo[i]);
+		}
+	}
+}
+
+static GLboolean r300TryDrawPrims(GLcontext *ctx,
+					 const struct gl_client_array *arrays[],
+					 const struct _mesa_prim *prim,
+					 GLuint nr_prims,
+					 const struct _mesa_index_buffer *ib,
+					 GLuint min_index,
+					 GLuint max_index )
+{
+	struct r300_context *r300 = R300_CONTEXT(ctx);
+	struct gl_buffer_object *bo[VERT_ATTRIB_MAX+1];
+	GLuint i, nr_bo = 0;
+
+	if (ctx->NewState)
+		_mesa_update_state( ctx );
+
+	if (r300->options.hw_tcl_enabled)
+		_tnl_UpdateFixedFunctionProgram(ctx);
+
+	r300UpdateShaders(r300);
+
+	r300SwitchFallback(ctx, R300_FALLBACK_INVALID_BUFFERS, !r300ValidateBuffers(ctx));
+
+	r300FixupIndexBuffer(ctx, ib, bo, &nr_bo);
+
+	/* ensure we have the cmd buf space in advance to cover
+ 	 * the state + DMA AOS pointers */
+	rcommonEnsureCmdBufSpace(&r300->radeon,
+                           r300->radeon.hw.max_state_size + (50*sizeof(int)),
+                           __FUNCTION__);
+
+	r300SetVertexFormat(ctx, arrays, max_index + 1, bo, &nr_bo);
+
+	if (r300->fallback)
+		return GL_FALSE;
+
+	r300SetupVAP(ctx, r300->selected_vp->Base->Base.InputsRead, r300->selected_vp->Base->Base.OutputsWritten);
+
+	r300UpdateShaderStates(r300);
+
+	r300EmitCacheFlush(r300);
+	radeonEmitState(&r300->radeon);
+
+	for (i = 0; i < nr_prims; ++i) {
+		r300RunRenderPrimitive(ctx, prim[i].start, prim[i].start + prim[i].count, prim[i].mode);
+	}
+
+	r300EmitCacheFlush(r300);
+
+	radeonReleaseArrays(ctx, ~0);
+
+	r300FreeData(ctx, bo, nr_bo);
+
+	return GL_TRUE;
+}
+
+static void r300DrawPrims(GLcontext *ctx,
+			 const struct gl_client_array *arrays[],
+			 const struct _mesa_prim *prim,
+			 GLuint nr_prims,
+			 const struct _mesa_index_buffer *ib,
+			 GLuint min_index,
+			 GLuint max_index)
+{
+	struct split_limits limits;
+	GLboolean retval;
+
+	if (ib)
+		limits.max_verts = 0xffffffff;
+	else
+		limits.max_verts = 65535;
+
+	limits.max_indices = 65535;
+	limits.max_vb_size = 1024*1024;
+
+	if (min_index) {
+		vbo_rebase_prims( ctx, arrays, prim, nr_prims, ib, min_index, max_index, r300DrawPrims );
+		return;
+	}
+	if ((ib && ib->count > 65535)) {
+		vbo_split_prims (ctx, arrays, prim, nr_prims, ib, min_index, max_index, r300DrawPrims, &limits);
+		return;
+	}
+
+	/* Make an attempt at drawing */
+	retval = r300TryDrawPrims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+
+	/* If failed run tnl pipeline - it should take care of fallbacks */
+	if (!retval)
+		_tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
+}
+
+void r300InitDraw(GLcontext *ctx)
+{
+	struct vbo_context *vbo = vbo_context(ctx);
+
+	vbo->draw_prims = r300DrawPrims;
+}
diff --git a/src/mesa/drivers/dri/r300/r300_emit.c b/src/mesa/drivers/dri/r300/r300_emit.c
index 28c3157427c..feb3370f372 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_emit.c
@@ -31,6 +31,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \file
  *
  * \author Keith Whitwell <[email protected]>
+ * \author Maciej Cencora <[email protected]>
  */
 
 #include "main/glheader.h"
@@ -46,222 +47,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_context.h"
 
 #include "r300_context.h"
-#include "radeon_ioctl.h"
 #include "r300_state.h"
 #include "r300_emit.h"
 #include "r300_ioctl.h"
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
-
-#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
-    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
-    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
-    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
-    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
-    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
-#error Cannot change these!
-#endif
-
-#define DEBUG_ALL DEBUG_VERTS
-
-#if defined(USE_X86_ASM)
-#define COPY_DWORDS( dst, src, nr )					\
-do {									\
-	int __tmp;							\
-	__asm__ __volatile__( "rep ; movsl"				\
-			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
-			      : "0" (nr),				\
-			        "D" ((long)dst),			\
-			        "S" ((long)src) );			\
-} while (0)
-#else
-#define COPY_DWORDS( dst, src, nr )		\
-do {						\
-   int j;					\
-   for ( j = 0 ; j < nr ; j++ )			\
-      dst[j] = ((int *)src)[j];			\
-   dst += nr;					\
-} while (0)
-#endif
-
-static void r300EmitVec4(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 4)
-		COPY_DWORDS(out, data, count);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out++;
-			data += stride;
-		}
-}
-
-static void r300EmitVec8(GLcontext * ctx, struct r300_dma_region *rvb,
-			 GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 8)
-		COPY_DWORDS(out, data, count * 2);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out += 2;
-			data += stride;
-		}
-}
-
-static void r300EmitVec12(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 12)
-		COPY_DWORDS(out, data, count * 3);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out[2] = *(int *)(data + 8);
-			out += 3;
-			data += stride;
-		}
-}
-
-static void r300EmitVec16(GLcontext * ctx, struct r300_dma_region *rvb,
-			  GLvoid * data, int stride, int count)
-{
-	int i;
-	int *out = (int *)(rvb->address + rvb->start);
-
-	if (RADEON_DEBUG & DEBUG_VERTS)
-		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-			__FUNCTION__, count, stride, (void *)out, (void *)data);
-
-	if (stride == 16)
-		COPY_DWORDS(out, data, count * 4);
-	else
-		for (i = 0; i < count; i++) {
-			out[0] = *(int *)data;
-			out[1] = *(int *)(data + 4);
-			out[2] = *(int *)(data + 8);
-			out[3] = *(int *)(data + 12);
-			out += 4;
-			data += stride;
-		}
-}
-
-static void r300EmitVec(GLcontext * ctx, struct r300_dma_region *rvb,
-			GLvoid * data, int size, int stride, int count)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	if (stride == 0) {
-		r300AllocDmaRegion(rmesa, rvb, size * 4, 4);
-		count = 1;
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = 0;
-	} else {
-		r300AllocDmaRegion(rmesa, rvb, size * count * 4, 4);
-		rvb->aos_offset = GET_START(rvb);
-		rvb->aos_stride = size;
-	}
-
-	switch (size) {
-	case 1:
-		r300EmitVec4(ctx, rvb, data, stride, count);
-		break;
-	case 2:
-		r300EmitVec8(ctx, rvb, data, stride, count);
-		break;
-	case 3:
-		r300EmitVec12(ctx, rvb, data, stride, count);
-		break;
-	case 4:
-		r300EmitVec16(ctx, rvb, data, stride, count);
-		break;
-	default:
-		assert(0);
-		break;
-	}
-}
-
-#define DW_SIZE(x) ((inputs[tab[(x)]] << R300_DST_VEC_LOC_SHIFT) |	\
-		    (attribptr[tab[(x)]]->size - 1) << R300_DATA_TYPE_0_SHIFT)
-
-GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr)
-{
-	GLuint i, dw;
-
-	/* type, inputs, stop bit, size */
-	for (i = 0; i < nr; i += 2) {
-		/* make sure input is valid, would lockup the gpu */
-		assert(inputs[tab[i]] != -1);
-		dw = (R300_SIGNED | DW_SIZE(i));
-		if (i + 1 == nr) {
-			dw |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
-		} else {
-			assert(inputs[tab[i + 1]] != -1);
-			dw |= (R300_SIGNED |
-			       DW_SIZE(i + 1)) << R300_DATA_TYPE_1_SHIFT;
-			if (i + 2 == nr) {
-				dw |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
-			}
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
-
-static GLuint r300VAPInputRoute1Swizzle(int swizzle[4])
-{
-	return (swizzle[0] << R300_SWIZZLE_SELECT_X_SHIFT) |
-	    (swizzle[1] << R300_SWIZZLE_SELECT_Y_SHIFT) |
-	    (swizzle[2] << R300_SWIZZLE_SELECT_Z_SHIFT) |
-	    (swizzle[3] << R300_SWIZZLE_SELECT_W_SHIFT);
-}
-
-GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr)
-{
-	GLuint i, dw;
-
-	for (i = 0; i < nr; i += 2) {
-		dw = (r300VAPInputRoute1Swizzle(swizzle[i]) |
-		      ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-			R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE0_SHIFT;
-		if (i + 1 < nr) {
-			dw |= (r300VAPInputRoute1Swizzle(swizzle[i + 1]) |
-			       ((R300_WRITE_ENA_X | R300_WRITE_ENA_Y |
-				 R300_WRITE_ENA_Z | R300_WRITE_ENA_W) << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
-		}
-		dst[i >> 1] = dw;
-	}
-
-	return (nr + 1) >> 1;
-}
+#include "r300_render.h"
+#include "r300_swtcl.h"
 
 GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 {
@@ -272,7 +62,6 @@ GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead)
 
 GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	GLuint i, vic_1 = 0;
 
 	if (InputsRead & (1 << VERT_ATTRIB_POS))
@@ -284,281 +73,103 @@ GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead)
 	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
 		vic_1 |= R300_INPUT_CNTL_COLOR;
 
-	rmesa->state.texture.tc_count = 0;
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
 		if (InputsRead & (1 << (VERT_ATTRIB_TEX0 + i))) {
-			rmesa->state.texture.tc_count++;
 			vic_1 |= R300_INPUT_CNTL_TC0 << i;
 		}
 
 	return vic_1;
 }
 
-GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes)
 {
 	GLuint ret = 0;
 
-	if (OutputsWritten & (1 << VERT_RESULT_HPOS))
+	if (vp_writes & (1 << VERT_RESULT_HPOS))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL0))
+	if (vp_writes & (1 << VERT_RESULT_COL0))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_COL1))
+	if (vp_writes & (1 << VERT_RESULT_COL1))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_BFC0)
-	    || OutputsWritten & (1 << VERT_RESULT_BFC1))
-		ret |=
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT |
-		    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
+	/* Two sided lighting works only if all 4 colors are written */
+	if (vp_writes & (1 << VERT_RESULT_BFC0) || vp_writes & (1 << VERT_RESULT_BFC1))
+		ret |= R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_1_PRESENT |
+			   R300_VAP_OUTPUT_VTX_FMT_0__COLOR_2_PRESENT | R300_VAP_OUTPUT_VTX_FMT_0__COLOR_3_PRESENT;
 
-	if (OutputsWritten & (1 << VERT_RESULT_PSIZ))
+	if (vp_writes & (1 << VERT_RESULT_PSIZ))
 		ret |= R300_VAP_OUTPUT_VTX_FMT_0__PT_SIZE_PRESENT;
 
 	return ret;
 }
 
-GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten)
+GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes)
 {
 	GLuint i, ret = 0, first_free_texcoord = 0;
 
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		if (OutputsWritten & (1 << (VERT_RESULT_TEX0 + i))) {
-			ret |= (4 << (3 * i));
+		if (vp_writes & (1 << (VERT_RESULT_TEX0 + i))) {
+			ret |= (4 << (3 * first_free_texcoord));
 			++first_free_texcoord;
 		}
 	}
 
-	if (OutputsWritten & (1 << VERT_RESULT_FOGC)) {
-		if (first_free_texcoord > 8) {
-			fprintf(stderr, "\tout of free texcoords to write fog coord\n");
-			_mesa_exit(-1);
-		}
-		ret |= 4 << (3 * first_free_texcoord);
+	if (first_free_texcoord > 8) {
+		fprintf(stderr, "\tout of free texcoords\n");
+		_mesa_exit(-1);
 	}
 
 	return ret;
 }
 
-/* Emit vertex data to GART memory
- * Route inputs to the vertex processor
- * This function should never return R300_FALLBACK_TCL when using software tcl.
- */
-int r300EmitArrays(GLcontext * ctx)
+GLboolean r300EmitArrays(GLcontext * ctx)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
-	GLuint nr;
-	GLuint count = vb->Count;
-	GLuint i;
-	GLuint InputsRead = 0, OutputsWritten = 0;
-	int *inputs = NULL;
-	int vir_inputs[VERT_ATTRIB_MAX];
-	GLint tab[VERT_ATTRIB_MAX];
-	int swizzle[VERT_ATTRIB_MAX][4];
-	struct r300_vertex_program *prog =
-	    (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-
-	if (hw_tcl_on) {
-		inputs = prog->inputs;
-		InputsRead = prog->key.InputsRead;
-		OutputsWritten = prog->key.OutputsWritten;
-	} else {
-		inputs = rmesa->state.sw_tcl_inputs;
-
-		DECLARE_RENDERINPUTS(render_inputs_bitset);
-		RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
-
-		vb->AttribPtr[VERT_ATTRIB_POS] = vb->ClipPtr;
-
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS));
-		assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_NORMAL) == 0);
-		//assert(RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0));
-
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_POS)) {
-			InputsRead |= 1 << VERT_ATTRIB_POS;
-			OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		}
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_buffer *vbuf = &r300->vbuf;
+	GLuint InputsRead, OutputsWritten;
 
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR0)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR0;
-			OutputsWritten |= 1 << VERT_RESULT_COL0;
-		}
+	r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
 
-		if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_COLOR1)) {
-			InputsRead |= 1 << VERT_ATTRIB_COLOR1;
-			OutputsWritten |= 1 << VERT_RESULT_COL1;
-		}
+	r300SwitchFallback(ctx, R300_FALLBACK_AOS_LIMIT, vbuf->num_attribs > R300_MAX_AOS_ARRAYS);
+	if (r300->fallback & R300_RASTER_FALLBACK_MASK)
+		return GL_FALSE;
 
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST(render_inputs_bitset, _TNL_ATTRIB_TEX(i))) {
-				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
-				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-			}
-		}
+	{
+		struct vertex_buffer *mesa_vb = &TNL_CONTEXT(ctx)->vb;
+		GLuint attr, i;
 
-		for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-			if (InputsRead & (1 << i)) {
-				inputs[i] = nr++;
-			} else {
-				inputs[i] = -1;
-			}
+		for (i = 0; i < vbuf->num_attribs; i++) {
+			attr = vbuf->attribs[i].element;
+			rcommon_emit_vector(ctx, &r300->radeon.tcl.aos[i], mesa_vb->AttribPtr[attr]->data,
+					mesa_vb->AttribPtr[attr]->size, mesa_vb->AttribPtr[attr]->stride, mesa_vb->Count);
 		}
 
-		/* Fixed, apply to vir0 only */
-		memcpy(vir_inputs, inputs, VERT_ATTRIB_MAX * sizeof(int));
-		inputs = vir_inputs;
-		if (InputsRead & VERT_ATTRIB_POS)
-			inputs[VERT_ATTRIB_POS] = 0;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
-			inputs[VERT_ATTRIB_COLOR0] = 2;
-		if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
-			inputs[VERT_ATTRIB_COLOR1] = 3;
-		for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
-			if (InputsRead & (1 << i))
-				inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-
-		RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
-	}
+		r300->radeon.tcl.aos_count = vbuf->num_attribs;
 
-	assert(InputsRead);
-	assert(OutputsWritten);
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			tab[nr++] = i;
-		}
+		/* Fill index buffer info */
+		r300->ind_buf.ptr = mesa_vb->Elts;
+		r300->ind_buf.is_32bit = GL_TRUE;
+		r300->ind_buf.free_needed = GL_FALSE;
 	}
 
-	if (nr > R300_MAX_AOS_ARRAYS) {
-		return R300_FALLBACK_TCL;
-	}
-
-	for (i = 0; i < nr; i++) {
-		int ci, fix, found = 0;
-
-		swizzle[i][0] = SWIZZLE_ZERO;
-		swizzle[i][1] = SWIZZLE_ZERO;
-		swizzle[i][2] = SWIZZLE_ZERO;
-		swizzle[i][3] = SWIZZLE_ONE;
-
-		for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-			swizzle[i][ci] = ci;
-		}
+	r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
-		if (r300IsGartMemory(rmesa, vb->AttribPtr[tab[i]]->data, 4)) {
-			if (vb->AttribPtr[tab[i]]->stride % 4) {
-				return R300_FALLBACK_TCL;
-			}
-			rmesa->state.aos[i].address = (void *)(vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].start = 0;
-			rmesa->state.aos[i].aos_offset = r300GartOffsetFromVirtual(rmesa, vb->AttribPtr[tab[i]]->data);
-			rmesa->state.aos[i].aos_stride = vb->AttribPtr[tab[i]]->stride / 4;
-			rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
-		} else {
-			r300EmitVec(ctx, &rmesa->state.aos[i],
-				    vb->AttribPtr[tab[i]]->data,
-				    vb->AttribPtr[tab[i]]->size,
-				    vb->AttribPtr[tab[i]]->stride, count);
-		}
-
-		rmesa->state.aos[i].aos_size = vb->AttribPtr[tab[i]]->size;
-
-		for (fix = 0; fix <= 4 - vb->AttribPtr[tab[i]]->size; fix++) {
-			if ((rmesa->state.aos[i].aos_offset - _mesa_sizeof_type(GL_FLOAT) * fix) % 4) {
-				continue;
-			}
-			found = 1;
-			break;
-		}
-
-		if (found) {
-			if (fix > 0) {
-				WARN_ONCE("Feeling lucky?\n");
-			}
-			rmesa->state.aos[i].aos_offset -= _mesa_sizeof_type(GL_FLOAT) * fix;
-			for (ci = 0; ci < vb->AttribPtr[tab[i]]->size; ci++) {
-				swizzle[i][ci] += fix;
-			}
-		} else {
-			WARN_ONCE
-			    ("Cannot handle offset %x with stride %d, comp %d\n",
-			     rmesa->state.aos[i].aos_offset,
-			     rmesa->state.aos[i].aos_stride,
-			     vb->AttribPtr[tab[i]]->size);
-			return R300_FALLBACK_TCL;
-		}
-	}
-
-	/* Setup INPUT_ROUTE. */
-	R300_STATECHANGE(rmesa, vir[0]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-	    r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-			       vb->AttribPtr, inputs, tab, nr);
-	R300_STATECHANGE(rmesa, vir[1]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-	    r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-			       nr);
-
-	/* Setup INPUT_CNTL. */
-	R300_STATECHANGE(rmesa, vic);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-
-	/* Setup OUTPUT_VTX_FMT. */
-	R300_STATECHANGE(rmesa, vof);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] =
-	    r300VAPOutputCntl0(ctx, OutputsWritten);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] =
-	    r300VAPOutputCntl1(ctx, OutputsWritten);
-
-	rmesa->state.aos_count = nr;
-
-	return R300_FALLBACK_NONE;
-}
-
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
-
-	if (rmesa->state.elt_dma.buf)
-		r300_mem_use(rmesa, rmesa->state.elt_dma.buf->id);
-
-	for (i = 0; i < rmesa->state.aos_count; i++) {
-		if (rmesa->state.aos[i].buf)
-			r300_mem_use(rmesa, rmesa->state.aos[i].buf->id);
-	}
-}
-#endif
-
-void r300ReleaseArrays(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	int i;
-
-	r300ReleaseDmaRegion(rmesa, &rmesa->state.elt_dma, __FUNCTION__);
-	for (i = 0; i < rmesa->state.aos_count; i++) {
-		r300ReleaseDmaRegion(rmesa, &rmesa->state.aos[i], __FUNCTION__);
-	}
+	return GL_TRUE;
 }
 
 void r300EmitCacheFlush(r300ContextPtr rmesa)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	reg_start(R300_RB3D_DSTCACHE_CTLSTAT, 0);
-	e32(R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
-	    R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
-
-	reg_start(R300_ZB_ZCACHE_CTLSTAT, 0);
-	e32(R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
-	    R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+	BATCH_LOCALS(&rmesa->radeon);
+
+	BEGIN_BATCH_NO_AUTOSTATE(4);
+	OUT_BATCH_REGVAL(R300_RB3D_DSTCACHE_CTLSTAT,
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FREE_FREE_3D_TAGS |
+		R300_RB3D_DSTCACHE_CTLSTAT_DC_FLUSH_FLUSH_DIRTY_3D);
+	OUT_BATCH_REGVAL(R300_ZB_ZCACHE_CTLSTAT,
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FLUSH_FLUSH_AND_FREE |
+		R300_ZB_ZCACHE_CTLSTAT_ZC_FREE_FREE);
+	END_BATCH();
+	COMMIT_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_emit.h b/src/mesa/drivers/dri/r300/r300_emit.h
index 89d738339f8..3f8c60ffae1 100644
--- a/src/mesa/drivers/dri/r300/r300_emit.h
+++ b/src/mesa/drivers/dri/r300/r300_emit.h
@@ -44,28 +44,31 @@
 #include "r300_cmdbuf.h"
 #include "radeon_reg.h"
 
-/* TODO: move these defines (and the ones from DRM) into r300_reg.h and sync up
- * with DRM */
-#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
-#define CP_PACKET3( pkt, n )						\
-	(RADEON_CP_PACKET3 | (pkt) | ((n) << 16))
-
-static INLINE uint32_t cmdpacket0(int reg, int count)
+static INLINE uint32_t cmdpacket0(struct radeon_screen *rscrn,
+                                  int reg, int count)
 {
-	drm_r300_cmd_header_t cmd;
-
-	cmd.packet0.cmd_type = R300_CMD_PACKET0;
-	cmd.packet0.count = count;
-	cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
-	cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
-
-	return cmd.u;
+    if (!rscrn->kernel_mm) {
+	    drm_r300_cmd_header_t cmd;
+
+	cmd.u = 0;
+    	cmd.packet0.cmd_type = R300_CMD_PACKET0;
+	    cmd.packet0.count = count;
+    	cmd.packet0.reghi = ((unsigned int)reg & 0xFF00) >> 8;
+	    cmd.packet0.reglo = ((unsigned int)reg & 0x00FF);
+
+    	return cmd.u;
+    }
+    if (count) {
+        return CP_PACKET0(reg, count - 1);
+    }
+    return CP_PACKET2;
 }
 
-static INLINE uint32_t cmdvpu(int addr, int count)
+static INLINE uint32_t cmdvpu(struct radeon_screen *rscrn, int addr, int count)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.vpu.cmd_type = R300_CMD_VPU;
 	cmd.vpu.count = count;
 	cmd.vpu.adrhi = ((unsigned int)addr & 0xFF00) >> 8;
@@ -74,10 +77,12 @@ static INLINE uint32_t cmdvpu(int addr, int count)
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
+static INLINE uint32_t cmdr500fp(struct radeon_screen *rscrn,
+                                 int addr, int count, int type, int clamp)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.r500fp.cmd_type = R300_CMD_R500FP;
 	cmd.r500fp.count = count;
 	cmd.r500fp.adrhi_flags = ((unsigned int)addr & 0x100) >> 8;
@@ -88,181 +93,139 @@ static INLINE uint32_t cmdr500fp(int addr, int count, int type, int clamp)
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdpacket3(int packet)
+static INLINE uint32_t cmdpacket3(struct radeon_screen *rscrn, int packet)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.packet3.cmd_type = R300_CMD_PACKET3;
 	cmd.packet3.packet = packet;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdcpdelay(unsigned short count)
+static INLINE uint32_t cmdcpdelay(struct radeon_screen *rscrn,  
+                                  unsigned short count)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
+
 	cmd.delay.cmd_type = R300_CMD_CP_DELAY;
 	cmd.delay.count = count;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdwait(unsigned char flags)
+static INLINE uint32_t cmdwait(struct radeon_screen *rscrn,
+                               unsigned char flags)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.wait.cmd_type = R300_CMD_WAIT;
 	cmd.wait.flags = flags;
 
 	return cmd.u;
 }
 
-static INLINE uint32_t cmdpacify(void)
+static INLINE uint32_t cmdpacify(struct radeon_screen *rscrn)
 {
 	drm_r300_cmd_header_t cmd;
 
+	cmd.u = 0;
 	cmd.header.cmd_type = R300_CMD_END3D;
 
 	return cmd.u;
 }
 
 /**
- * Prepare to write a register value to register at address reg.
- * If num_extra > 0 then the following extra values are written
- * to registers with address +4, +8 and so on..
- */
-#define reg_start(reg, num_extra)					\
-	do {								\
-		int _n;							\
-		_n=(num_extra);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+2),				\
-					__FUNCTION__);			\
-		cmd_reserved=_n+2;					\
-		cmd_written=1;						\
-		cmd[0].i=cmdpacket0((reg), _n+1);			\
-	} while (0);
-
-/**
- * Emit GLuint freestyle
+ * Write the header of a packet3 to the command buffer.
+ * Outputs 2 dwords and expects (num_extra+1) additional dwords afterwards.
  */
-#define e32(dword)							\
-	do {								\
-		if(cmd_written<cmd_reserved) {				\
-			cmd[cmd_written].i=(dword);			\
-			cmd_written++;					\
-		} else {						\
-			fprintf(stderr,					\
-				"e32 but no previous packet "		\
-				"declaration.\n"			\
-				"Aborting! in %s::%s at line %d, "	\
-				"cmd_written=%d cmd_reserved=%d\n",	\
-				__FILE__, __FUNCTION__, __LINE__,	\
-				cmd_written, cmd_reserved);		\
-			_mesa_exit(-1);					\
-		}							\
+#define OUT_BATCH_PACKET3(packet, num_extra) do {\
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		\
+    	OUT_BATCH(cmdpacket3(b_l_rmesa->radeonScreen,\
+                  R300_CMD_PACKET3_RAW)); \
+    } else b_l_rmesa->cmdbuf.cs->section_cdw++;\
+	OUT_BATCH(CP_PACKET3((packet), (num_extra))); \
 	} while(0)
 
-#define	efloat(f) e32(r300PackFloat32(f))
-
-#define vsf_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+2;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdvpu((dest), _n/4);			\
-	} while (0);
-
-#define r500fp_start_fragment(dest, length)				\
-	do {								\
-		int _n;							\
-		_n = (length);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+1),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+1;					\
-		cmd_written =1;						\
-		cmd[0].i = cmdr500fp((dest), _n/6, 0, 0);		\
-	} while (0);
-
-#define start_packet3(packet, count)					\
-	{								\
-		int _n;							\
-		GLuint _p;						\
-		_n = (count);						\
-		_p = (packet);						\
-		cmd = (drm_radeon_cmd_header_t*)			\
-			r300AllocCmdBuf(rmesa,				\
-					(_n+3),				\
-					__FUNCTION__);			\
-		cmd_reserved = _n+3;					\
-		cmd_written = 2;					\
-		if(_n > 0x3fff) {					\
-			fprintf(stderr,"Too big packet3 %08x: cannot "	\
-				"store %d dwords\n",			\
-				_p, _n);				\
-			_mesa_exit(-1);					\
-		}							\
-		cmd[0].i = cmdpacket3(R300_CMD_PACKET3_RAW);		\
-		cmd[1].i = _p | ((_n & 0x3fff)<<16);			\
-	}
-
 /**
  * Must be sent to switch to 2d commands
  */
-void static INLINE end_3d(r300ContextPtr rmesa)
+void static INLINE end_3d(radeonContextPtr radeon)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(radeon);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].header.cmd_type = R300_CMD_END3D;
+	if (!radeon->radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdpacify(radeon->radeonScreen));
+		END_BATCH();
+	}
 }
 
 void static INLINE cp_delay(r300ContextPtr rmesa, unsigned short count)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdcpdelay(count);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdcpdelay(rmesa->radeon.radeonScreen, count));
+		END_BATCH();
+	}
 }
 
-void static INLINE cp_wait(r300ContextPtr rmesa, unsigned char flags)
+void static INLINE cp_wait(radeonContextPtr radeon, unsigned char flags)
 {
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	cmd =
-	    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa, 1, __FUNCTION__);
-	cmd[0].i = cmdwait(flags);
+	BATCH_LOCALS(radeon);
+	uint32_t wait_until;
+
+	if (!radeon->radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(1);
+		OUT_BATCH(cmdwait(radeon->radeonScreen, flags));
+		END_BATCH();
+	} else {
+		switch(flags) {
+		case R300_WAIT_2D:
+			wait_until = (1 << 14);
+			break;
+		case R300_WAIT_3D:
+			wait_until = (1 << 15);
+			break;
+		case R300_NEW_WAIT_2D_3D:
+			wait_until = (1 << 14) | (1 << 15);
+			break;
+		case R300_NEW_WAIT_2D_2D_CLEAN:
+			wait_until = (1 << 14) | (1 << 16) | (1 << 18);
+			break;
+		case R300_NEW_WAIT_3D_3D_CLEAN:
+			wait_until = (1 << 15) | (1 << 17) | (1 << 18);
+			break;
+		case R300_NEW_WAIT_2D_2D_CLEAN_3D_3D_CLEAN:
+			wait_until  = (1 << 14) | (1 << 16) | (1 << 18);
+			wait_until |= (1 << 15) | (1 << 17) | (1 << 18);
+			break;
+		default:
+			return;
+		}
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		OUT_BATCH(CP_PACKET0(RADEON_WAIT_UNTIL, 0));
+		OUT_BATCH(wait_until);
+		END_BATCH();
+	}
 }
 
-extern int r300EmitArrays(GLcontext * ctx);
-
-#ifdef USER_BUFFERS
-void r300UseArrays(GLcontext * ctx);
-#endif
+extern GLboolean r300EmitArrays(GLcontext * ctx);
 
-extern void r300ReleaseArrays(GLcontext * ctx);
 extern int r300PrimitiveType(r300ContextPtr rmesa, int prim);
 extern int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim);
 
 extern void r300EmitCacheFlush(r300ContextPtr rmesa);
 
-extern GLuint r300VAPInputRoute0(uint32_t * dst, GLvector4f ** attribptr,
-				 int *inputs, GLint * tab, GLuint nr);
-extern GLuint r300VAPInputRoute1(uint32_t * dst, int swizzle[][4], GLuint nr);
 extern GLuint r300VAPInputCntl0(GLcontext * ctx, GLuint InputsRead);
 extern GLuint r300VAPInputCntl1(GLcontext * ctx, GLuint InputsRead);
-extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint OutputsWritten);
-extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint OutputsWritten);
+extern GLuint r300VAPOutputCntl0(GLcontext * ctx, GLuint vp_writes);
+extern GLuint r300VAPOutputCntl1(GLcontext * ctx, GLuint vp_writes);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.c b/src/mesa/drivers/dri/r300/r300_fragprog.c
index 873cde44144..55c1cfe6317 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.c
@@ -25,32 +25,12 @@
  *
  */
 
-/**
- * \file
- *
- * Fragment program compiler. Perform transformations on the intermediate
- * representation until the program is in a form where we can translate
- * it more or less directly into machine-readable form.
- *
- * \author Ben Skeggs <[email protected]>
- * \author Jerome Glisse <[email protected]>
- */
+#include "r300_fragprog.h"
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
-#include "shader/prog_instruction.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
 
 #include "r300_context.h"
-#include "r300_fragprog.h"
 #include "r300_fragprog_swizzle.h"
-#include "r300_state.h"
-
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
 
 static void reset_srcreg(struct prog_src_register* reg)
 {
@@ -81,7 +61,7 @@ static struct prog_src_register shadow_ambient(struct gl_program *program, int t
  * \todo If/when r5xx uses the radeon_program architecture, this can probably
  * be reused.
  */
-static GLboolean transform_TEX(
+GLboolean r300_transform_TEX(
 	struct radeon_transform_context *t,
 	struct prog_instruction* orig_inst, void* data)
 {
@@ -160,6 +140,8 @@ static GLboolean transform_TEX(
 			inst.DstReg.Index = tempreg;
 			inst.DstReg.WriteMask = WRITEMASK_XYZW;
 			destredirect = GL_TRUE;
+		} else if (inst.SaturateMode) {
+			destredirect = GL_TRUE;
 		}
 	}
 
@@ -175,7 +157,7 @@ static GLboolean transform_TEX(
 		inst.SrcReg[0].File = PROGRAM_TEMPORARY;
 		inst.SrcReg[0].Index = tmpreg;
 	}
-	
+
 	tgt = radeonAppendInstructions(t->Program, 1);
 	_mesa_copy_instructions(tgt, &inst, 1);
 
@@ -239,6 +221,7 @@ static GLboolean transform_TEX(
 
 		tgt->Opcode = OPCODE_MOV;
 		tgt->DstReg = orig_inst->DstReg;
+		tgt->SaturateMode = inst.SaturateMode;
 		tgt->SrcReg[0].File = PROGRAM_TEMPORARY;
 		tgt->SrcReg[0].Index = inst.DstReg.Index;
 	}
@@ -246,241 +229,10 @@ static GLboolean transform_TEX(
 	return GL_TRUE;
 }
 
-
-static void update_params(r300ContextPtr r300, struct r300_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
-}
-
-
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-
-void r300TranslateFragmentShader(r300ContextPtr r300,
-				 struct r300_fragment_program *fp)
-{
-	struct r300_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r300_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformTrigSimple, 0 }
-		};
-		radeonLocalTransform(
-			r300->radeon.glCtx,
-			compiler.program,
-			3, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Fragment Program: After native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
-			.BuildSwizzle = &r300FPBuildSwizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		if (!r300FragmentProgramEmit(&compiler))
-			fp->error = GL_TRUE;
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, NULL);
-
-		if (!fp->error)
-			fp->translated = GL_TRUE;
-		if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
-			r300FragmentProgramDump(fp, &fp->code);
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-	}
-
-	update_params(r300, fp);
-}
-
 /* just some random things... */
-void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code)
+void r300FragmentProgramDump(union rX00_fragment_program_code *c)
 {
+	struct r300_fragment_program_code *code = &c->r300;
 	int n, i, j;
 	static int pc = 0;
 
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog.h b/src/mesa/drivers/dri/r300/r300_fragprog.h
index 94fb554fb37..5ce6f33cee7 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r300_fragprog.h
@@ -33,9 +33,6 @@
 #ifndef __R300_FRAGPROG_H_
 #define __R300_FRAGPROG_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
 #include "shader/program.h"
 #include "shader/prog_instruction.h"
 
@@ -105,28 +102,10 @@
 
 #endif
 
-struct r300_fragment_program;
+extern GLboolean r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
-extern void r300TranslateFragmentShader(r300ContextPtr r300,
-					struct r300_fragment_program *fp);
+extern void r300FragmentProgramDump(union rX00_fragment_program_code *c);
 
-
-/**
- * Used internally by the r300 fragment program code to store compile-time
- * only data.
- */
-struct r300_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r300_fragment_program *fp;
-	struct r300_fragment_program_code *code;
-	struct gl_program *program;
-};
-
-extern GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler);
-
-
-extern void r300FragmentProgramDump(
-	struct r300_fragment_program *fp,
-	struct r300_fragment_program_code *code);
+extern GLboolean r300_transform_TEX(struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.c b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
new file mode 100644
index 00000000000..f5c4c0f4a0e
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.c
@@ -0,0 +1,399 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+/**
+ * \file
+ *
+ * Fragment program compiler. Perform transformations on the intermediate
+ * representation until the program is in a form where we can translate
+ * it more or less directly into machine-readable form.
+ *
+ * \author Ben Skeggs <[email protected]>
+ * \author Jerome Glisse <[email protected]>
+ */
+
+#include "r300_fragprog_common.h"
+
+#include "shader/program.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
+
+#include "r300_state.h"
+#include "r300_fragprog.h"
+#include "r300_fragprog_swizzle.h"
+#include "r500_fragprog.h"
+
+#include "radeon_program.h"
+#include "radeon_program_alu.h"
+
+static void nqssadce_init(struct nqssadce_state* s)
+{
+	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
+	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
+}
+
+/**
+ * Transform the program to support fragment.position.
+ *
+ * Introduce a small fragment at the start of the program that will be
+ * the only code that directly reads the FRAG_ATTRIB_WPOS input.
+ * All other code pieces that reference that input will be rewritten
+ * to read from a newly allocated temporary.
+ *
+ */
+static void insert_WPOS_trailer(struct r300_fragment_program_compiler *compiler)
+{
+	GLuint InputsRead = compiler->fp->Base->InputsRead;
+
+	if (!(InputsRead & FRAG_BIT_WPOS)) {
+		compiler->fp->wpos_attr = FRAG_ATTRIB_MAX;
+		return;
+	}
+
+	static gl_state_index tokens[STATE_LENGTH] = {
+		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
+	};
+	struct prog_instruction *fpi;
+	GLuint window_index;
+	int i = 0;
+
+	for (i = FRAG_ATTRIB_TEX0; i <= FRAG_ATTRIB_TEX7; ++i)
+	{
+		if (!(InputsRead & (1 << i))) {
+			InputsRead &= ~(1 << FRAG_ATTRIB_WPOS);
+			InputsRead |= 1 << i;
+			compiler->fp->Base->InputsRead = InputsRead;
+			compiler->fp->wpos_attr = i;
+			break;
+		}
+	}
+
+	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
+
+	_mesa_insert_instructions(compiler->program, 0, 3);
+	fpi = compiler->program->Instructions;
+	i = 0;
+
+	/* perspective divide */
+	fpi[i].Opcode = OPCODE_RCP;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_W;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = compiler->fp->wpos_attr;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	fpi[i].Opcode = OPCODE_MUL;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
+	fpi[i].SrcReg[0].Index = compiler->fp->wpos_attr;
+	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[1].Index = tempregi;
+	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
+	i++;
+
+	/* viewport transformation */
+	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
+
+	fpi[i].Opcode = OPCODE_MAD;
+
+	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
+	fpi[i].DstReg.Index = tempregi;
+	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
+	fpi[i].DstReg.CondMask = COND_TR;
+
+	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
+	fpi[i].SrcReg[0].Index = tempregi;
+	fpi[i].SrcReg[0].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[1].Index = window_index;
+	fpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+
+	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
+	fpi[i].SrcReg[2].Index = window_index;
+	fpi[i].SrcReg[2].Swizzle = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
+	i++;
+
+	for (; i < compiler->program->NumInstructions; ++i) {
+		int reg;
+		for (reg = 0; reg < 3; reg++) {
+			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
+			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
+				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
+				fpi[i].SrcReg[reg].Index = tempregi;
+			}
+		}
+	}
+}
+
+
+/**
+ * Rewrite fragment.fogcoord to use a texture coordinate slot.
+ * Note that fogcoord is forced into an X001 pattern, and this enforcement
+ * is done here.
+ *
+ * See also the counterpart rewriting for vertex programs.
+ */
+static void rewriteFog(struct r300_fragment_program_compiler *compiler)
+{
+	struct r300_fragment_program *fp = compiler->fp;
+	GLuint InputsRead;
+	int i;
+
+	InputsRead = fp->Base->InputsRead;
+
+	if (!(InputsRead & FRAG_BIT_FOGC)) {
+		fp->fog_attr = FRAG_ATTRIB_MAX;
+		return;
+	}
+
+	for (i = FRAG_ATTRIB_TEX0; i <= FRAG_ATTRIB_TEX7; ++i)
+	{
+		if (!(InputsRead & (1 << i))) {
+			InputsRead &= ~(1 << FRAG_ATTRIB_FOGC);
+			InputsRead |= 1 << i;
+			fp->Base->InputsRead = InputsRead;
+			fp->fog_attr = i;
+			break;
+		}
+	}
+
+	{
+		struct prog_instruction *inst;
+
+		inst = compiler->program->Instructions;
+		while (inst->Opcode != OPCODE_END) {
+			const int src_regs = _mesa_num_inst_src_regs(inst->Opcode);
+			for (i = 0; i < src_regs; ++i) {
+				if (inst->SrcReg[i].File == PROGRAM_INPUT && inst->SrcReg[i].Index == FRAG_ATTRIB_FOGC) {
+					inst->SrcReg[i].Index = fp->fog_attr;
+					inst->SrcReg[i].Swizzle = combine_swizzles(
+						MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE),
+						inst->SrcReg[i].Swizzle);
+				}
+			}
+			++inst;
+		}
+	}
+}
+
+static GLuint build_dtm(GLuint depthmode)
+{
+	switch(depthmode) {
+	default:
+	case GL_LUMINANCE: return 0;
+	case GL_INTENSITY: return 1;
+	case GL_ALPHA: return 2;
+	}
+}
+
+static GLuint build_func(GLuint comparefunc)
+{
+	return comparefunc - GL_NEVER;
+}
+
+/**
+ * Collect all external state that is relevant for compiling the given
+ * fragment program.
+ */
+static void build_state(
+	r300ContextPtr r300,
+	struct gl_fragment_program *fp,
+	struct r300_fragment_program_external_state *state)
+{
+	int unit;
+
+	_mesa_bzero(state, sizeof(*state));
+
+	for(unit = 0; unit < 16; ++unit) {
+		if (fp->Base.ShadowSamplers & (1 << unit)) {
+			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
+
+			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
+			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
+		}
+	}
+}
+
+static void rewrite_depth_out(struct gl_program *prog)
+{
+	struct prog_instruction *inst;
+
+	for (inst = prog->Instructions; inst->Opcode != OPCODE_END; ++inst) {
+		if (inst->DstReg.File != PROGRAM_OUTPUT || inst->DstReg.Index != FRAG_RESULT_DEPTH)
+			continue;
+
+		if (inst->DstReg.WriteMask & WRITEMASK_Z) {
+			inst->DstReg.WriteMask = WRITEMASK_W;
+		} else {
+			inst->DstReg.WriteMask = 0;
+			continue;
+		}
+
+		switch (inst->Opcode) {
+			case OPCODE_FRC:
+			case OPCODE_MOV:
+				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				break;
+			case OPCODE_ADD:
+			case OPCODE_MAX:
+			case OPCODE_MIN:
+			case OPCODE_MUL:
+				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
+				break;
+			case OPCODE_CMP:
+			case OPCODE_MAD:
+				inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
+				inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
+				inst->SrcReg[2] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[2]);
+				break;
+			default:
+				// Scalar instructions needn't be reswizzled
+				break;
+		}
+	}
+}
+
+void r300TranslateFragmentShader(GLcontext *ctx, struct r300_fragment_program *fp)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program_compiler compiler;
+
+	compiler.r300 = r300;
+	compiler.fp = fp;
+	compiler.code = &fp->code;
+	compiler.program = fp->Base;
+
+	if (RADEON_DEBUG & DEBUG_PIXEL) {
+		fflush(stdout);
+		_mesa_printf("Fragment Program: Initial program:\n");
+		_mesa_print_program(compiler.program);
+		fflush(stdout);
+	}
+
+	insert_WPOS_trailer(&compiler);
+
+	rewriteFog(&compiler);
+
+	rewrite_depth_out(compiler.program);
+
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		struct radeon_program_transformation transformations[] = {
+			{ &r500_transform_TEX, &compiler },
+			{ &radeonTransformALU, 0 },
+			{ &radeonTransformDeriv, 0 },
+			{ &radeonTransformTrigScale, 0 }
+		};
+		radeonLocalTransform(ctx, compiler.program, 4, transformations);
+	} else {
+		struct radeon_program_transformation transformations[] = {
+			{ &r300_transform_TEX, &compiler },
+			{ &radeonTransformALU, 0 },
+			{ &radeonTransformTrigSimple, 0 }
+		};
+		radeonLocalTransform(ctx, compiler.program, 3, transformations);
+	}
+
+	if (RADEON_DEBUG & DEBUG_PIXEL) {
+		_mesa_printf("Fragment Program: After native rewrite:\n");
+		_mesa_print_program(compiler.program);
+		fflush(stdout);
+	}
+
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		struct radeon_nqssadce_descr nqssadce = {
+			.Init = &nqssadce_init,
+			.IsNativeSwizzle = &r500FPIsNativeSwizzle,
+			.BuildSwizzle = &r500FPBuildSwizzle
+		};
+		radeonNqssaDce(ctx, compiler.program, &nqssadce);
+	} else {
+		struct radeon_nqssadce_descr nqssadce = {
+			.Init = &nqssadce_init,
+			.IsNativeSwizzle = &r300FPIsNativeSwizzle,
+			.BuildSwizzle = &r300FPBuildSwizzle
+		};
+		radeonNqssaDce(ctx, compiler.program, &nqssadce);
+	}
+
+	if (RADEON_DEBUG & DEBUG_PIXEL) {
+		_mesa_printf("Compiler: after NqSSA-DCE:\n");
+		_mesa_print_program(compiler.program);
+		fflush(stdout);
+	}
+
+	if (!r300->vtbl.BuildFragmentProgramHwCode(&compiler))
+		fp->error = GL_TRUE;
+
+	fp->translated = GL_TRUE;
+
+	if (fp->error || (RADEON_DEBUG & DEBUG_PIXEL))
+		r300->vtbl.FragmentProgramDump(&fp->code);
+}
+
+struct r300_fragment_program *r300SelectFragmentShader(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_fragment_program_cont *fp_list;
+	struct r300_fragment_program *fp;
+	struct r300_fragment_program_external_state state;
+
+	fp_list = (struct r300_fragment_program_cont *)ctx->FragmentProgram._Current;
+	build_state(r300, ctx->FragmentProgram._Current, &state);
+
+	fp = fp_list->progs;
+	while (fp) {
+		if (_mesa_memcmp(&fp->state, &state, sizeof(state)) == 0) {
+			return r300->selected_fp = fp;
+		}
+		fp = fp->next;
+	}
+
+	fp = _mesa_calloc(sizeof(struct r300_fragment_program));
+
+	fp->state = state;
+	fp->translated = GL_FALSE;
+	fp->Base = _mesa_clone_program(ctx, &ctx->FragmentProgram._Current->Base);
+
+	fp->next = fp_list->progs;
+	fp_list->progs = fp;
+
+	return r300->selected_fp = fp;
+}
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_common.h b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
new file mode 100644
index 00000000000..5e103ee4086
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_common.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_FRAGPROG_COMMON_H_
+#define __R300_FRAGPROG_COMMON_H_
+
+#include "main/mtypes.h"
+
+#include "r300_context.h"
+
+extern void r300TranslateFragmentShader(GLcontext *ctx, struct r300_fragment_program *fp);
+
+struct r300_fragment_program *r300SelectFragmentShader(GLcontext *ctx);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
index 9f0b7e35349..b75656e7ee1 100644
--- a/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/r300_fragprog_emit.c
@@ -47,7 +47,7 @@
 
 #define PROG_CODE \
 	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
-	struct r300_fragment_program_code *code = c->code
+	struct r300_fragment_program_code *code = &c->code->r300
 
 #define error(fmt, args...) do {			\
 		fprintf(stderr, "%s::%s(): " fmt "\n",	\
@@ -66,7 +66,7 @@ static GLboolean emit_const(void* data, GLuint file, GLuint index, GLuint *hwind
 	}
 
 	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
+		if (*hwindex >= R300_PFS_NUM_CONST_REGS) {
 			error("Out of hw constants!\n");
 			return GL_FALSE;
 		}
@@ -138,7 +138,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->alu.length >= PFS_MAX_ALU_INST) {
+	if (code->alu.length >= R300_PFS_MAX_ALU_INST) {
 		error("Too many ALU instructions");
 		return GL_FALSE;
 	}
@@ -201,7 +201,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
 	if (inst->Alpha.DepthWriteMask) {
 		code->alu.inst[ip].inst3 |= R300_ALU_DSTA_DEPTH;
 		code->node[code->cur_node].flags |= R300_W_OUT;
-		c->fp->WritesDepth = GL_TRUE;
+		c->fp->writes_depth = GL_TRUE;
 	}
 
 	return GL_TRUE;
@@ -213,7 +213,7 @@ static GLboolean emit_alu(void* data, struct radeon_pair_instruction* inst)
  */
 static GLboolean finish_node(struct r300_fragment_program_compiler *c)
 {
-	struct r300_fragment_program_code *code = c->code;
+	struct r300_fragment_program_code *code = &c->code->r300;
 	struct r300_fragment_program_node *node = &code->node[code->cur_node];
 
 	if (node->alu_end < 0) {
@@ -275,7 +275,7 @@ static GLboolean emit_tex(void* data, struct prog_instruction* inst)
 {
 	PROG_CODE;
 
-	if (code->tex.length >= PFS_MAX_TEX_INST) {
+	if (code->tex.length >= R300_PFS_MAX_TEX_INST) {
 		error("Too many TEX instructions");
 		return GL_FALSE;
 	}
@@ -318,16 +318,16 @@ static const struct radeon_pair_handler pair_handler = {
 	.EmitPaired = &emit_alu,
 	.EmitTex = &emit_tex,
 	.BeginTexBlock = &begin_tex,
-	.MaxHwTemps = PFS_NUM_TEMP_REGS
+	.MaxHwTemps = R300_PFS_NUM_TEMP_REGS
 };
 
 /**
  * Final compilation step: Turn the intermediate radeon_program into
  * machine-readable instructions.
  */
-GLboolean r300FragmentProgramEmit(struct r300_fragment_program_compiler *compiler)
+GLboolean r300BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r300_fragment_program_code *code = compiler->code;
+	struct r300_fragment_program_code *code = &compiler->code->r300;
 
 	_mesa_bzero(code, sizeof(struct r300_fragment_program_code));
 	code->node[0].alu_end = -1;
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.c b/src/mesa/drivers/dri/r300/r300_ioctl.c
index ee85e229f0b..ddabd539925 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.c
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.c
@@ -46,8 +46,9 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "swrast/swrast.h"
 
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "r300_context.h"
-#include "radeon_ioctl.h"
 #include "r300_ioctl.h"
 #include "r300_cmdbuf.h"
 #include "r300_state.h"
@@ -55,71 +56,85 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_reg.h"
 #include "r300_emit.h"
 #include "r300_fragprog.h"
+#include "r300_context.h"
 
 #include "vblank.h"
 
+#define R200_3D_DRAW_IMMD_2      0xC0003500
+
 #define CLEARBUFFER_COLOR	0x1
 #define CLEARBUFFER_DEPTH	0x2
 #define CLEARBUFFER_STENCIL	0x4
 
-static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
+static void r300EmitClearState(GLcontext * ctx);
+
+static void r300ClearBuffer(r300ContextPtr r300, int flags,
+			    struct radeon_renderbuffer *rrb,
+			    struct radeon_renderbuffer *rrbd)
 {
+	BATCH_LOCALS(&r300->radeon);
 	GLcontext *ctx = r300->radeon.glCtx;
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
-	GLuint cboffset, cbpitch;
-	drm_r300_cmd_header_t *cmd2;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	GLuint cbpitch = 0;
 	r300ContextPtr rmesa = r300;
 
 	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s: %s buffer (%i,%i %ix%i)\n",
-			__FUNCTION__, buffer ? "back" : "front",
-			dPriv->x, dPriv->y, dPriv->w, dPriv->h);
-
-	if (buffer) {
-		cboffset = r300->radeon.radeonScreen->backOffset;
-		cbpitch = r300->radeon.radeonScreen->backPitch;
-	} else {
-		cboffset = r300->radeon.radeonScreen->frontOffset;
-		cbpitch = r300->radeon.radeonScreen->frontPitch;
+		fprintf(stderr, "%s: buffer %p (%i,%i %ix%i)\n",
+			__FUNCTION__, rrb, dPriv->x, dPriv->y,
+			dPriv->w, dPriv->h);
+
+	if (rrb) {
+		cbpitch = (rrb->pitch / rrb->cpp);
+		if (rrb->cpp == 4)
+			cbpitch |= R300_COLOR_FORMAT_ARGB8888;
+		else
+			cbpitch |= R300_COLOR_FORMAT_RGB565;
+
+		if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+			cbpitch |= R300_COLOR_TILE_ENABLE;
+        }
 	}
 
-	cboffset += r300->radeon.radeonScreen->fbLocation;
-
-	cp_wait(r300, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
-	end_3d(rmesa);
-
-	R300_STATECHANGE(r300, cb);
-	reg_start(R300_RB3D_COLOROFFSET0, 0);
-	e32(cboffset);
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		cbpitch |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		cbpitch |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		cbpitch |= R300_COLOR_TILE_ENABLE;
-
-	reg_start(R300_RB3D_COLORPITCH0, 0);
-	e32(cbpitch);
-
-	R300_STATECHANGE(r300, cmk);
-	reg_start(RB3D_COLOR_CHANNEL_MASK, 0);
+	/* TODO in bufmgr */
+	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	end_3d(&rmesa->radeon);
 
 	if (flags & CLEARBUFFER_COLOR) {
-		e32((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
-		    (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
-		    (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
-		    (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
+		assert(rrb != 0);
+		BEGIN_BATCH_NO_AUTOSTATE(6);
+		OUT_BATCH_REGSEQ(R300_RB3D_COLOROFFSET0, 1);
+		OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+		OUT_BATCH_REGVAL(R300_RB3D_COLORPITCH0, cbpitch);
+		END_BATCH();
+	}
+#if 1
+	if (flags & (CLEARBUFFER_DEPTH | CLEARBUFFER_STENCIL)) {
+		assert(rrbd != 0);
+		cbpitch = (rrbd->pitch / rrbd->cpp);
+		if (rrbd->bo->flags & RADEON_BO_FLAGS_MACRO_TILE){
+			cbpitch |= R300_DEPTHMACROTILE_ENABLE;
+        }
+		if (rrbd->bo->flags & RADEON_BO_FLAGS_MICRO_TILE){
+            cbpitch |= R300_DEPTHMICROTILE_TILED;
+        }
+		BEGIN_BATCH_NO_AUTOSTATE(6);
+		OUT_BATCH_REGSEQ(R300_ZB_DEPTHOFFSET, 1);
+		OUT_BATCH_RELOC(0, rrbd->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+		OUT_BATCH_REGVAL(R300_ZB_DEPTHPITCH, cbpitch);
+		END_BATCH();
+	}
+#endif
+	BEGIN_BATCH_NO_AUTOSTATE(6);
+	OUT_BATCH_REGSEQ(RB3D_COLOR_CHANNEL_MASK, 1);
+	if (flags & CLEARBUFFER_COLOR) {
+		OUT_BATCH((ctx->Color.ColorMask[BCOMP] ? RB3D_COLOR_CHANNEL_MASK_BLUE_MASK0 : 0) |
+			  (ctx->Color.ColorMask[GCOMP] ? RB3D_COLOR_CHANNEL_MASK_GREEN_MASK0 : 0) |
+			  (ctx->Color.ColorMask[RCOMP] ? RB3D_COLOR_CHANNEL_MASK_RED_MASK0 : 0) |
+			  (ctx->Color.ColorMask[ACOMP] ? RB3D_COLOR_CHANNEL_MASK_ALPHA_MASK0 : 0));
 	} else {
-		e32(0x0);
+		OUT_BATCH(0);
 	}
 
-	R300_STATECHANGE(r300, zs);
-	reg_start(R300_ZB_CNTL, 2);
 
 	{
 		uint32_t t1, t2;
@@ -146,73 +161,92 @@ static void r300ClearBuffer(r300ContextPtr r300, int flags, int buffer)
 			     R300_S_FRONT_ZFAIL_OP_SHIFT);
 		}
 
-		e32(t1);
-		e32(t2);
-		e32(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) << R300_STENCILWRITEMASK_SHIFT) |
-		    (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+		OUT_BATCH_REGSEQ(R300_ZB_CNTL, 3);
+		OUT_BATCH(t1);
+		OUT_BATCH(t2);
+		OUT_BATCH(((ctx->Stencil.WriteMask[0] & R300_STENCILREF_MASK) <<
+                   R300_STENCILWRITEMASK_SHIFT) |
+			  (ctx->Stencil.Clear & R300_STENCILREF_MASK));
+		END_BATCH();
 	}
 
-	cmd2 = (drm_r300_cmd_header_t *) r300AllocCmdBuf(r300, 9, __FUNCTION__);
-	cmd2[0].packet3.cmd_type = R300_CMD_PACKET3;
-	cmd2[0].packet3.packet = R300_CMD_PACKET3_CLEAR;
-	cmd2[1].u = r300PackFloat32(dPriv->w / 2.0);
-	cmd2[2].u = r300PackFloat32(dPriv->h / 2.0);
-	cmd2[3].u = r300PackFloat32(ctx->Depth.Clear);
-	cmd2[4].u = r300PackFloat32(1.0);
-	cmd2[5].u = r300PackFloat32(ctx->Color.ClearColor[0]);
-	cmd2[6].u = r300PackFloat32(ctx->Color.ClearColor[1]);
-	cmd2[7].u = r300PackFloat32(ctx->Color.ClearColor[2]);
-	cmd2[8].u = r300PackFloat32(ctx->Color.ClearColor[3]);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH_NO_AUTOSTATE(9);
+		OUT_BATCH(cmdpacket3(r300->radeon.radeonScreen, R300_CMD_PACKET3_CLEAR));
+		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+		OUT_BATCH_FLOAT32(1.0);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+		END_BATCH();
+	} else {
+		OUT_BATCH(CP_PACKET3(R200_3D_DRAW_IMMD_2, 8));
+		OUT_BATCH(R300_PRIM_TYPE_POINT | R300_PRIM_WALK_RING |
+			  (1 << R300_PRIM_NUM_VERTICES_SHIFT));
+		OUT_BATCH_FLOAT32(dPriv->w / 2.0);
+		OUT_BATCH_FLOAT32(dPriv->h / 2.0);
+		OUT_BATCH_FLOAT32(ctx->Depth.Clear);
+		OUT_BATCH_FLOAT32(1.0);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[0]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[1]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[2]);
+		OUT_BATCH_FLOAT32(ctx->Color.ClearColor[3]);
+	}
 
 	r300EmitCacheFlush(rmesa);
-	cp_wait(rmesa, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+	cp_wait(&r300->radeon, R300_WAIT_3D | R300_WAIT_3D_CLEAN);
+
+	R300_STATECHANGE(r300, cb);
+	R300_STATECHANGE(r300, cmk);
+	R300_STATECHANGE(r300, zs);
 }
 
 static void r300EmitClearState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	r300ContextPtr rmesa = r300;
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
+	BATCH_LOCALS(&r300->radeon);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
-	int has_tcl = 1;
+	int has_tcl;
 	int is_r500 = 0;
 	GLuint vap_cntl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
-
-        if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-                is_r500 = 1;
+	has_tcl = r300->options.hw_tcl_enabled;
 
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+		is_r500 = 1;
 
-	/* FIXME: the values written to R300_VAP_INPUT_ROUTE_0_0 and
-	 * R300_VAP_INPUT_ROUTE_0_1 are in fact known, however, the values are
-	 * quite complex; see the functions in r300_emit.c.
+	/* State atom dirty tracking is a little subtle here.
 	 *
-	 * I believe it would be a good idea to extend the functions in
-	 * r300_emit.c so that they can be used to setup the default values for
-	 * these registers, as well as the actual values used for rendering.
+	 * On the one hand, we need to make sure base state is emitted
+	 * here if we start with an empty batch buffer, otherwise clear
+	 * works incorrectly with multiple processes. Therefore, the first
+	 * BEGIN_BATCH cannot be a BEGIN_BATCH_NO_AUTOSTATE.
+	 *
+	 * On the other hand, implicit state emission clears the state atom
+	 * dirty bits, so we have to call R300_STATECHANGE later than the
+	 * first BEGIN_BATCH.
+	 *
+	 * The final trickiness is that, because we change state, we need
+	 * to ensure that any stored swtcl primitives are flushed properly
+	 * before we start changing state. See the R300_NEWPRIM in r300Clear
+	 * for this.
 	 */
-	R300_STATECHANGE(r300, vir[0]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_0, 0);
+	BEGIN_BATCH(31);
+	OUT_BATCH_REGSEQ(R300_VAP_PROG_STREAM_CNTL_0, 1);
 	if (!has_tcl)
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (2 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 	else
-	    e32(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
+		OUT_BATCH(((((0 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_0_SHIFT) |
 		 ((R300_LAST_VEC | (1 << R300_DST_VEC_LOC_SHIFT) | R300_DATA_TYPE_FLOAT_4) << R300_DATA_TYPE_1_SHIFT)));
 
-	/* disable fog */
-	R300_STATECHANGE(r300, fogs);
-	reg_start(R300_FG_FOG_BLEND, 0);
-	e32(0x0);
-
-	R300_STATECHANGE(r300, vir[1]);
-	reg_start(R300_VAP_PROG_STREAM_CNTL_EXT_0, 0);
-	e32(((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
+	OUT_BATCH_REGVAL(R300_FG_FOG_BLEND, 0);
+	OUT_BATCH_REGVAL(R300_VAP_PROG_STREAM_CNTL_EXT_0,
+	   ((((R300_SWIZZLE_SELECT_X << R300_SWIZZLE_SELECT_X_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Y << R300_SWIZZLE_SELECT_Y_SHIFT) |
 	       (R300_SWIZZLE_SELECT_Z << R300_SWIZZLE_SELECT_Z_SHIFT) |
 	       (R300_SWIZZLE_SELECT_W << R300_SWIZZLE_SELECT_W_SHIFT) |
@@ -226,619 +260,442 @@ static void r300EmitClearState(GLcontext * ctx)
 	      << R300_SWIZZLE1_SHIFT)));
 
 	/* R300_VAP_INPUT_CNTL_0, R300_VAP_INPUT_CNTL_1 */
-	R300_STATECHANGE(r300, vic);
-	reg_start(R300_VAP_VTX_STATE_CNTL, 1);
-	e32((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
-	e32(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
+	OUT_BATCH_REGSEQ(R300_VAP_VTX_STATE_CNTL, 2);
+	OUT_BATCH((R300_SEL_USER_COLOR_0 << R300_COLOR_0_ASSEMBLY_SHIFT));
+	OUT_BATCH(R300_INPUT_CNTL_POS | R300_INPUT_CNTL_COLOR | R300_INPUT_CNTL_TC0);
 
-	R300_STATECHANGE(r300, vte);
 	/* comes from fglrx startup of clear */
-	reg_start(R300_SE_VTE_CNTL, 1);
-	e32(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
-	    R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
-	    R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
-	    R300_VPORT_Z_OFFSET_ENA);
-	e32(0x8);
+	OUT_BATCH_REGSEQ(R300_SE_VTE_CNTL, 2);
+	OUT_BATCH(R300_VTX_W0_FMT | R300_VPORT_X_SCALE_ENA |
+		  R300_VPORT_X_OFFSET_ENA | R300_VPORT_Y_SCALE_ENA |
+		  R300_VPORT_Y_OFFSET_ENA | R300_VPORT_Z_SCALE_ENA |
+		  R300_VPORT_Z_OFFSET_ENA);
+	OUT_BATCH(0x8);
 
-	reg_start(R300_VAP_PSC_SGN_NORM_CNTL, 0);
-	e32(0xaaaaaaaa);
+	OUT_BATCH_REGVAL(R300_VAP_PSC_SGN_NORM_CNTL, 0xaaaaaaaa);
 
-	R300_STATECHANGE(r300, vof);
-	reg_start(R300_VAP_OUTPUT_VTX_FMT_0, 1);
-	e32(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
-	    R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
-	e32(0x0);		/* no textures */
+	OUT_BATCH_REGSEQ(R300_VAP_OUTPUT_VTX_FMT_0, 2);
+	OUT_BATCH(R300_VAP_OUTPUT_VTX_FMT_0__POS_PRESENT |
+		  R300_VAP_OUTPUT_VTX_FMT_0__COLOR_0_PRESENT);
+	OUT_BATCH(0); /* no textures */
 
-	R300_STATECHANGE(r300, txe);
-	reg_start(R300_TX_ENABLE, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_TX_ENABLE, 0);
 
-	R300_STATECHANGE(r300, vpt);
-	reg_start(R300_SE_VPORT_XSCALE, 5);
-	efloat(1.0);
-	efloat(dPriv->x);
-	efloat(1.0);
-	efloat(dPriv->y);
-	efloat(1.0);
-	efloat(0.0);
+	OUT_BATCH_REGSEQ(R300_SE_VPORT_XSCALE, 6);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->x);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(dPriv->y);
+	OUT_BATCH_FLOAT32(1.0);
+	OUT_BATCH_FLOAT32(0.0);
 
-	R300_STATECHANGE(r300, at);
-	reg_start(R300_FG_ALPHA_FUNC, 0);
-	e32(0x0);
+	OUT_BATCH_REGVAL(R300_FG_ALPHA_FUNC, 0);
 
+	OUT_BATCH_REGSEQ(R300_RB3D_CBLEND, 2);
+	OUT_BATCH(0x0);
+	OUT_BATCH(0x0);
+	END_BATCH();
+
+	R300_STATECHANGE(r300, vir[0]);
+	R300_STATECHANGE(r300, fogs);
+	R300_STATECHANGE(r300, vir[1]);
+	R300_STATECHANGE(r300, vic);
+	R300_STATECHANGE(r300, vte);
+	R300_STATECHANGE(r300, vof);
+	R300_STATECHANGE(r300, txe);
+	R300_STATECHANGE(r300, vpt);
+	R300_STATECHANGE(r300, at);
 	R300_STATECHANGE(r300, bld);
-	reg_start(R300_RB3D_CBLEND, 1);
-	e32(0x0);
-	e32(0x0);
+	R300_STATECHANGE(r300, ps);
 
 	if (has_tcl) {
-	    R300_STATECHANGE(r300, vap_clip_cntl);
-	    reg_start(R300_VAP_CLIP_CNTL, 0);
-	    e32(R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		R300_STATECHANGE(r300, vap_clip_cntl);
+
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		OUT_BATCH_REGVAL(R300_VAP_CLIP_CNTL, R300_PS_UCP_MODE_CLIP_AS_TRIFAN | R300_CLIP_DISABLE);
+		END_BATCH();
         }
 
-	R300_STATECHANGE(r300, ps);
-	reg_start(R300_GA_POINT_SIZE, 0);
-	e32(((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
-	    ((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	BEGIN_BATCH_NO_AUTOSTATE(2);
+	OUT_BATCH_REGVAL(R300_GA_POINT_SIZE,
+		((dPriv->w * 6) << R300_POINTSIZE_X_SHIFT) |
+		((dPriv->h * 6) << R300_POINTSIZE_Y_SHIFT));
+	END_BATCH();
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R300_RS_IP_0, 7);
-		for (i = 0; i < 8; ++i) {
-			e32(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
-		}
-
 		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
 		R300_STATECHANGE(r300, rr);
-		reg_start(R300_RS_INST_0, 0);
-		e32(R300_RS_INST_COL_CN_WRITE);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R300_RS_IP_0, 8);
+		for (i = 0; i < 8; ++i)
+			OUT_BATCH(R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3));
+
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
+
+		OUT_BATCH_REGVAL(R300_RS_INST_0, R300_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	} else {
 		R300_STATECHANGE(r300, ri);
-		reg_start(R500_RS_IP_0, 7);
+		R300_STATECHANGE(r300, rc);
+		R300_STATECHANGE(r300, rr);
+
+		BEGIN_BATCH(14);
+		OUT_BATCH_REGSEQ(R500_RS_IP_0, 8);
 		for (i = 0; i < 8; ++i) {
-			e32((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
-			    (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
-			    (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
+			OUT_BATCH((R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT) |
+				  (R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT) |
+				  (R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT));
 		}
 
-		R300_STATECHANGE(r300, rc);
-		/* The second constant is needed to get glxgears display anything .. */
-		reg_start(R300_RS_COUNT, 1);
-		e32((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
-		e32(0x0);
-
-		R300_STATECHANGE(r300, rr);
-		reg_start(R500_RS_INST_0, 0);
-		e32(R500_RS_INST_COL_CN_WRITE);
+		OUT_BATCH_REGSEQ(R300_RS_COUNT, 2);
+		OUT_BATCH((1 << R300_IC_COUNT_SHIFT) | R300_HIRES_EN);
+		OUT_BATCH(0x0);
 
+		OUT_BATCH_REGVAL(R500_RS_INST_0, R500_RS_INST_COL_CN_WRITE);
+		END_BATCH();
 	}
 
 	if (!is_r500) {
 		R300_STATECHANGE(r300, fp);
-		reg_start(R300_US_CONFIG, 2);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		reg_start(R300_US_CODE_ADDR_0, 3);
-		e32(0x0);
-		e32(0x0);
-		e32(0x0);
-		e32(R300_RGBA_OUT);
-
 		R300_STATECHANGE(r300, fpi[0]);
 		R300_STATECHANGE(r300, fpi[1]);
 		R300_STATECHANGE(r300, fpi[2]);
 		R300_STATECHANGE(r300, fpi[3]);
 
-		reg_start(R300_US_ALU_RGB_INST_0, 0);
-		e32(FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
-
-		reg_start(R300_US_ALU_RGB_ADDR_0, 0);
-		e32(FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
-
-		reg_start(R300_US_ALU_ALPHA_INST_0, 0);
-		e32(FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
-
-		reg_start(R300_US_ALU_ALPHA_ADDR_0, 0);
-		e32(FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		BEGIN_BATCH(17);
+		OUT_BATCH_REGSEQ(R300_US_CONFIG, 3);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R300_US_CODE_ADDR_0, 4);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(0x0);
+		OUT_BATCH(R300_RGBA_OUT);
+
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_INST_0,
+			FP_INSTRC(MAD, FP_ARGC(SRC0C_XYZ), FP_ARGC(ONE), FP_ARGC(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_RGB_ADDR_0,
+			FP_SELC(0, NO, XYZ, FP_TMP(0), 0, 0));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_INST_0,
+			FP_INSTRA(MAD, FP_ARGA(SRC0A), FP_ARGA(ONE), FP_ARGA(ZERO)));
+		OUT_BATCH_REGVAL(R300_US_ALU_ALPHA_ADDR_0,
+			FP_SELA(0, NO, W, FP_TMP(0), 0, 0));
+		END_BATCH();
 	} else {
- 		R300_STATECHANGE(r300, fp);
- 		reg_start(R500_US_CONFIG, 1);
- 		e32(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
- 		e32(0x0);
- 		reg_start(R500_US_CODE_ADDR, 2);
- 		e32(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
- 		e32(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
- 		e32(R500_US_CODE_OFFSET_ADDR(0));
+		struct radeon_state_atom r500fp;
+		uint32_t _cmd[10];
 
+		R300_STATECHANGE(r300, fp);
 		R300_STATECHANGE(r300, r500fp);
-		r500fp_start_fragment(0, 6);
-
-		e32(R500_INST_TYPE_OUT |
-		    R500_INST_TEX_SEM_WAIT |
-		    R500_INST_LAST |
-		    R500_INST_RGB_OMASK_R |
-		    R500_INST_RGB_OMASK_G |
-		    R500_INST_RGB_OMASK_B |
-		    R500_INST_ALPHA_OMASK |
-		    R500_INST_RGB_CLAMP |
-		    R500_INST_ALPHA_CLAMP);
-
-		e32(R500_RGB_ADDR0(0) |
-		    R500_RGB_ADDR1(0) |
-		    R500_RGB_ADDR1_CONST |
-		    R500_RGB_ADDR2(0) |
-		    R500_RGB_ADDR2_CONST);
-
-		e32(R500_ALPHA_ADDR0(0) |
-		    R500_ALPHA_ADDR1(0) |
-		    R500_ALPHA_ADDR1_CONST |
-		    R500_ALPHA_ADDR2(0) |
-		    R500_ALPHA_ADDR2_CONST);
-
-		e32(R500_ALU_RGB_SEL_A_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_A_R |
-		    R500_ALU_RGB_G_SWIZ_A_G |
-		    R500_ALU_RGB_B_SWIZ_A_B |
-		    R500_ALU_RGB_SEL_B_SRC0 |
-		    R500_ALU_RGB_R_SWIZ_B_R |
-		    R500_ALU_RGB_B_SWIZ_B_G |
-		    R500_ALU_RGB_G_SWIZ_B_B);
-
-		e32(R500_ALPHA_OP_CMP |
-		    R500_ALPHA_SWIZ_A_A |
-		    R500_ALPHA_SWIZ_B_A);
-
-		e32(R500_ALU_RGBA_OP_CMP |
-		    R500_ALU_RGBA_R_SWIZ_0 |
-		    R500_ALU_RGBA_G_SWIZ_0 |
-		    R500_ALU_RGBA_B_SWIZ_0 |
-		    R500_ALU_RGBA_A_SWIZ_0);
+
+		BEGIN_BATCH(7);
+		OUT_BATCH_REGSEQ(R500_US_CONFIG, 2);
+		OUT_BATCH(R500_ZERO_TIMES_ANYTHING_EQUALS_ZERO);
+		OUT_BATCH(0x0);
+		OUT_BATCH_REGSEQ(R500_US_CODE_ADDR, 3);
+		OUT_BATCH(R500_US_CODE_START_ADDR(0) | R500_US_CODE_END_ADDR(1));
+		OUT_BATCH(R500_US_CODE_RANGE_ADDR(0) | R500_US_CODE_RANGE_SIZE(1));
+		OUT_BATCH(R500_US_CODE_OFFSET_ADDR(0));
+		END_BATCH();
+
+		r500fp.check = check_r500fp;
+		r500fp.cmd = _cmd;
+		r500fp.cmd[0] = cmdr500fp(r300->radeon.radeonScreen, 0, 1, 0, 0);
+		r500fp.cmd[1] = R500_INST_TYPE_OUT |
+			R500_INST_TEX_SEM_WAIT |
+			R500_INST_LAST |
+			R500_INST_RGB_OMASK_R |
+			R500_INST_RGB_OMASK_G |
+			R500_INST_RGB_OMASK_B |
+			R500_INST_ALPHA_OMASK |
+			R500_INST_RGB_CLAMP |
+			R500_INST_ALPHA_CLAMP;
+		r500fp.cmd[2] = R500_RGB_ADDR0(0) |
+			R500_RGB_ADDR1(0) |
+			R500_RGB_ADDR1_CONST |
+			R500_RGB_ADDR2(0) |
+			R500_RGB_ADDR2_CONST;
+		r500fp.cmd[3] = R500_ALPHA_ADDR0(0) |
+			R500_ALPHA_ADDR1(0) |
+			R500_ALPHA_ADDR1_CONST |
+			R500_ALPHA_ADDR2(0) |
+			R500_ALPHA_ADDR2_CONST;
+		r500fp.cmd[4] = R500_ALU_RGB_SEL_A_SRC0 |
+			R500_ALU_RGB_R_SWIZ_A_R |
+			R500_ALU_RGB_G_SWIZ_A_G |
+			R500_ALU_RGB_B_SWIZ_A_B |
+			R500_ALU_RGB_SEL_B_SRC0 |
+			R500_ALU_RGB_R_SWIZ_B_R |
+			R500_ALU_RGB_B_SWIZ_B_G |
+			R500_ALU_RGB_G_SWIZ_B_B;
+		r500fp.cmd[5] = R500_ALPHA_OP_CMP |
+			R500_ALPHA_SWIZ_A_A |
+			R500_ALPHA_SWIZ_B_A;
+		r500fp.cmd[6] = R500_ALU_RGBA_OP_CMP |
+			R500_ALU_RGBA_R_SWIZ_0 |
+			R500_ALU_RGBA_G_SWIZ_0 |
+			R500_ALU_RGBA_B_SWIZ_0 |
+			R500_ALU_RGBA_A_SWIZ_0;
+
+		r500fp.cmd[7] = 0;
+		emit_r500fp(ctx, &r500fp);
 	}
 
-	reg_start(R300_VAP_PVS_STATE_FLUSH_REG, 0);
-	e32(0x00000000);
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_PVS_STATE_FLUSH_REG, 0);
+	END_BATCH();
+
 	if (has_tcl) {
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(12 << R300_VF_MAX_VTX_NUM_SHIFT));
-	    if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
-	} else
-	    vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
+		if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
+			vap_cntl |= R500_TCL_STATE_OPTIMIZATION;
+	} else {
+		vap_cntl = ((10 << R300_PVS_NUM_SLOTS_SHIFT) |
 			(5 << R300_PVS_NUM_CNTLRS_SHIFT) |
 			(5 << R300_VF_MAX_VTX_NUM_SHIFT));
+	}
 
 	if (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV515)
-	    vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (2 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV530) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV560) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV570))
-	    vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (5 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV410) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R420))
-	    vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (6 << R300_PVS_NUM_FPUS_SHIFT);
 	else if ((r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R520) ||
 		 (r300->radeon.radeonScreen->chip_family == CHIP_FAMILY_R580))
-	    vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (8 << R300_PVS_NUM_FPUS_SHIFT);
 	else
-	    vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
+		vap_cntl |= (4 << R300_PVS_NUM_FPUS_SHIFT);
 
-	R300_STATECHANGE(rmesa, vap_cntl);
-	reg_start(R300_VAP_CNTL, 0);
-	e32(vap_cntl);
+	R300_STATECHANGE(r300, vap_cntl);
+
+	BEGIN_BATCH(2);
+	OUT_BATCH_REGVAL(R300_VAP_CNTL, vap_cntl);
+	END_BATCH();
 
 	if (has_tcl) {
+        struct radeon_state_atom vpu;
+        uint32_t _cmd[10];
 		R300_STATECHANGE(r300, pvs);
-		reg_start(R300_VAP_PVS_CODE_CNTL_0, 2);
-
-		e32((0 << R300_PVS_FIRST_INST_SHIFT) |
-		    (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
-		    (1 << R300_PVS_LAST_INST_SHIFT));
-		e32((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-		    (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
-		e32(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-
 		R300_STATECHANGE(r300, vpi);
-		vsf_start_fragment(0x0, 8);
-
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 0, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
 
-		e32(PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf, PVS_DST_REG_OUT));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_SELECT_FORCE_0, PVS_SRC_REG_INPUT, VSF_FLAG_NONE));
-		e32(0x0);
+		BEGIN_BATCH(4);
+		OUT_BATCH_REGSEQ(R300_VAP_PVS_CODE_CNTL_0, 3);
+		OUT_BATCH((0 << R300_PVS_FIRST_INST_SHIFT) |
+			  (0 << R300_PVS_XYZW_VALID_INST_SHIFT) |
+			  (1 << R300_PVS_LAST_INST_SHIFT));
+		OUT_BATCH((0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
+			  (0 << R300_PVS_MAX_CONST_ADDR_SHIFT));
+		OUT_BATCH(1 << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
+		END_BATCH();
+
+		vpu.check = check_vpu;
+		vpu.cmd = _cmd;
+		vpu.cmd[0] = cmdvpu(r300->radeon.radeonScreen, 0, 2);
+
+		vpu.cmd[1] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE,
+                                         0, 0xf, PVS_DST_REG_OUT);
+		vpu.cmd[2] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y,
+                                      PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+		vpu.cmd[3] = PVS_SRC_OPERAND(0, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+		vpu.cmd[4] = 0x0;
+
+		vpu.cmd[5] = PVS_OP_DST_OPERAND(VE_ADD, GL_FALSE, GL_FALSE, 1, 0xf,
+                                         PVS_DST_REG_OUT);
+		vpu.cmd[6] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_X,
+                                      PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z,
+                                      PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT,
+
+                                      VSF_FLAG_NONE);
+		vpu.cmd[7] = PVS_SRC_OPERAND(1, PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_SELECT_FORCE_0,
+                                      PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
+		vpu.cmd[8] = 0x0;
+
+		r300->vap_flush_needed = GL_TRUE;
+		emit_vpu(ctx, &vpu);
 	}
 }
 
-/**
- * Buffer clear
- */
-static void r300Clear(GLcontext * ctx, GLbitfield mask)
+static int r300KernelClear(GLcontext *ctx, GLuint flags)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = r300->radeon.dri.drawable;
-	int flags = 0;
-	int bits = 0;
-	int swapped;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
+	struct radeon_renderbuffer *rrb;
+	struct radeon_renderbuffer *rrbd;
+	int bits = 0, ret;
 
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "r300Clear\n");
+	/* Make sure it fits there. */
+	radeon_cs_space_reset_bos(r300->radeon.cmdbuf.cs);
+	
+	if (flags & BUFFER_BIT_COLOR0) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
+	}
 
-	{
-		LOCK_HARDWARE(&r300->radeon);
-		UNLOCK_HARDWARE(&r300->radeon);
-		if (dPriv->numClipRects == 0)
-			return;
+	if (flags & BUFFER_BIT_FRONT_LEFT) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if (mask & BUFFER_BIT_FRONT_LEFT) {
-		flags |= BUFFER_BIT_FRONT_LEFT;
-		mask &= ~BUFFER_BIT_FRONT_LEFT;
+	if (flags & BUFFER_BIT_BACK_LEFT) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if (mask & BUFFER_BIT_BACK_LEFT) {
-		flags |= BUFFER_BIT_BACK_LEFT;
-		mask &= ~BUFFER_BIT_BACK_LEFT;
+	rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+	if (rrbd) {
+		radeon_cs_space_add_persistent_bo(r300->radeon.cmdbuf.cs,
+						  rrbd->bo, 0, RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if (mask & BUFFER_BIT_DEPTH) {
+	ret = radeon_cs_space_check(r300->radeon.cmdbuf.cs);
+	if (ret)
+	  return -1;
+
+	rcommonEnsureCmdBufSpace(&r300->radeon, 421 * 3, __FUNCTION__);
+	if (flags || bits)
+		r300EmitClearState(ctx);
+
+	rrbd = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+	if (rrbd && (flags & BUFFER_BIT_DEPTH))
 		bits |= CLEARBUFFER_DEPTH;
-		mask &= ~BUFFER_BIT_DEPTH;
-	}
 
-	if ((mask & BUFFER_BIT_STENCIL) && r300->state.stencil.hw_stencil) {
+	if (rrbd && (flags & BUFFER_BIT_STENCIL))
 		bits |= CLEARBUFFER_STENCIL;
-		mask &= ~BUFFER_BIT_STENCIL;
-	}
 
-	if (mask) {
-		if (RADEON_DEBUG & DEBUG_FALLBACKS)
-			fprintf(stderr, "%s: swrast clear, mask: %x\n",
-				__FUNCTION__, mask);
-		_swrast_Clear(ctx, mask);
+	if (flags & BUFFER_BIT_COLOR0) {
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_COLOR0);
+		r300ClearBuffer(r300, CLEARBUFFER_COLOR, rrb, NULL);
+		bits = 0;
 	}
 
-	swapped = r300->radeon.sarea->pfCurrentPage == 1;
-
-	/* Make sure it fits there. */
-	r300EnsureCmdBufSpace(r300, 421 * 3, __FUNCTION__);
-	if (flags || bits)
-		r300EmitClearState(ctx);
-
 	if (flags & BUFFER_BIT_FRONT_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped);
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT);
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
 		bits = 0;
 	}
 
 	if (flags & BUFFER_BIT_BACK_LEFT) {
-		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, swapped ^ 1);
+		rrb = radeon_get_renderbuffer(&rfb->base, BUFFER_BACK_LEFT);
+		r300ClearBuffer(r300, bits | CLEARBUFFER_COLOR, rrb, rrbd);
 		bits = 0;
 	}
 
 	if (bits)
-		r300ClearBuffer(r300, bits, 0);
+		r300ClearBuffer(r300, bits, NULL, rrbd);
 
+	COMMIT_BATCH();
+	return 0;
 }
 
-void r300Flush(GLcontext * ctx)
+/**
+ * Buffer clear
+ */
+static void r300Clear(GLcontext * ctx, GLbitfield mask)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&r300->radeon);
+	const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+	GLbitfield swrast_mask = 0, tri_mask = 0;
+	int i, ret;
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
 
 	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush( rmesa );
-
-	if (rmesa->cmdbuf.count_used > rmesa->cmdbuf.count_reemit)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-}
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-
-void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size)
-{
-	struct r300_dma_buffer *dmabuf;
-	size = MAX2(size, RADEON_BUFFER_SIZE * 16);
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush) {
-		rmesa->dma.flush(rmesa);
-	}
+		fprintf(stderr, "r300Clear\n");
 
-	if (rmesa->dma.current.buf) {
-#ifdef USER_BUFFERS
-		r300_mem_use(rmesa, rmesa->dma.current.buf->id);
-#endif
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
+	if (!r300->radeon.radeonScreen->driScreen->dri2.enabled) {
+		LOCK_HARDWARE(&r300->radeon);
+		UNLOCK_HARDWARE(&r300->radeon);
+		if (dPriv->numClipRects == 0)
+			return;
 	}
-	if (rmesa->dma.nr_released_bufs > 4)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = (void *)1;	/* hack */
-	dmabuf->refcount = 1;
 
-	dmabuf->id = r300_mem_alloc(rmesa, 4, size);
-	if (dmabuf->id == 0) {
-		LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-		r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		radeonWaitForIdleLocked(&rmesa->radeon);
+	/* Flush swtcl vertices if necessary, because we will change hardware
+	 * state during clear. See also the state-related comment in
+	 * r300EmitClearState.
+	 */
+	R300_NEWPRIM(r300);
 
-		dmabuf->id = r300_mem_alloc(rmesa, 4, size);
+	if (colorMask == ~0)
+	  tri_mask |= (mask & BUFFER_BITS_COLOR);
+	else
+	  tri_mask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
 
-		UNLOCK_HARDWARE(&rmesa->radeon);
 
-		if (dmabuf->id == 0) {
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
-		}
+	/* HW stencil */
+	if (mask & BUFFER_BIT_STENCIL) {
+		tri_mask |= BUFFER_BIT_STENCIL;
 	}
 
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = r300_mem_ptr(rmesa, dmabuf->id);
-	rmesa->dma.current.end = size;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		r300_mem_free(rmesa, region->buf->id);
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
+	/* HW depth */
+	if (mask & BUFFER_BIT_DEPTH) {
+    	        tri_mask |= BUFFER_BIT_DEPTH;
 	}
 
-	region->buf = 0;
-	region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
-			int bytes, int alignment)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
-
-	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa, (bytes + 0x7) & ~0x7);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
-
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
-}
+	/* If we're doing a tri pass for depth/stencil, include a likely color
+	 * buffer with it.
+	 */
 
-#else
-static void r300RefillCurrentDmaRegion(r300ContextPtr rmesa)
-{
-	struct r300_dma_buffer *dmabuf;
-	int fd = rmesa->radeon.dri.fd;
-	int index = 0;
-	int size = 0;
-	drmDMAReq dma;
-	int ret;
-
-	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (rmesa->dma.flush) {
-		rmesa->dma.flush(rmesa);
+	for (i = 0; i < BUFFER_COUNT; i++) {
+	  GLuint bufBit = 1 << i;
+	  if ((tri_mask) & bufBit) {
+	    if (!fb->Attachment[i].Renderbuffer->ClassID) {
+	      tri_mask &= ~bufBit;
+	      swrast_mask |= bufBit;
+	    }
+	  }
 	}
 
-	if (rmesa->dma.current.buf)
-		r300ReleaseDmaRegion(rmesa, &rmesa->dma.current, __FUNCTION__);
-
-	if (rmesa->dma.nr_released_bufs > 4)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	dma.context = rmesa->radeon.dri.hwContext;
-	dma.send_count = 0;
-	dma.send_list = NULL;
-	dma.send_sizes = NULL;
-	dma.flags = 0;
-	dma.request_count = 1;
-	dma.request_size = RADEON_BUFFER_SIZE;
-	dma.request_list = &index;
-	dma.request_sizes = &size;
-	dma.granted_count = 0;
-
-	LOCK_HARDWARE(&rmesa->radeon);	/* no need to validate */
-
-	ret = drmDMA(fd, &dma);
-
-	if (ret != 0) {
-		/* Try to release some buffers and wait until we can't get any more */
-		if (rmesa->dma.nr_released_bufs) {
-			r300FlushCmdBufLocked(rmesa, __FUNCTION__);
-		}
-
-		if (RADEON_DEBUG & DEBUG_DMA)
-			fprintf(stderr, "Waiting for buffers\n");
-
-		radeonWaitForIdleLocked(&rmesa->radeon);
-		ret = drmDMA(fd, &dma);
-
-		if (ret != 0) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			fprintf(stderr,
-				"Error: Could not get dma buffer... exiting\n");
-			_mesa_exit(-1);
+	/* SW fallback clearing */
+	swrast_mask = mask & ~tri_mask;
+
+	ret = 0;
+	if (tri_mask) {
+		if (r300->radeon.radeonScreen->kernel_mm)
+			radeonUserClear(ctx, tri_mask);
+		else {
+			/* if kernel clear fails due to size restraints fallback */
+			ret = r300KernelClear(ctx, tri_mask);
+			if (ret < 0)
+				swrast_mask |= tri_mask;
 		}
 	}
 
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (RADEON_DEBUG & DEBUG_DMA)
-		fprintf(stderr, "Allocated buffer %d\n", index);
-
-	dmabuf = CALLOC_STRUCT(r300_dma_buffer);
-	dmabuf->buf = &rmesa->radeon.radeonScreen->buffers->list[index];
-	dmabuf->refcount = 1;
-
-	rmesa->dma.current.buf = dmabuf;
-	rmesa->dma.current.address = dmabuf->buf->address;
-	rmesa->dma.current.end = dmabuf->buf->total;
-	rmesa->dma.current.start = 0;
-	rmesa->dma.current.ptr = 0;
-}
-
-void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-			  struct r300_dma_region *region, const char *caller)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s from %s\n", __FUNCTION__, caller);
-
-	if (!region->buf)
-		return;
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (--region->buf->refcount == 0) {
-		drm_radeon_cmd_header_t *cmd;
-
-		if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
-			fprintf(stderr, "%s -- DISCARD BUF %d\n",
-				__FUNCTION__, region->buf->buf->idx);
-		cmd =
-		    (drm_radeon_cmd_header_t *) r300AllocCmdBuf(rmesa,
-								sizeof
-								(*cmd) / 4,
-								__FUNCTION__);
-		cmd->dma.cmd_type = R300_CMD_DMA_DISCARD;
-		cmd->dma.buf_idx = region->buf->buf->idx;
-
-		FREE(region->buf);
-		rmesa->dma.nr_released_bufs++;
+	if (swrast_mask) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS)
+			fprintf(stderr, "%s: swrast clear, mask: %x\n",
+				__FUNCTION__, swrast_mask);
+		_swrast_Clear(ctx, swrast_mask);
 	}
-
-	region->buf = 0;
-	region->start = 0;
 }
 
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void r300AllocDmaRegion(r300ContextPtr rmesa,
-			struct r300_dma_region *region,
-			int bytes, int alignment)
-{
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-	if (rmesa->dma.flush)
-		rmesa->dma.flush(rmesa);
-
-	if (region->buf)
-		r300ReleaseDmaRegion(rmesa, region, __FUNCTION__);
-
-	alignment--;
-	rmesa->dma.current.start = rmesa->dma.current.ptr =
-	    (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-	if (rmesa->dma.current.ptr + bytes > rmesa->dma.current.end)
-		r300RefillCurrentDmaRegion(rmesa);
-
-	region->start = rmesa->dma.current.start;
-	region->ptr = rmesa->dma.current.start;
-	region->end = rmesa->dma.current.start + bytes;
-	region->address = rmesa->dma.current.address;
-	region->buf = rmesa->dma.current.buf;
-	region->buf->refcount++;
-
-	rmesa->dma.current.ptr += bytes;	/* bug - if alignment > 7 */
-	rmesa->dma.current.start =
-	    rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;
-
-	assert(rmesa->dma.current.ptr <= rmesa->dma.current.end);
-}
-
-#endif
-
-GLboolean r300IsGartMemory(r300ContextPtr rmesa, const GLvoid * pointer,
-			   GLint size)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	int valid = (size >= 0 && offset >= 0
-		     && offset + size <
-		     rmesa->radeon.radeonScreen->gartTextures.size);
-
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "r300IsGartMemory( %p ) : %d\n", pointer,
-			valid);
-
-	return valid;
-}
-
-GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa, const GLvoid * pointer)
-{
-	int offset =
-	    (char *)pointer -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-
-	//fprintf(stderr, "offset=%08x\n", offset);
-
-	if (offset < 0
-	    || offset > rmesa->radeon.radeonScreen->gartTextures.size)
-		return ~0;
-	else
-		return rmesa->radeon.radeonScreen->gart_texture_offset + offset;
-}
 
 void r300InitIoctlFuncs(struct dd_function_table *functions)
 {
 	functions->Clear = r300Clear;
 	functions->Finish = radeonFinish;
-	functions->Flush = r300Flush;
+	functions->Flush = radeonFlush;
 }
diff --git a/src/mesa/drivers/dri/r300/r300_ioctl.h b/src/mesa/drivers/dri/r300/r300_ioctl.h
index e1143fb6c34..3abfa71a6e8 100644
--- a/src/mesa/drivers/dri/r300/r300_ioctl.h
+++ b/src/mesa/drivers/dri/r300/r300_ioctl.h
@@ -39,22 +39,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "radeon_drm.h"
 
-extern GLboolean r300IsGartMemory(r300ContextPtr rmesa,
-				  const GLvoid * pointer, GLint size);
-
-extern GLuint r300GartOffsetFromVirtual(r300ContextPtr rmesa,
-					const GLvoid * pointer);
-
-extern void r300Flush(GLcontext * ctx);
-
-extern void r300ReleaseDmaRegion(r300ContextPtr rmesa,
-				 struct r300_dma_region *region,
-				 const char *caller);
-extern void r300AllocDmaRegion(r300ContextPtr rmesa,
-			       struct r300_dma_region *region, int bytes,
-			       int alignment);
-
 extern void r300InitIoctlFuncs(struct dd_function_table *functions);
 
-extern void r300RefillCurrentDmaRegion(r300ContextPtr rmesa, int size);
 #endif				/* __R300_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_mem.c b/src/mesa/drivers/dri/r300/r300_mem.c
deleted file mode 100644
index f8f9d4fcdf1..00000000000
--- a/src/mesa/drivers/dri/r300/r300_mem.c
+++ /dev/null
@@ -1,385 +0,0 @@
-/*
- * Copyright (C) 2005 Aapo Tahkola.
- *
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining
- * a copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sublicense, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial
- * portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
- * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
- * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
- * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
- * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- */
-
-/**
- * \file
- *
- * \author Aapo Tahkola <[email protected]>
- */
-
-#include <unistd.h>
-
-#include "r300_context.h"
-#include "r300_cmdbuf.h"
-#include "r300_ioctl.h"
-#include "r300_mem.h"
-#include "radeon_ioctl.h"
-
-#ifdef USER_BUFFERS
-
-static void resize_u_list(r300ContextPtr rmesa)
-{
-	void *temp;
-	int nsize;
-
-	temp = rmesa->rmm->u_list;
-	nsize = rmesa->rmm->u_size * 2;
-
-	rmesa->rmm->u_list = _mesa_malloc(nsize * sizeof(*rmesa->rmm->u_list));
-	_mesa_memset(rmesa->rmm->u_list, 0,
-		     nsize * sizeof(*rmesa->rmm->u_list));
-
-	if (temp) {
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-		_mesa_memcpy(rmesa->rmm->u_list, temp,
-			     rmesa->rmm->u_size * sizeof(*rmesa->rmm->u_list));
-		_mesa_free(temp);
-	}
-
-	rmesa->rmm->u_size = nsize;
-}
-
-void r300_mem_init(r300ContextPtr rmesa)
-{
-	rmesa->rmm = malloc(sizeof(struct r300_memory_manager));
-	memset(rmesa->rmm, 0, sizeof(struct r300_memory_manager));
-
-	rmesa->rmm->u_size = 128;
-	resize_u_list(rmesa);
-}
-
-void r300_mem_destroy(r300ContextPtr rmesa)
-{
-	_mesa_free(rmesa->rmm->u_list);
-	rmesa->rmm->u_list = NULL;
-
-	_mesa_free(rmesa->rmm);
-	rmesa->rmm = NULL;
-}
-
-void *r300_mem_ptr(r300ContextPtr rmesa, int id)
-{
-	assert(id <= rmesa->rmm->u_last);
-	return rmesa->rmm->u_list[id].ptr;
-}
-
-int r300_mem_find(r300ContextPtr rmesa, void *ptr)
-{
-	int i;
-
-	for (i = 1; i < rmesa->rmm->u_size + 1; i++)
-		if (rmesa->rmm->u_list[i].ptr &&
-		    ptr >= rmesa->rmm->u_list[i].ptr &&
-		    ptr <
-		    rmesa->rmm->u_list[i].ptr + rmesa->rmm->u_list[i].size)
-			break;
-
-	if (i < rmesa->rmm->u_size + 1)
-		return i;
-
-	fprintf(stderr, "%p failed\n", ptr);
-	return 0;
-}
-
-//#define MM_DEBUG
-int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size)
-{
-	drm_radeon_mem_alloc_t alloc;
-	int offset = 0, ret;
-	int i, free = -1;
-	int done_age;
-	drm_radeon_mem_free_t memfree;
-	int tries = 0;
-	static int bytes_wasted = 0, allocated = 0;
-
-	if (size < 4096)
-		bytes_wasted += 4096 - size;
-
-	allocated += size;
-
-#if 0
-	static int t = 0;
-	if (t != time(NULL)) {
-		t = time(NULL);
-		fprintf(stderr, "slots used %d, wasted %d kb, allocated %d\n",
-			rmesa->rmm->u_last, bytes_wasted / 1024,
-			allocated / 1024);
-	}
-#endif
-
-	memfree.region = RADEON_MEM_REGION_GART;
-
-      again:
-
-	done_age = radeonGetAge((radeonContextPtr) rmesa);
-
-	if (rmesa->rmm->u_last + 1 >= rmesa->rmm->u_size)
-		resize_u_list(rmesa);
-
-	for (i = rmesa->rmm->u_last + 1; i > 0; i--) {
-		if (rmesa->rmm->u_list[i].ptr == NULL) {
-			free = i;
-			continue;
-		}
-
-		if (rmesa->rmm->u_list[i].h_pending == 0 &&
-		    rmesa->rmm->u_list[i].pending
-		    && rmesa->rmm->u_list[i].age <= done_age) {
-			memfree.region_offset =
-			    (char *)rmesa->rmm->u_list[i].ptr -
-			    (char *)rmesa->radeon.radeonScreen->gartTextures.
-			    map;
-
-			ret =
-			    drmCommandWrite(rmesa->radeon.radeonScreen->
-					    driScreen->fd, DRM_RADEON_FREE,
-					    &memfree, sizeof(memfree));
-
-			if (ret) {
-				fprintf(stderr, "Failed to free at %p\n",
-					rmesa->rmm->u_list[i].ptr);
-				fprintf(stderr, "ret = %s\n", strerror(-ret));
-				exit(1);
-			} else {
-#ifdef MM_DEBUG
-				fprintf(stderr, "really freed %d at age %x\n",
-					i,
-					radeonGetAge((radeonContextPtr) rmesa));
-#endif
-				if (i == rmesa->rmm->u_last)
-					rmesa->rmm->u_last--;
-
-				if (rmesa->rmm->u_list[i].size < 4096)
-					bytes_wasted -=
-					    4096 - rmesa->rmm->u_list[i].size;
-
-				allocated -= rmesa->rmm->u_list[i].size;
-				rmesa->rmm->u_list[i].pending = 0;
-				rmesa->rmm->u_list[i].ptr = NULL;
-				free = i;
-			}
-		}
-	}
-	rmesa->rmm->u_head = i;
-
-	if (free == -1) {
-		WARN_ONCE("Ran out of slots!\n");
-		//usleep(100);
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		tries++;
-		if (tries > 100) {
-			WARN_ONCE("Ran out of slots!\n");
-			exit(1);
-		}
-		goto again;
-	}
-
-	alloc.region = RADEON_MEM_REGION_GART;
-	alloc.alignment = alignment;
-	alloc.size = size;
-	alloc.region_offset = &offset;
-
-	ret =
-	    drmCommandWriteRead(rmesa->radeon.dri.fd, DRM_RADEON_ALLOC, &alloc,
-				sizeof(alloc));
-	if (ret) {
-#if 0
-		WARN_ONCE("Ran out of mem!\n");
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-		//usleep(100);
-		tries2++;
-		tries = 0;
-		if (tries2 > 100) {
-			WARN_ONCE("Ran out of GART memory!\n");
-			exit(1);
-		}
-		goto again;
-#else
-		WARN_ONCE
-		    ("Ran out of GART memory (for %d)!\nPlease consider adjusting GARTSize option.\n",
-		     size);
-		return 0;
-#endif
-	}
-
-	i = free;
-
-	if (i > rmesa->rmm->u_last)
-		rmesa->rmm->u_last = i;
-
-	rmesa->rmm->u_list[i].ptr =
-	    ((GLubyte *) rmesa->radeon.radeonScreen->gartTextures.map) + offset;
-	rmesa->rmm->u_list[i].size = size;
-	rmesa->rmm->u_list[i].age = 0;
-	//fprintf(stderr, "alloc %p at id %d\n", rmesa->rmm->u_list[i].ptr, i);
-
-#ifdef MM_DEBUG
-	fprintf(stderr, "allocated %d at age %x\n", i,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	return i;
-}
-
-void r300_mem_use(r300ContextPtr rmesa, int id)
-{
-	uint64_t ull;
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	drm_r300_cmd_header_t *cmd;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (id == 0)
-		return;
-
-	cmd =
-	    (drm_r300_cmd_header_t *) r300AllocCmdBuf(rmesa,
-						      2 + sizeof(ull) / 4,
-						      __FUNCTION__);
-	cmd[0].scratch.cmd_type = R300_CMD_SCRATCH;
-	cmd[0].scratch.reg = R300_MEM_SCRATCH;
-	cmd[0].scratch.n_bufs = 1;
-	cmd[0].scratch.flags = 0;
-	cmd++;
-
-	ull = (uint64_t) (intptr_t) & rmesa->rmm->u_list[id].age;
-	_mesa_memcpy(cmd, &ull, sizeof(ull));
-	cmd += sizeof(ull) / 4;
-
-	cmd[0].u = /*id */ 0;
-
-	LOCK_HARDWARE(&rmesa->radeon);	/* Protect from DRM. */
-	rmesa->rmm->u_list[id].h_pending++;
-	UNLOCK_HARDWARE(&rmesa->radeon);
-}
-
-unsigned long r300_mem_offset(r300ContextPtr rmesa, int id)
-{
-	unsigned long offset;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	offset = (char *)rmesa->rmm->u_list[id].ptr -
-	    (char *)rmesa->radeon.radeonScreen->gartTextures.map;
-	offset += rmesa->radeon.radeonScreen->gart_texture_offset;
-
-	return offset;
-}
-
-void *r300_mem_map(r300ContextPtr rmesa, int id, int access)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-	void *ptr;
-	int tries = 0;
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (access == R300_MEM_R) {
-
-		if (rmesa->rmm->u_list[id].mapped == 1)
-			WARN_ONCE("buffer %d already mapped\n", id);
-
-		rmesa->rmm->u_list[id].mapped = 1;
-		ptr = r300_mem_ptr(rmesa, id);
-
-		return ptr;
-	}
-
-	if (rmesa->rmm->u_list[id].h_pending)
-		r300FlushCmdBuf(rmesa, __FUNCTION__);
-
-	if (rmesa->rmm->u_list[id].h_pending) {
-		return NULL;
-	}
-
-	while (rmesa->rmm->u_list[id].age >
-	       radeonGetAge((radeonContextPtr) rmesa) && tries++ < 1000)
-		usleep(10);
-
-	if (tries >= 1000) {
-		fprintf(stderr, "Idling failed (%x vs %x)\n",
-			rmesa->rmm->u_list[id].age,
-			radeonGetAge((radeonContextPtr) rmesa));
-		return NULL;
-	}
-
-	if (rmesa->rmm->u_list[id].mapped == 1)
-		WARN_ONCE("buffer %d already mapped\n", id);
-
-	rmesa->rmm->u_list[id].mapped = 1;
-	ptr = r300_mem_ptr(rmesa, id);
-
-	return ptr;
-}
-
-void r300_mem_unmap(r300ContextPtr rmesa, int id)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (rmesa->rmm->u_list[id].mapped == 0)
-		WARN_ONCE("buffer %d not mapped\n", id);
-
-	rmesa->rmm->u_list[id].mapped = 0;
-}
-
-void r300_mem_free(r300ContextPtr rmesa, int id)
-{
-#ifdef MM_DEBUG
-	fprintf(stderr, "%s: %d at age %x\n", __FUNCTION__, id,
-		radeonGetAge((radeonContextPtr) rmesa));
-#endif
-
-	assert(id <= rmesa->rmm->u_last);
-
-	if (id == 0)
-		return;
-
-	if (rmesa->rmm->u_list[id].ptr == NULL) {
-		WARN_ONCE("Not allocated!\n");
-		return;
-	}
-
-	if (rmesa->rmm->u_list[id].pending) {
-		WARN_ONCE("%p already pended!\n", rmesa->rmm->u_list[id].ptr);
-		return;
-	}
-
-	rmesa->rmm->u_list[id].pending = 1;
-}
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_mem.h b/src/mesa/drivers/dri/r300/r300_mem.h
deleted file mode 100644
index 625a7f6d8d5..00000000000
--- a/src/mesa/drivers/dri/r300/r300_mem.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#ifndef __R300_MEM_H__
-#define __R300_MEM_H__
-
-//#define R300_MEM_PDL 0
-#define R300_MEM_UL 1
-
-#define R300_MEM_R 1
-#define R300_MEM_W 2
-#define R300_MEM_RW (R300_MEM_R | R300_MEM_W)
-
-#define R300_MEM_SCRATCH 2
-
-struct r300_memory_manager {
-	struct {
-		void *ptr;
-		uint32_t size;
-		uint32_t age;
-		uint32_t h_pending;
-		int pending;
-		int mapped;
-	} *u_list;
-	int u_head, u_size, u_last;
-
-};
-
-extern void r300_mem_init(r300ContextPtr rmesa);
-extern void r300_mem_destroy(r300ContextPtr rmesa);
-extern void *r300_mem_ptr(r300ContextPtr rmesa, int id);
-extern int r300_mem_find(r300ContextPtr rmesa, void *ptr);
-extern int r300_mem_alloc(r300ContextPtr rmesa, int alignment, int size);
-extern void r300_mem_use(r300ContextPtr rmesa, int id);
-extern unsigned long r300_mem_offset(r300ContextPtr rmesa, int id);
-extern void *r300_mem_map(r300ContextPtr rmesa, int id, int access);
-extern void r300_mem_unmap(r300ContextPtr rmesa, int id);
-extern void r300_mem_free(r300ContextPtr rmesa, int id);
-
-#endif
diff --git a/src/mesa/drivers/dri/r300/r300_reg.h b/src/mesa/drivers/dri/r300/r300_reg.h
index 8f1a6630d5c..357c600af97 100644
--- a/src/mesa/drivers/dri/r300/r300_reg.h
+++ b/src/mesa/drivers/dri/r300/r300_reg.h
@@ -1467,6 +1467,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R300_TX_FORMAT_3D		   (1 << 25)
 #	define R300_TX_FORMAT_CUBIC_MAP		   (2 << 25)
 
+#	define R300_TX_FORMAT_GAMMA			(1 << 21)
+
 	/* gap */
 	/* Floating point formats */
 	/* Note - hardware supports both 16 and 32 bit floating point */
@@ -1531,6 +1533,13 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #	define R500_SEL_FILTER4_TC3		 (3 << 18)
 
 #define R300_TX_OFFSET_0                    0x4540
+#define R300_TX_OFFSET_1                    0x4544
+#define R300_TX_OFFSET_2                    0x4548
+#define R300_TX_OFFSET_3                    0x454C
+#define R300_TX_OFFSET_4                    0x4550
+#define R300_TX_OFFSET_5                    0x4554
+#define R300_TX_OFFSET_6                    0x4558
+#define R300_TX_OFFSET_7                    0x455C
 	/* BEGIN: Guess from R200 */
 #       define R300_TXO_ENDIAN_NO_SWAP           (0 << 0)
 #       define R300_TXO_ENDIAN_BYTE_SWAP         (1 << 0)
@@ -2425,6 +2434,12 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /* Z Buffer Clear Value */
 #define R300_ZB_DEPTHCLEARVALUE                  0x4f28
 
+#define R300_ZB_ZMASK_OFFSET                     0x4f30
+#define R300_ZB_ZMASK_PITCH                      0x4f34
+#define R300_ZB_ZMASK_WRINDEX                    0x4f38
+#define R300_ZB_ZMASK_DWORD                      0x4f3c
+#define R300_ZB_ZMASK_RDINDEX                    0x4f40
+
 /* Hierarchical Z Memory Offset */
 #define R300_ZB_HIZ_OFFSET                       0x4f44
 
@@ -3165,6 +3180,9 @@ enum {
 #   define R300_W_SRC_RAS				(1 << 2)
 
 
+/* Packet0 field ordering to write all values to the same reg */
+#define RADEON_ONE_REG_WR        (1 << 15)
+
 /* Draw a primitive from vertex data in arrays loaded via 3D_LOAD_VBPNTR.
  * Two parameter dwords:
  * 0. VAP_VTX_FMT: The first parameter is not written to hardware
diff --git a/src/mesa/drivers/dri/r300/r300_render.c b/src/mesa/drivers/dri/r300/r300_render.c
index 16ce4a11999..08d67b73edc 100644
--- a/src/mesa/drivers/dri/r300/r300_render.c
+++ b/src/mesa/drivers/dri/r300/r300_render.c
@@ -50,6 +50,8 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
  * no bugs...
  */
 
+#include "r300_render.h"
+
 #include "main/glheader.h"
 #include "main/state.h"
 #include "main/imports.h"
@@ -66,16 +68,14 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/t_vp_build.h"
 #include "radeon_reg.h"
 #include "radeon_macros.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_tex.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
-extern int future_hw_tcl_on;
+#include "r300_fragprog_common.h"
+#include "r300_swtcl.h"
 
 /**
  * \brief Convert a OpenGL primitive type into a R300 primitive type.
@@ -172,96 +172,186 @@ int r300NumVerts(r300ContextPtr rmesa, int num_verts, int prim)
 	return num_verts - verts_off;
 }
 
-static void r300EmitElts(GLcontext * ctx, void *elts, unsigned long n_elts)
+static void r300EmitElts(GLcontext * ctx, unsigned long n_elts)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_dma_region *rvb = &rmesa->state.elt_dma;
 	void *out;
+	GLuint size;
 
-	if (r300IsGartMemory(rmesa, elts, n_elts * 4)) {
-		rvb->address = rmesa->radeon.radeonScreen->gartTextures.map;
-		rvb->start = ((char *)elts) - rvb->address;
-		rvb->aos_offset =
-		    rmesa->radeon.radeonScreen->gart_texture_offset +
-		    rvb->start;
-		return;
-	} else if (r300IsGartMemory(rmesa, elts, 1)) {
-		WARN_ONCE("Pointer not within GART memory!\n");
-		_mesa_exit(-1);
-	}
-
-	r300AllocDmaRegion(rmesa, rvb, n_elts * 4, 4);
-	rvb->aos_offset = GET_START(rvb);
+	size = ((rmesa->ind_buf.is_32bit ? 4 : 2) * n_elts + 3) & ~3;
 
-	out = rvb->address + rvb->start;
-	memcpy(out, elts, n_elts * 4);
+	radeonAllocDmaRegion(&rmesa->radeon, &rmesa->radeon.tcl.elt_dma_bo,
+			     &rmesa->radeon.tcl.elt_dma_offset, size, 4);
+	radeon_bo_map(rmesa->radeon.tcl.elt_dma_bo, 1);
+	out = rmesa->radeon.tcl.elt_dma_bo->ptr + rmesa->radeon.tcl.elt_dma_offset;
+	memcpy(out, rmesa->ind_buf.ptr, size);
+	radeon_bo_unmap(rmesa->radeon.tcl.elt_dma_bo);
 }
 
-static void r300FireEB(r300ContextPtr rmesa, unsigned long addr,
-		       int vertex_count, int type)
+static void r300FireEB(r300ContextPtr rmesa, int vertex_count, int type)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
-
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (vertex_count << 16) | type | R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
-
-	start_packet3(CP_PACKET3(R300_PACKET3_INDX_BUFFER, 2), 2);
-	e32(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
-	    (R300_VAP_PORT_IDX0 >> 2));
-	e32(addr);
-	e32(vertex_count);
+	BATCH_LOCALS(&rmesa->radeon);
+
+    r300_emit_scissor(rmesa->radeon.glCtx);
+	if (vertex_count > 0) {
+		int size;
+
+		BEGIN_BATCH(10);
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_INDX_2, 0);
+		if (rmesa->ind_buf.is_32bit) {
+			size = vertex_count;
+			OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+			  ((vertex_count + 0) << 16) | type |
+			  R300_VAP_VF_CNTL__INDEX_SIZE_32bit);
+		} else {
+			size = (vertex_count + 1) >> 1;
+			OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_INDICES |
+			   ((vertex_count + 0) << 16) | type);
+		}
+
+		if (!rmesa->radeon.radeonScreen->kernel_mm) {
+			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+	    			 (R300_VAP_PORT_IDX0 >> 2));
+			OUT_BATCH_RELOC(rmesa->radeon.tcl.elt_dma_offset,
+					rmesa->radeon.tcl.elt_dma_bo,
+					rmesa->radeon.tcl.elt_dma_offset,
+					RADEON_GEM_DOMAIN_GTT, 0, 0);
+			OUT_BATCH(size);
+		} else {
+			OUT_BATCH_PACKET3(R300_PACKET3_INDX_BUFFER, 2);
+			OUT_BATCH(R300_INDX_BUFFER_ONE_REG_WR | (0 << R300_INDX_BUFFER_SKIP_SHIFT) |
+	    			 (R300_VAP_PORT_IDX0 >> 2));
+			OUT_BATCH(rmesa->radeon.tcl.elt_dma_offset);
+			OUT_BATCH(size);
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.elt_dma_bo,
+					      RADEON_GEM_DOMAIN_GTT, 0, 0);
+		}
+		END_BATCH();
+	}
 }
 
 static void r300EmitAOS(r300ContextPtr rmesa, GLuint nr, GLuint offset)
 {
+	BATCH_LOCALS(&rmesa->radeon);
+	uint32_t voffset;
 	int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
 	int i;
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
 
 	if (RADEON_DEBUG & DEBUG_VERTS)
 		fprintf(stderr, "%s: nr=%d, ofs=0x%08x\n", __FUNCTION__, nr,
 			offset);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1), sz - 1);
-	e32(nr);
+	if (!rmesa->radeon.radeonScreen->kernel_mm) {
+		BEGIN_BATCH(sz+2+(nr * 2));
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+		OUT_BATCH(nr);
+
+		for (i = 0; i + 1 < nr; i += 2) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+				  (rmesa->radeon.tcl.aos[i].stride << 8) |
+				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[i].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+			  offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[i+1].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+		}
 
-	for (i = 0; i + 1 < nr; i += 2) {
-		e32((rmesa->state.aos[i].aos_size << 0) |
-		    (rmesa->state.aos[i].aos_stride << 8) |
-		    (rmesa->state.aos[i + 1].aos_size << 16) |
-		    (rmesa->state.aos[i + 1].aos_stride << 24));
+		if (nr & 1) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+				  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			OUT_BATCH_RELOC(voffset,
+					rmesa->radeon.tcl.aos[nr - 1].bo,
+					voffset,
+					RADEON_GEM_DOMAIN_GTT,
+					0, 0);
+		}
+		END_BATCH();
+	} else {
 
-		e32(rmesa->state.aos[i].aos_offset + offset * 4 * rmesa->state.aos[i].aos_stride);
-		e32(rmesa->state.aos[i + 1].aos_offset + offset * 4 * rmesa->state.aos[i + 1].aos_stride);
-	}
+		BEGIN_BATCH(sz+2+(nr * 2));
+		OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+		OUT_BATCH(nr);
+
+		for (i = 0; i + 1 < nr; i += 2) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+				  (rmesa->radeon.tcl.aos[i].stride << 8) |
+				  (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+				  (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			OUT_BATCH(voffset);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			OUT_BATCH(voffset);
+		}
 
-	if (nr & 1) {
-		e32((rmesa->state.aos[nr - 1].aos_size << 0) |
-		    (rmesa->state.aos[nr - 1].aos_stride << 8));
-		e32(rmesa->state.aos[nr - 1].aos_offset + offset * 4 * rmesa->state.aos[nr - 1].aos_stride);
+		if (nr & 1) {
+			OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+			  (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			OUT_BATCH(voffset);
+		}
+		for (i = 0; i + 1 < nr; i += 2) {
+			voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[i+0].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+			voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[i+1].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+		}
+		if (nr & 1) {
+			voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+				offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+			radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+					      rmesa->radeon.tcl.aos[nr-1].bo,
+					      RADEON_GEM_DOMAIN_GTT,
+					      0, 0);
+		}
+		END_BATCH();
 	}
+
 }
 
 static void r300FireAOS(r300ContextPtr rmesa, int vertex_count, int type)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
-	drm_radeon_cmd_header_t *cmd = NULL;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+    r300_emit_scissor(rmesa->radeon.glCtx);
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (vertex_count << 16) | type);
+	END_BATCH();
 }
 
-static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
-				   int start, int end, int prim)
+void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
-	TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *vb = &tnl->vb;
 
 	type = r300PrimitiveType(rmesa, prim);
 	num_verts = r300NumVerts(rmesa, end - start, prim);
@@ -269,131 +359,150 @@ static void r300RunRenderPrimitive(r300ContextPtr rmesa, GLcontext * ctx,
 	if (type < 0 || num_verts <= 0)
 		return;
 
-	if (vb->Elts) {
-		if (num_verts > 65535) {
-			/* not implemented yet */
-			WARN_ONCE("Too many elts\n");
-			return;
+	/* Make space for at least 128 dwords.
+	 * This is supposed to ensure that we can get all rendering
+	 * commands into a single command buffer.
+	 */
+	rcommonEnsureCmdBufSpace(&rmesa->radeon, 128, __FUNCTION__);
+
+	if (rmesa->ind_buf.ptr) {
+		r300EmitElts(ctx, num_verts);
+		r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, 0);
+		if (rmesa->radeon.radeonScreen->kernel_mm) {
+			BEGIN_BATCH_NO_AUTOSTATE(2);
+			OUT_BATCH_REGSEQ(R300_VAP_VF_MAX_VTX_INDX, 1);
+			OUT_BATCH(rmesa->radeon.tcl.aos[0].count);
+			END_BATCH();
 		}
-		/* Note: The following is incorrect, but it's the best I can do
-		 * without a major refactoring of how DMA memory is handled.
-		 * The problem: Ensuring that both vertex arrays *and* index
-		 * arrays are at the right position, and then ensuring that
-		 * the LOAD_VBPNTR, DRAW_INDX and INDX_BUFFER packets are emitted
-		 * at once.
-		 *
-		 * So why is the following incorrect? Well, it seems like
-		 * allocating the index array might actually evict the vertex
-		 * arrays. *sigh*
-		 */
-		r300EmitElts(ctx, vb->Elts, num_verts);
-		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
-		r300FireEB(rmesa, rmesa->state.elt_dma.aos_offset, num_verts, type);
+		r300FireEB(rmesa, num_verts, type);
 	} else {
-		r300EmitAOS(rmesa, rmesa->state.aos_count, start);
+		r300EmitAOS(rmesa, rmesa->radeon.tcl.aos_count, start);
 		r300FireAOS(rmesa, num_verts, type);
 	}
+	COMMIT_BATCH();
 }
 
-static GLboolean r300RunRender(GLcontext * ctx,
-			       struct tnl_pipeline_stage *stage)
+static void r300RunRender(GLcontext * ctx, struct tnl_pipeline_stage *stage)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	int i;
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct vertex_buffer *vb = &tnl->vb;
 
-
 	if (RADEON_DEBUG & DEBUG_PRIMS)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
 	r300UpdateShaders(rmesa);
-	if (r300EmitArrays(ctx))
-		return GL_TRUE;
+	r300EmitArrays(ctx);
 
 	r300UpdateShaderStates(rmesa);
 
 	r300EmitCacheFlush(rmesa);
-	r300EmitState(rmesa);
+	radeonEmitState(&rmesa->radeon);
 
 	for (i = 0; i < vb->PrimitiveCount; i++) {
 		GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
 		GLuint start = vb->Primitive[i].start;
 		GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
-		r300RunRenderPrimitive(rmesa, ctx, start, end, prim);
+		r300RunRenderPrimitive(ctx, start, end, prim);
 	}
 
 	r300EmitCacheFlush(rmesa);
 
-#ifdef USER_BUFFERS
-	r300UseArrays(ctx);
-#endif
+	radeonReleaseArrays(ctx, ~0);
+}
 
-	r300ReleaseArrays(ctx);
 
-	return GL_FALSE;
+static const char *getFallbackString(uint32_t bit)
+{
+	switch (bit) {
+		case R300_FALLBACK_VERTEX_PROGRAM :
+			return "vertex program";
+		case R300_FALLBACK_LINE_SMOOTH:
+			return "smooth lines";
+		case R300_FALLBACK_POINT_SMOOTH:
+			return "smooth points";
+		case R300_FALLBACK_POLYGON_SMOOTH:
+			return "smooth polygons";
+		case R300_FALLBACK_LINE_STIPPLE:
+			return "line stipple";
+		case R300_FALLBACK_POLYGON_STIPPLE:
+			return "polygon stipple";
+		case R300_FALLBACK_STENCIL_TWOSIDE:
+			return "two-sided stencil";
+		case R300_FALLBACK_RENDER_MODE:
+			return "render mode != GL_RENDER";
+		case R300_FALLBACK_FRAGMENT_PROGRAM:
+			return "fragment program";
+		case R300_FALLBACK_AOS_LIMIT:
+			return "aos limit";
+		case R300_FALLBACK_INVALID_BUFFERS:
+			return "invalid buffers";
+		default:
+			return "unknown";
+	}
 }
 
-#define FALLBACK_IF(expr)						\
-	do {								\
-		if (expr) {						\
-			if (1 || RADEON_DEBUG & DEBUG_FALLBACKS)	\
-				WARN_ONCE("Software fallback:%s\n",	\
-					  #expr);			\
-			return R300_FALLBACK_RAST;			\
-		}							\
-	} while(0)
-
-static int r300Fallback(GLcontext * ctx)
+void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode)
 {
-	r300ContextPtr r300 = R300_CONTEXT(ctx);
-	const unsigned back = ctx->Stencil._BackFace;
-
-	/* Do we need to use new-style shaders?
-	 * Also is there a better way to do this? */
-	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-		struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r500TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
+	TNLcontext *tnl = TNL_CONTEXT(ctx);
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	uint32_t old_fallback = rmesa->fallback;
+	static uint32_t fallback_warn = 0;
+	
+	if (mode) {
+		if ((fallback_warn & bit) == 0) {
+			if (RADEON_DEBUG & DEBUG_FALLBACKS)
+				_mesa_fprintf(stderr, "WARNING! Falling back to software for %s\n", getFallbackString(bit));
+			fallback_warn |= bit;
+		}
+		rmesa->fallback |= bit;
+
+		/* update only if we change from no tcl fallbacks to some tcl fallbacks */
+		if (rmesa->options.hw_tcl_enabled) {
+			if (((old_fallback & R300_TCL_FALLBACK_MASK) == 0) &&
+				((bit & R300_TCL_FALLBACK_MASK) > 0)) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] |= R300_VAP_TCL_BYPASS;
 			}
 		}
+
+		/* update only if we change from no raster fallbacks to some raster fallbacks */
+		if (((old_fallback & R300_RASTER_FALLBACK_MASK) == 0) &&
+			((bit & R300_RASTER_FALLBACK_MASK) > 0)) {
+			
+			radeon_firevertices(&rmesa->radeon);
+			rmesa->radeon.swtcl.RenderIndex = ~0;
+			_swsetup_Wakeup( ctx );
+		}
 	} else {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-		if (fp) {
-			if (!fp->translated) {
-				r300TranslateFragmentShader(r300, fp);
-				FALLBACK_IF(!fp->translated);
+		rmesa->fallback &= ~bit;
+
+		/* update only if we have disabled all tcl fallbacks */
+		if (rmesa->options.hw_tcl_enabled) {
+			if ((old_fallback & R300_RASTER_FALLBACK_MASK) == bit) {
+				R300_STATECHANGE(rmesa, vap_cntl_status);
+				rmesa->hw.vap_cntl_status.cmd[1] &= ~R300_VAP_TCL_BYPASS;
 			}
 		}
-	}
-
-	FALLBACK_IF(ctx->RenderMode != GL_RENDER);
 
-	/* If GL_EXT_stencil_two_side is disabled, this fallback check can
-	 * be removed.
-	 */
-	FALLBACK_IF(ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
-		    || ctx->Stencil.ValueMask[0] !=
-		    ctx->Stencil.ValueMask[back]
-		    || ctx->Stencil.WriteMask[0] !=
-		    ctx->Stencil.WriteMask[back]);
-
-	if (ctx->Extensions.NV_point_sprite || ctx->Extensions.ARB_point_sprite)
-		FALLBACK_IF(ctx->Point.PointSprite);
-
-	if (!r300->disable_lowimpact_fallback) {
-		FALLBACK_IF(ctx->Polygon.StippleFlag);
-		FALLBACK_IF(ctx->Multisample._Enabled);
-		FALLBACK_IF(ctx->Line.StippleFlag);
-		FALLBACK_IF(ctx->Line.SmoothFlag);
-		FALLBACK_IF(ctx->Point.SmoothFlag);
+		/* update only if we have disabled all raster fallbacks */
+		if ((old_fallback & R300_RASTER_FALLBACK_MASK) == bit) {
+			_swrast_flush( ctx );
+			
+			tnl->Driver.Render.Start = r300RenderStart;
+			tnl->Driver.Render.Finish = r300RenderFinish;
+			tnl->Driver.Render.PrimitiveNotify = r300RenderPrimitive;
+			tnl->Driver.Render.ResetLineStipple = r300ResetLineStipple;
+			tnl->Driver.Render.BuildVertices = _tnl_build_vertices;
+			tnl->Driver.Render.CopyPV = _tnl_copy_pv;
+			tnl->Driver.Render.Interp = _tnl_interp;
+			
+			_tnl_invalidate_vertex_state( ctx, ~0 );
+			_tnl_invalidate_vertices( ctx, ~0 );
+		}
 	}
-
-	return R300_FALLBACK_NONE;
+	
 }
 
 static GLboolean r300RunNonTCLRender(GLcontext * ctx,
@@ -404,43 +513,15 @@ static GLboolean r300RunNonTCLRender(GLcontext * ctx,
 	if (RADEON_DEBUG & DEBUG_PRIMS)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	if (r300Fallback(ctx) >= R300_FALLBACK_RAST)
+	if (rmesa->fallback & R300_RASTER_FALLBACK_MASK)
 		return GL_TRUE;
 
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
- 	        return GL_TRUE;
-
-	return r300RunRender(ctx, stage);
-}
-
-static GLboolean r300RunTCLRender(GLcontext * ctx,
-				  struct tnl_pipeline_stage *stage)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_vertex_program *vp;
-
-	hw_tcl_on = future_hw_tcl_on;
-
-	if (RADEON_DEBUG & DEBUG_PRIMS)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	if (hw_tcl_on == GL_FALSE)
-		return GL_TRUE;
-
-	if (r300Fallback(ctx) >= R300_FALLBACK_TCL) {
-		hw_tcl_on = GL_FALSE;
+	if (rmesa->options.hw_tcl_enabled == GL_FALSE)
 		return GL_TRUE;
-	}
 
-	r300UpdateShaders(rmesa);
-
-	vp = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	if (vp->native == GL_FALSE) {
-		hw_tcl_on = GL_FALSE;
-		return GL_TRUE;
-	}
+	r300RunRender(ctx, stage);
 
-	return r300RunRender(ctx, stage);
+	return GL_FALSE;
 }
 
 const struct tnl_pipeline_stage _r300_render_stage = {
@@ -451,12 +532,3 @@ const struct tnl_pipeline_stage _r300_render_stage = {
 	NULL,
 	r300RunNonTCLRender
 };
-
-const struct tnl_pipeline_stage _r300_tcl_stage = {
-	"r300 Hardware Transform, Clipping and Lighting",
-	NULL,
-	NULL,
-	NULL,
-	NULL,
-	r300RunTCLRender
-};
diff --git a/src/mesa/drivers/dri/r300/r300_render.h b/src/mesa/drivers/dri/r300/r300_render.h
new file mode 100644
index 00000000000..ec785474a67
--- /dev/null
+++ b/src/mesa/drivers/dri/r300/r300_render.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __R300_RENDER_H__
+#define __R300_RENDER_H__
+
+#include "main/mtypes.h"
+
+#define R300_FALLBACK_VERTEX_PROGRAM    (1 << 0)
+#define R300_TCL_FALLBACK_MASK           0x0000ffff
+
+#define R300_FALLBACK_LINE_SMOOTH       (1 << 16)
+#define R300_FALLBACK_POINT_SMOOTH      (1 << 17)
+#define R300_FALLBACK_POLYGON_SMOOTH    (1 << 18)
+#define R300_FALLBACK_LINE_STIPPLE      (1 << 19)
+#define R300_FALLBACK_POLYGON_STIPPLE   (1 << 20)
+#define R300_FALLBACK_STENCIL_TWOSIDE   (1 << 21)
+#define R300_FALLBACK_RENDER_MODE       (1 << 22)
+#define R300_FALLBACK_FRAGMENT_PROGRAM  (1 << 23)
+#define R300_FALLBACK_AOS_LIMIT         (1 << 30)
+#define R300_FALLBACK_INVALID_BUFFERS   (1 << 31)
+#define R300_RASTER_FALLBACK_MASK        0xffff0000
+
+#define MASK_XYZW (R300_WRITE_ENA_X | R300_WRITE_ENA_Y | R300_WRITE_ENA_Z | R300_WRITE_ENA_W)
+#define MASK_X R300_WRITE_ENA_X
+#define MASK_Y R300_WRITE_ENA_Y
+#define MASK_Z R300_WRITE_ENA_Z
+#define MASK_W R300_WRITE_ENA_W
+
+#if SWIZZLE_X != R300_INPUT_ROUTE_SELECT_X || \
+    SWIZZLE_Y != R300_INPUT_ROUTE_SELECT_Y || \
+    SWIZZLE_Z != R300_INPUT_ROUTE_SELECT_Z || \
+    SWIZZLE_W != R300_INPUT_ROUTE_SELECT_W || \
+    SWIZZLE_ZERO != R300_INPUT_ROUTE_SELECT_ZERO || \
+    SWIZZLE_ONE != R300_INPUT_ROUTE_SELECT_ONE
+#error Cannot change these!
+#endif
+
+extern const struct tnl_pipeline_stage _r300_render_stage;
+
+extern void r300SwitchFallback(GLcontext *ctx, uint32_t bit, GLboolean mode);
+
+extern void r300RunRenderPrimitive(GLcontext * ctx, int start, int end, int prim);
+
+#endif
diff --git a/src/mesa/drivers/dri/r300/r300_shader.c b/src/mesa/drivers/dri/r300/r300_shader.c
index f30fd986e0f..62228a3786e 100644
--- a/src/mesa/drivers/dri/r300/r300_shader.c
+++ b/src/mesa/drivers/dri/r300/r300_shader.c
@@ -1,47 +1,78 @@
+/*
+ * Copyright 2009 Maciej Cencora <[email protected]>
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
 
 #include "main/glheader.h"
 
 #include "shader/program.h"
 #include "tnl/tnl.h"
 #include "r300_context.h"
-#include "r300_fragprog.h"
+#include "r300_fragprog_common.h"
+
+static void freeFragProgCache(GLcontext *ctx, struct r300_fragment_program_cont *cache)
+{
+	struct r300_fragment_program *tmp, *fp = cache->progs;
+
+	while (fp) {
+		tmp = fp->next;
+		_mesa_reference_program(ctx, &fp->Base, NULL);
+		_mesa_free(fp);
+		fp = tmp;
+	}
+}
+
+static void freeVertProgCache(GLcontext *ctx, struct r300_vertex_program_cont *cache)
+{
+	struct r300_vertex_program *tmp, *vp = cache->progs;
+
+	while (vp) {
+		tmp = vp->next;
+		_mesa_reference_vertprog(ctx, &vp->Base, NULL);
+		_mesa_free(vp);
+		vp = tmp;
+	}
+}
 
 static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 					 GLuint id)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct r300_vertex_program_cont *vp;
-	struct r300_fragment_program *r300_fp;
-	struct r500_fragment_program *r500_fp;
+	struct r300_fragment_program_cont *fp;
 
 	switch (target) {
 	case GL_VERTEX_STATE_PROGRAM_NV:
 	case GL_VERTEX_PROGRAM_ARB:
 		vp = CALLOC_STRUCT(r300_vertex_program_cont);
-		return _mesa_init_vertex_program(ctx, &vp->mesa_program,
-						 target, id);
-	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			r500_fp->ctx = ctx;
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
+		return _mesa_init_vertex_program(ctx, &vp->mesa_program, target, id);
 
 	case GL_FRAGMENT_PROGRAM_NV:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-			r500_fp = CALLOC_STRUCT(r500_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r500_fp->mesa_program,
-							   target, id);
-		} else {
-			r300_fp = CALLOC_STRUCT(r300_fragment_program);
-			return _mesa_init_fragment_program(ctx, &r300_fp->mesa_program,
-							   target, id);
-		}
+	case GL_FRAGMENT_PROGRAM_ARB:
+		fp = CALLOC_STRUCT(r300_fragment_program_cont);
+		return _mesa_init_fragment_program(ctx, &fp->Base, target, id);
+
 	default:
 		_mesa_problem(ctx, "Bad target in r300NewProgram");
 	}
@@ -51,26 +82,35 @@ static struct gl_program *r300NewProgram(GLcontext * ctx, GLenum target,
 
 static void r300DeleteProgram(GLcontext * ctx, struct gl_program *prog)
 {
+	struct r300_vertex_program_cont *vp = (struct r300_vertex_program_cont *)prog;
+	struct r300_fragment_program_cont *fp = (struct r300_fragment_program_cont *)prog;
+
+	switch (prog->Target) {
+		case GL_VERTEX_PROGRAM_ARB:
+			freeVertProgCache(ctx, vp);
+			break;
+		case GL_FRAGMENT_PROGRAM_ARB:
+			freeFragProgCache(ctx, fp);
+			break;
+	}
+
 	_mesa_delete_program(ctx, prog);
 }
 
 static void
 r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct r300_vertex_program_cont *vp = (void *)prog;
-	struct r300_fragment_program *r300_fp = (struct r300_fragment_program *)prog;
-	struct r500_fragment_program *r500_fp = (struct r500_fragment_program *)prog;
+	struct r300_vertex_program_cont *vp = (struct r300_vertex_program_cont *)prog;
+	struct r300_fragment_program_cont *fp = (struct r300_fragment_program_cont *)prog;
 
 	switch (target) {
 	case GL_VERTEX_PROGRAM_ARB:
+		freeVertProgCache(ctx, vp);
 		vp->progs = NULL;
 		break;
 	case GL_FRAGMENT_PROGRAM_ARB:
-		if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-			r500_fp->translated = GL_FALSE;
-		else
-			r300_fp->translated = GL_FALSE;
+		freeFragProgCache(ctx, fp);
+		fp->progs = NULL;
 		break;
 	}
 
@@ -81,7 +121,19 @@ r300ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
 static GLboolean
 r300IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
 {
-	return GL_TRUE;
+	if (target == GL_FRAGMENT_PROGRAM_ARB) {
+		struct r300_fragment_program *fp = r300SelectFragmentShader(ctx);
+		if (!fp->translated)
+			r300TranslateFragmentShader(ctx, fp);
+
+		return !fp->error;
+	} else {
+		struct r300_vertex_program *vp = r300SelectVertexShader(ctx);
+		if (!vp->translated)
+			r300TranslateVertexShader(vp);
+
+		return !vp->error;
+	}
 }
 
 void r300InitShaderFuncs(struct dd_function_table *functions)
diff --git a/src/mesa/drivers/dri/r300/r300_state.c b/src/mesa/drivers/dri/r300/r300_state.c
index 79f0b3625ca..12fbf281d99 100644
--- a/src/mesa/drivers/dri/r300/r300_state.c
+++ b/src/mesa/drivers/dri/r300/r300_state.c
@@ -42,6 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/context.h"
 #include "main/dd.h"
+#include "main/framebuffer.h"
 #include "main/simple_list.h"
 #include "main/api_arrayelt.h"
 #include "main/texformat.h"
@@ -52,22 +53,22 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "shader/prog_statevars.h"
 #include "vbo/vbo.h"
 #include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
 
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
 #include "r300_context.h"
 #include "r300_ioctl.h"
 #include "r300_state.h"
 #include "r300_reg.h"
 #include "r300_emit.h"
-#include "r300_fragprog.h"
 #include "r300_tex.h"
+#include "r300_fragprog_common.h"
+#include "r300_fragprog.h"
+#include "r500_fragprog.h"
+#include "r300_render.h"
+#include "r300_vertprog.h"
 
 #include "drirenderbuffer.h"
 
-extern int future_hw_tcl_on;
-extern void _tnl_UpdateFixedFunctionProgram(GLcontext * ctx);
-
 static void r300BlendColor(GLcontext * ctx, const GLfloat cf[4])
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
@@ -366,7 +367,7 @@ static void r300ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 	GLint *ip;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!rmesa->options.hw_tcl_enabled)
 			return;
 
 	p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
@@ -385,7 +386,7 @@ static void r300SetClipPlaneState(GLcontext * ctx, GLenum cap, GLboolean state)
 	GLuint p;
 
 	/* no VAP UCP on non-TCL chipsets */
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
+	if (!r300->options.hw_tcl_enabled)
 		return;
 
 	p = cap - GL_CLIP_PLANE0;
@@ -433,6 +434,10 @@ static void r300UpdateCulling(GLcontext * ctx)
 		break;
 	}
 
+	/* Winding is inverted when rendering to FBO */
+	if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+		val ^= R300_FRONT_FACE_CW;
+
 	R300_STATECHANGE(r300, cul);
 	r300->hw.cul.cmd[R300_CUL_CULL] = val;
 }
@@ -453,22 +458,14 @@ static GLboolean current_fragment_program_writes_depth(GLcontext* ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 
-	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		struct r300_fragment_program *fp = (struct r300_fragment_program *)
-			(char *)ctx->FragmentProgram._Current;
-		return (fp && fp->WritesDepth);
-	} else {
-		struct r500_fragment_program* fp =
-			(struct r500_fragment_program*)(char*)
-			ctx->FragmentProgram._Current;
-		return (fp && fp->writes_depth);
-	}
+	return ctx->FragmentProgram._Current && r300->selected_fp->writes_depth;
 }
 
 static void r300SetEarlyZState(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	GLuint topZ = R300_ZTOP_ENABLE;
+	GLuint w_fmt, fgdepthsrc;
 
 	if (ctx->Color.AlphaEnabled && ctx->Color.AlphaFunc != GL_ALWAYS)
 		topZ = R300_ZTOP_DISABLE;
@@ -485,6 +482,26 @@ static void r300SetEarlyZState(GLcontext * ctx)
 		R300_STATECHANGE(r300, zstencil_format);
 		r300->hw.zstencil_format.cmd[2] = topZ;
 	}
+
+	/* w_fmt value is set to get best performance
+	* see p.130 R5xx 3D acceleration guide v1.3 */
+	if (current_fragment_program_writes_depth(ctx)) {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
+		w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
+	} else {
+		fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
+		w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
+	}
+
+	if (w_fmt != r300->hw.us_out_fmt.cmd[5]) {
+		R300_STATECHANGE(r300, us_out_fmt);
+		r300->hw.us_out_fmt.cmd[5] = w_fmt;
+	}
+
+	if (fgdepthsrc != r300->hw.fg_depth_src.cmd[1]) {
+		R300_STATECHANGE(r300, fg_depth_src);
+		r300->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+	}
 }
 
 static void r300SetAlphaState(GLcontext * ctx)
@@ -535,8 +552,6 @@ static void r300SetAlphaState(GLcontext * ctx)
 	R300_STATECHANGE(r300, at);
 	r300->hw.at.cmd[R300_AT_ALPHA_TEST] = pp_misc;
 	r300->hw.at.cmd[R300_AT_UNKNOWN] = 0;
-
-	r300SetEarlyZState(ctx);
 }
 
 static void r300AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref)
@@ -584,15 +599,35 @@ static void r300SetDepthState(GLcontext * ctx)
 		r300->hw.zs.cmd[R300_ZS_CNTL_1] |=
 		    translate_func(ctx->Depth.Func) << R300_Z_FUNC_SHIFT;
 	}
+}
 
-	r300SetEarlyZState(ctx);
+static void r300CatchStencilFallback(GLcontext *ctx)
+{
+	const unsigned back = ctx->Stencil._BackFace;
+
+	if (ctx->Stencil._Enabled && (ctx->Stencil.Ref[0] != ctx->Stencil.Ref[back]
+		|| ctx->Stencil.ValueMask[0] != ctx->Stencil.ValueMask[back]
+		|| ctx->Stencil.WriteMask[0] != ctx->Stencil.WriteMask[back])) {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_TRUE);
+	} else {
+		r300SwitchFallback(ctx, R300_FALLBACK_STENCIL_TWOSIDE, GL_FALSE);
+	}
 }
 
 static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLboolean hw_stencil = GL_FALSE;
 
-	if (r300->state.stencil.hw_stencil) {
+	r300CatchStencilFallback(ctx);
+
+	if (ctx->DrawBuffer) {
+		struct radeon_renderbuffer *rrbStencil
+			= radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+		hw_stencil = (rrbStencil && rrbStencil->bo);
+	}
+
+	if (hw_stencil) {
 		R300_STATECHANGE(r300, zs);
 		if (state) {
 			r300->hw.zs.cmd[R300_ZS_CNTL_0] |=
@@ -601,10 +636,6 @@ static void r300SetStencilState(GLcontext * ctx, GLboolean state)
 			r300->hw.zs.cmd[R300_ZS_CNTL_0] &=
 			    ~R300_STENCIL_ENABLE;
 		}
-	} else {
-#if R200_MERGED
-		FALLBACK(&r300->radeon, RADEON_FALLBACK_STENCIL, state);
-#endif
 	}
 }
 
@@ -737,7 +768,12 @@ static void r300ColorMask(GLcontext * ctx,
 static void r300PointSize(GLcontext * ctx, GLfloat size)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        /* same size limits for AA, non-AA points */
+
+	/* We need to clamp to user defined range here, because
+	 * the HW clamping happens only for per vertex point size. */
+	size = CLAMP(size, ctx->Point.MinSize, ctx->Point.MaxSize);
+
+	/* same size limits for AA, non-AA points */
 	size = CLAMP(size, ctx->Const.MinPointSize, ctx->Const.MaxPointSize);
 
 	R300_STATECHANGE(r300, ps);
@@ -830,29 +866,33 @@ static void r300ShadeModel(GLcontext * ctx, GLenum mode)
 
 	R300_STATECHANGE(rmesa, shade);
 	rmesa->hw.shade.cmd[1] = 0x00000002;
+	R300_STATECHANGE(rmesa, shade2);
 	switch (mode) {
 	case GL_FLAT:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_FLAT;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_FLAT;
 		break;
 	case GL_SMOOTH:
-		rmesa->hw.shade.cmd[2] = R300_RE_SHADE_MODEL_SMOOTH;
+		rmesa->hw.shade2.cmd[1] = R300_RE_SHADE_MODEL_SMOOTH;
 		break;
 	default:
 		return;
 	}
-	rmesa->hw.shade.cmd[3] = 0x00000000;
-	rmesa->hw.shade.cmd[4] = 0x00000000;
+	rmesa->hw.shade2.cmd[2] = 0x00000000;
+	rmesa->hw.shade2.cmd[3] = 0x00000000;
 }
 
 static void r300StencilFuncSeparate(GLcontext * ctx, GLenum face,
 				    GLenum func, GLint ref, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint refmask =
-	    ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
-	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
-	const unsigned back = ctx->Stencil._BackFace;
+	GLuint refmask;
 	GLuint flag;
+	const unsigned back = ctx->Stencil._BackFace;
+
+	r300CatchStencilFallback(ctx);
+
+	refmask = ((ctx->Stencil.Ref[0] & 0xff) << R300_STENCILREF_SHIFT)
+	     | ((ctx->Stencil.ValueMask[0] & 0xff) << R300_STENCILMASK_SHIFT);
 
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_0] |= R300_STENCIL_FRONT_BACK;
@@ -880,6 +920,8 @@ static void r300StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
+	r300CatchStencilFallback(ctx);
+
 	R300_STATECHANGE(rmesa, zs);
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_2] &=
 	    ~(R300_STENCILREF_MASK <<
@@ -896,6 +938,8 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	const unsigned back = ctx->Stencil._BackFace;
 
+	r300CatchStencilFallback(ctx);
+
 	R300_STATECHANGE(rmesa, zs);
 	/* It is easier to mask what's left.. */
 	rmesa->hw.zs.cmd[R300_ZS_CNTL_1] &=
@@ -924,28 +968,32 @@ static void r300StencilOpSeparate(GLcontext * ctx, GLenum face,
  * Window position and viewport transformation
  */
 
-/*
- * To correctly position primitives:
- */
-#define SUBPIXEL_X 0.125
-#define SUBPIXEL_Y 0.125
-
 static void r300UpdateWindow(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = rmesa->radeon.dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
 	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
+	const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+	const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+	GLfloat y_scale, y_bias;
+
+	if (render_to_fbo) {
+		y_scale = 1.0;
+		y_bias = 0;
+	} else {
+		y_scale = -1.0;
+		y_bias = yoffset;
+	}
 
 	GLfloat sx = v[MAT_SX];
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
-	GLfloat sy = -v[MAT_SY];
-	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
-	GLfloat sz = v[MAT_SZ] * rmesa->state.depth.scale;
-	GLfloat tz = v[MAT_TZ] * rmesa->state.depth.scale;
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat sy = v[MAT_SY] * y_scale;
+	GLfloat ty = (v[MAT_TY] * y_scale) + y_bias;
+	GLfloat sz = v[MAT_SZ] * depthScale;
+	GLfloat tz = v[MAT_TZ] * depthScale;
 
-	R300_FIREVERTICES(rmesa);
 	R300_STATECHANGE(rmesa, vpt);
 
 	rmesa->hw.vpt.cmd[R300_VPT_XSCALE] = r300PackFloat32(sx);
@@ -964,6 +1012,8 @@ static void r300Viewport(GLcontext * ctx, GLint x, GLint y,
 	 * values, or keep the originals hanging around.
 	 */
 	r300UpdateWindow(ctx);
+
+	radeon_viewport(ctx, x, y, width, height);
 }
 
 static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
@@ -974,13 +1024,13 @@ static void r300DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval)
 void r300UpdateViewportOffset(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	__DRIdrawablePrivate *dPriv = ((radeonContextPtr) rmesa)->dri.drawable;
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
 	GLfloat xoffset = (GLfloat) dPriv->x;
 	GLfloat yoffset = (GLfloat) dPriv->y + dPriv->h;
 	const GLfloat *v = ctx->Viewport._WindowMap.m;
 
-	GLfloat tx = v[MAT_TX] + xoffset + SUBPIXEL_X;
-	GLfloat ty = (-v[MAT_TY]) + yoffset + SUBPIXEL_Y;
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat ty = (-v[MAT_TY]) + yoffset;
 
 	if (rmesa->hw.vpt.cmd[R300_VPT_XOFFSET] != r300PackFloat32(tx) ||
 	    rmesa->hw.vpt.cmd[R300_VPT_YOFFSET] != r300PackFloat32(ty)) {
@@ -996,64 +1046,6 @@ void r300UpdateViewportOffset(GLcontext * ctx)
 	radeonUpdateScissor(ctx);
 }
 
-/**
- * Tell the card where to render (offset, pitch).
- * Effected by glDrawBuffer, etc
- */
-void r300UpdateDrawBuffer(GLcontext * ctx)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	r300ContextPtr r300 = rmesa;
-	struct gl_framebuffer *fb = ctx->DrawBuffer;
-	driRenderbuffer *drb;
-
-	if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-		/* draw to front */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].
-		    Renderbuffer;
-	} else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-		/* draw to back */
-		drb =
-		    (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].
-		    Renderbuffer;
-	} else {
-		/* drawing to multiple buffers, or none */
-		return;
-	}
-
-	assert(drb);
-	assert(drb->flippedPitch);
-
-	R300_STATECHANGE(rmesa, cb);
-
-	r300->hw.cb.cmd[R300_CB_OFFSET] = drb->flippedOffset +	//r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = drb->flippedPitch;	//r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-#if 0
-	R200_STATECHANGE(rmesa, ctx);
-
-	/* Note: we used the (possibly) page-flipped values */
-	rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
-	    = ((drb->flippedOffset + rmesa->r200Screen->fbLocation)
-	       & R200_COLOROFFSET_MASK);
-	rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
-
-	if (rmesa->sarea->tiling_enabled) {
-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
-		    R200_COLOR_TILE_ENABLE;
-	}
-#endif
-}
-
 static void
 r300FetchStateParameter(GLcontext * ctx,
 			const gl_state_index state[STATE_LENGTH],
@@ -1064,12 +1056,14 @@ r300FetchStateParameter(GLcontext * ctx,
 	switch (state[0]) {
 	case STATE_INTERNAL:
 		switch (state[1]) {
-		case STATE_R300_WINDOW_DIMENSION:
-			value[0] = r300->radeon.dri.drawable->w * 0.5f;	/* width*0.5 */
-			value[1] = r300->radeon.dri.drawable->h * 0.5f;	/* height*0.5 */
-			value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
-			value[3] = 1.0F;	/* not used */
-			break;
+		case STATE_R300_WINDOW_DIMENSION: {
+				__DRIdrawablePrivate * drawable = radeon_get_drawable(&r300->radeon);
+				value[0] = drawable->w * 0.5f;	/* width*0.5 */
+				value[1] = drawable->h * 0.5f;	/* height*0.5 */
+				value[2] = 0.5F;	/* for moving range [-1 1] -> [0 1] */
+				value[3] = 1.0F;	/* not used */
+				break;
+			}
 
 		case STATE_R300_TEXRECT_FACTOR:{
 				struct gl_texture_object *t =
@@ -1103,24 +1097,25 @@ r300FetchStateParameter(GLcontext * ctx,
  * Update R300's own internal state parameters.
  * For now just STATE_R300_WINDOW_DIMENSION
  */
-void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
+static void r300UpdateStateParameters(GLcontext * ctx, GLuint new_state)
 {
-	struct r300_fragment_program *fp;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	struct gl_program_parameter_list *paramList;
 	GLuint i;
 
-	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM)))
+	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS)))
 		return;
 
-	fp = (struct r300_fragment_program *)ctx->FragmentProgram._Current;
-	if (!fp)
+	if (!ctx->FragmentProgram._Current || !rmesa->selected_fp)
 		return;
 
-	paramList = fp->mesa_program.Base.Parameters;
+	paramList = rmesa->selected_fp->Base->Parameters;
 
 	if (!paramList)
 		return;
 
+	_mesa_load_state_parameters(ctx, paramList);
+
 	for (i = 0; i < paramList->NumParameters; i++) {
 		if (paramList->Parameters[i].Type == PROGRAM_STATE_VAR) {
 			r300FetchStateParameter(ctx,
@@ -1235,9 +1230,7 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int i;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r300_fragment_program_code *code = &fp->code;
+	struct r300_fragment_program_code *code = &r300->selected_fp->code.r300;
 
 	R300_STATECHANGE(r300, fpt);
 
@@ -1271,15 +1264,15 @@ static void r300SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 	}
 
 	r300->hw.fpt.cmd[R300_FPT_CMD_0] =
-		cmdpacket0(R300_US_TEX_INST_0, code->tex.length);
+		cmdpacket0(r300->radeon.radeonScreen,
+                   R300_US_TEX_INST_0, code->tex.length);
 }
 
 static void r500SetupFragmentShaderTextures(GLcontext *ctx, int *tmu_mappings)
 {
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int i;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
-	struct r500_fragment_program_code *code = &fp->code;
+	struct r500_fragment_program_code *code = &r300->selected_fp->code.r500;
 
 	/* find all the texture instructions and relocate the texture units */
 	for (i = 0; i < code->inst_end + 1; i++) {
@@ -1322,13 +1315,11 @@ static GLuint translate_lod_bias(GLfloat bias)
 static void r300SetupTextures(GLcontext * ctx)
 {
 	int i, mtu;
-	struct r300_tex_obj *t;
+	struct radeon_tex_obj *t;
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	int hw_tmu = 0;
 	int last_hw_tmu = -1;	/* -1 translates into no setup costs for fields */
 	int tmu_mappings[R300_MAX_TEXTURE_UNITS] = { -1, };
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
 
 	R300_STATECHANGE(r300, txe);
 	R300_STATECHANGE(r300, tex.filter);
@@ -1356,21 +1347,16 @@ static void r300SetupTextures(GLcontext * ctx)
 	/* We cannot let disabled tmu offsets pass DRM */
 	for (i = 0; i < mtu; i++) {
 		if (ctx->Texture.Unit[i]._ReallyEnabled) {
-
-#if 0				/* Enables old behaviour */
-			hw_tmu = i;
-#endif
 			tmu_mappings[i] = hw_tmu;
 
-			t = (r300TexObjPtr) r300->state.texture.unit[i].texobj->DriverData;
-			/* XXX questionable fix for bug 9170: */
+			t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
 			if (!t)
 				continue;
 
-			if ((t->format & 0xffffff00) == 0xffffff00) {
+			if ((t->pp_txformat & 0xffffff00) == 0xffffff00) {
 				WARN_ONCE
 				    ("unknown texture format (entry %x) encountered. Help me !\n",
-				     t->format & 0xff);
+				     t->pp_txformat & 0xff);
 			}
 
 			if (RADEON_DEBUG & DEBUG_STATE)
@@ -1381,29 +1367,28 @@ static void r300SetupTextures(GLcontext * ctx)
 
 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0 +
 						hw_tmu] =
-			    gen_fixed_filter(t->filter) | (hw_tmu << 28);
+			    gen_fixed_filter(t->pp_txfilter) | (hw_tmu << 28);
 			/* Note: There is a LOD bias per texture unit and a LOD bias
 			 * per texture object. We add them here to get the correct behaviour.
 			 * (The per-texture object LOD bias was introduced in OpenGL 1.4
 			 * and is not present in the EXT_texture_object extension).
 			 */
 			r300->hw.tex.filter_1.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-				t->filter_1 |
-				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.tObj->LodBias);
+				t->pp_txfilter_1 |
+				translate_lod_bias(ctx->Texture.Unit[i].LodBias + t->base.LodBias);
 			r300->hw.tex.size.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-			    t->size;
+			    t->pp_txsize;
 			r300->hw.tex.format.cmd[R300_TEX_VALUE_0 +
-						hw_tmu] = t->format;
+						hw_tmu] = t->pp_txformat;
 			r300->hw.tex.pitch.cmd[R300_TEX_VALUE_0 + hw_tmu] =
-			    t->pitch_reg;
-			r300->hw.tex.offset.cmd[R300_TEX_VALUE_0 +
-						hw_tmu] = t->offset;
+			  t->pp_txpitch;
+			r300->hw.textures[hw_tmu] = t;
 
-			if (t->offset & R300_TXO_MACRO_TILE) {
+			if (t->tile_bits & R300_TXO_MACRO_TILE) {
 				WARN_ONCE("macro tiling enabled!\n");
 			}
 
-			if (t->offset & R300_TXO_MICRO_TILE) {
+			if (t->tile_bits & R300_TXO_MICRO_TILE) {
 				WARN_ONCE("micro tiling enabled!\n");
 			}
 
@@ -1420,37 +1405,33 @@ static void r300SetupTextures(GLcontext * ctx)
 	}
 
 	r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER0_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, last_hw_tmu + 1);
 	r300->hw.tex.filter_1.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FILTER1_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER1_0, last_hw_tmu + 1);
 	r300->hw.tex.size.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_SIZE_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_SIZE_0, last_hw_tmu + 1);
 	r300->hw.tex.format.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT_0, last_hw_tmu + 1);
 	r300->hw.tex.pitch.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_FORMAT2_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_FORMAT2_0, last_hw_tmu + 1);
 	r300->hw.tex.offset.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_OFFSET_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_OFFSET_0, last_hw_tmu + 1);
 	r300->hw.tex.chroma_key.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_CHROMA_KEY_0, last_hw_tmu + 1);
 	r300->hw.tex.border_color.cmd[R300_TEX_CMD_0] =
-	    cmdpacket0(R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
-
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
+	    cmdpacket0(r300->radeon.radeonScreen, R300_TX_BORDER_COLOR_0, last_hw_tmu + 1);
 
 	if (r300->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV515) {
-		if (fp->mesa_program.UsesKill && last_hw_tmu < 0) {
+		if (ctx->FragmentProgram._Current->UsesKill && last_hw_tmu < 0) {
 			// The KILL operation requires the first texture unit
 			// to be enabled.
 			r300->hw.txe.cmd[R300_TXE_ENABLE] |= 1;
 			r300->hw.tex.filter.cmd[R300_TEX_VALUE_0] = 0;
 			r300->hw.tex.filter.cmd[R300_TEX_CMD_0] =
-				cmdpacket0(R300_TX_FILTER0_0, 1);
+				cmdpacket0(r300->radeon.radeonScreen, R300_TX_FILTER0_0, 1);
 		}
-		r300SetupFragmentShaderTextures(ctx, tmu_mappings);
-	} else
-		r500SetupFragmentShaderTextures(ctx, tmu_mappings);
+	}
+	r300->vtbl.SetupFragmentShaderTextures(ctx, tmu_mappings);
 
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "TX_ENABLE: %08x  last_hw_tmu=%d\n",
@@ -1469,26 +1450,21 @@ union r300_outputs_written {
 static void r300SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	union r300_outputs_written OutputsWritten;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
 	int col_ip, tex_ip;
 	int rs_tex_count = 0;
-	int i, count, col_fmt;
+	int i, col_fmt, hw_tcl_on;
+
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->Base->Base.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = r300->selected_fp->Base->InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
@@ -1507,15 +1483,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
 			++col_ip;
@@ -1527,15 +1495,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R300_RS_COL_PTR(col_ip) | R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R300_RS_INST_COL_ID(col_ip) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
 			++col_ip;
@@ -1545,6 +1505,7 @@ static void r300SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
+	/* We always route 4 texcoord components */
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
 		    continue;
@@ -1554,64 +1515,27 @@ static void r300SetupRSUnit(GLcontext * ctx)
 		    continue;
 		}
 
-		int swiz;
-
-		/* with TCL we always seem to route 4 components */
-		if (hw_tcl_on)
-		  count = 4;
-		else
-		  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-		switch(count) {
-		case 4: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3); break;
-		case 3: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		default:
-		case 1:
-		case 2: swiz = R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(R300_RS_SEL_K0) | R300_RS_SEL_Q(R300_RS_SEL_K1); break;
-		};
-
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz | R300_RS_TEX_PTR(rs_tex_count);
+		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) | R300_RS_TEX_PTR(rs_tex_count);
 		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
 		InputsRead &= ~(FRAG_BIT_TEX0 << i);
-		rs_tex_count += count;
-		++tex_ip;
-		++fp_reg;
-	}
-
-	if (InputsRead & FRAG_BIT_FOGC) {
-		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
-			r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
-			InputsRead &= ~FRAG_BIT_FOGC;
-			rs_tex_count += 4;
-			++tex_ip;
-			++fp_reg;
-		} else {
-			WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
-		}
-	}
-
-	if (InputsRead & FRAG_BIT_WPOS) {
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |=  R300_RS_SEL_S(0) | R300_RS_SEL_T(1) | R300_RS_SEL_R(2) | R300_RS_SEL_Q(3) |  R300_RS_TEX_PTR(rs_tex_count);
-		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R300_RS_INST_TEX_ID(tex_ip) | R300_RS_INST_TEX_CN_WRITE | R300_RS_INST_TEX_ADDR(fp_reg);
-		InputsRead &= ~FRAG_BIT_WPOS;
 		rs_tex_count += 4;
 		++tex_ip;
 		++fp_reg;
 	}
-	InputsRead &= ~FRAG_BIT_WPOS;
 
 	/* Setup default color if no color or tex was set */
 	if (rs_tex_count == 0 && col_ip == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_CN_WRITE | R300_RS_INST_COL_ADDR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		r300->hw.rr.cmd[R300_RR_INST_0] = R300_RS_INST_COL_ID(0) | R300_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R300_RS_COL_PTR(0) | R300_RS_COL_FMT(R300_RS_COL_FMT_0001);
 		++col_ip;
 	}
 
 	high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
-	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT) | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
 	r300->hw.rc.cmd[2] |= high_rr - 1;
 
-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R300_RS_INST_0, high_rr);
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R300_RS_IP_0, high_rr);
 
 	if (InputsRead)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
@@ -1620,26 +1544,21 @@ static void r300SetupRSUnit(GLcontext * ctx)
 static void r500SetupRSUnit(GLcontext * ctx)
 {
 	r300ContextPtr r300 = R300_CONTEXT(ctx);
-        TNLcontext *tnl = TNL_CONTEXT(ctx);
-	struct vertex_buffer *VB = &tnl->vb;
 	union r300_outputs_written OutputsWritten;
 	GLuint InputsRead;
 	int fp_reg, high_rr;
 	int col_ip, tex_ip;
 	int rs_tex_count = 0;
-	int i, count, col_fmt;
+	int i, col_fmt, hw_tcl_on;
+
+	hw_tcl_on = r300->options.hw_tcl_enabled;
 
 	if (hw_tcl_on)
-		OutputsWritten.vp_outputs = CURRENT_VERTEX_SHADER(ctx)->key.OutputsWritten;
+		OutputsWritten.vp_outputs = r300->selected_vp->Base->Base.OutputsWritten;
 	else
-		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->state.render_inputs_bitset);
+		RENDERINPUTS_COPY(OutputsWritten.index_bitset, r300->render_inputs_bitset);
 
-	if (ctx->FragmentProgram._Current)
-		InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-	else {
-		fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
-		return;		/* This should only ever happen once.. */
-	}
+	InputsRead = r300->selected_fp->Base->InputsRead;
 
 	R300_STATECHANGE(r300, ri);
 	R300_STATECHANGE(r300, rc);
@@ -1658,15 +1577,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL0) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL0, _TNL_ATTRIB_COLOR0)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR0]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL0;
 			++col_ip;
@@ -1678,15 +1589,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	if (InputsRead & FRAG_BIT_COL1) {
 		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_COL1, _TNL_ATTRIB_COLOR1)) {
-			count = VB->AttribPtr[_TNL_ATTRIB_COLOR1]->size;
-			if (count == 4)
-			    col_fmt = R300_RS_COL_FMT_RGBA;
-			else if (count == 3)
-			    col_fmt = R300_RS_COL_FMT_RGB1;
-			else
-			    col_fmt = R300_RS_COL_FMT_0001;
-
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(col_fmt);
+			r300->hw.ri.cmd[R300_RI_INTERP_0 + col_ip] = R500_RS_COL_PTR(col_ip) | R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
 			r300->hw.rr.cmd[R300_RR_INST_0 + col_ip] = R500_RS_INST_COL_ID(col_ip) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(fp_reg);
 			InputsRead &= ~FRAG_BIT_COL1;
 			++col_ip;
@@ -1696,7 +1599,7 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		}
 	}
 
-
+	/* We always route 4 texcoord components */
 	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
 		if (! ( InputsRead & FRAG_BIT_TEX(i) ) )
 		    continue;
@@ -1706,74 +1609,13 @@ static void r500SetupRSUnit(GLcontext * ctx)
 		    continue;
 		}
 
-		int swiz = 0;
-
-		/* with TCL we always seem to route 4 components */
-		if (hw_tcl_on)
-		  count = 4;
-		else
-		  count = VB->AttribPtr[_TNL_ATTRIB_TEX(i)]->size;
-
-		if (count == 4) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= (rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 3) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= (rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 2) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= (rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else if (count == 1) {
-			swiz |= (rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		} else {
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_S_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_T_SHIFT;
-			swiz |= R500_RS_IP_PTR_K0 << R500_RS_IP_TEX_PTR_R_SHIFT;
-			swiz |= R500_RS_IP_PTR_K1 << R500_RS_IP_TEX_PTR_Q_SHIFT;
-		}
-
-		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= swiz;
-		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
-		InputsRead &= ~(FRAG_BIT_TEX0 << i);
-		rs_tex_count += count;
-		++tex_ip;
-		++fp_reg;
-	}
-
-	if (InputsRead & FRAG_BIT_FOGC) {
-		if (R300_OUTPUTS_WRITTEN_TEST(OutputsWritten, VERT_RESULT_FOGC, _TNL_ATTRIB_FOG)) {
-			r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
-				((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
-				((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
-				((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
-
-			r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
-			InputsRead &= ~FRAG_BIT_FOGC;
-			rs_tex_count += 4;
-			++tex_ip;
-			++fp_reg;
-		} else {
-			WARN_ONCE("fragprog wants fogc, vp doesn't provide it\n");
-		}
-	}
-
-	if (InputsRead & FRAG_BIT_WPOS) {
 		r300->hw.ri.cmd[R300_RI_INTERP_0 + tex_ip] |= ((rs_tex_count + 0) << R500_RS_IP_TEX_PTR_S_SHIFT) |
-				((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
-				((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
-				((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
+			((rs_tex_count + 1) << R500_RS_IP_TEX_PTR_T_SHIFT) |
+			((rs_tex_count + 2) << R500_RS_IP_TEX_PTR_R_SHIFT) |
+			((rs_tex_count + 3) << R500_RS_IP_TEX_PTR_Q_SHIFT);
 
 		r300->hw.rr.cmd[R300_RR_INST_0 + tex_ip] |= R500_RS_INST_TEX_ID(tex_ip) | R500_RS_INST_TEX_CN_WRITE | R500_RS_INST_TEX_ADDR(fp_reg);
-		InputsRead &= ~FRAG_BIT_WPOS;
+		InputsRead &= ~(FRAG_BIT_TEX0 << i);
 		rs_tex_count += 4;
 		++tex_ip;
 		++fp_reg;
@@ -1781,72 +1623,25 @@ static void r500SetupRSUnit(GLcontext * ctx)
 
 	/* Setup default color if no color or tex was set */
 	if (rs_tex_count == 0 && col_ip == 0) {
-		r300->hw.rr.cmd[R300_RR_INST_0] |= R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_CN_WRITE | R500_RS_INST_COL_ADDR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
+		r300->hw.rr.cmd[R300_RR_INST_0] = R500_RS_INST_COL_ID(0) | R500_RS_INST_COL_ADDR(0);
+		r300->hw.ri.cmd[R300_RI_INTERP_0] = R500_RS_COL_PTR(0) | R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
 		++col_ip;
 	}
 
 	high_rr = (col_ip > tex_ip) ? col_ip : tex_ip;
-	r300->hw.rc.cmd[1] |= (rs_tex_count << R300_IT_COUNT_SHIFT)  | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
-	r300->hw.rc.cmd[2] |= 0xC0 | (high_rr - 1);
+	r300->hw.rc.cmd[1] = (rs_tex_count << R300_IT_COUNT_SHIFT) | (col_ip << R300_IC_COUNT_SHIFT) | R300_HIRES_EN;
+	r300->hw.rc.cmd[2] = 0xC0 | (high_rr - 1);
 
-	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(R500_RS_INST_0, high_rr);
+	r300->hw.rr.cmd[R300_RR_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_INST_0, high_rr);
+	r300->hw.ri.cmd[R300_RI_CMD_0] = cmdpacket0(r300->radeon.radeonScreen, R500_RS_IP_0, high_rr);
 
 	if (InputsRead)
 		WARN_ONCE("Don't know how to satisfy InputsRead=0x%08x\n", InputsRead);
 }
 
-
-
-
-#define bump_vpu_count(ptr, new_count)   do{\
-	drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr));\
-	int _nc=(new_count)/4; \
-	assert(_nc < 256); \
-	if(_nc>_p->vpu.count)_p->vpu.count=_nc;\
-	}while(0)
-
-static INLINE void r300SetupVertexProgramFragment(r300ContextPtr r300, int dest, struct r300_vertex_shader_fragment *vsf)
-{
-	int i;
-
-	if (vsf->length == 0)
-		return;
-
-	if (vsf->length & 0x3) {
-		fprintf(stderr, "VERTEX_SHADER_FRAGMENT must have length divisible by 4\n");
-		_mesa_exit(-1);
-	}
-
-	switch ((dest >> 8) & 0xf) {
-	case 0:
-		R300_STATECHANGE(r300, vpi);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpi.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-
-	case 2:
-		R300_STATECHANGE(r300, vpp);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vpp.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	case 4:
-		R300_STATECHANGE(r300, vps);
-		for (i = 0; i < vsf->length; i++)
-			r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (vsf->body.d[i]);
-		bump_vpu_count(r300->hw.vps.cmd, vsf->length + 4 * (dest & 0xff));
-		break;
-	default:
-		fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
-		_mesa_exit(-1);
-	}
-}
-
 #define MIN3(a, b, c)	((a) < (b) ? MIN2(a, c) : MIN2(b, c))
 
-
-static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 			GLuint output_count, GLuint temp_count)
 {
     int vtx_mem_size;
@@ -1870,7 +1665,7 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
     pvs_num_cntrls = MIN2(6, vtx_mem_size/temp_count);
 
     R300_STATECHANGE(rmesa, vap_cntl);
-    if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+    if (rmesa->options.hw_tcl_enabled) {
 	rmesa->hw.vap_cntl.cmd[R300_VAP_CNTL_INSTR] =
 	    (pvs_num_slots << R300_PVS_NUM_SLOTS_SHIFT) |
 	    (pvs_num_cntrls << R300_PVS_NUM_CNTLRS_SHIFT) |
@@ -1900,114 +1695,6 @@ static void r300VapCntl(r300ContextPtr rmesa, GLuint input_count,
 
 }
 
-static void r300SetupDefaultVertexProgram(r300ContextPtr rmesa)
-{
-	struct r300_vertex_shader_state *prog = &(rmesa->state.vertex_shader);
-	GLuint o_reg = 0;
-	GLuint i_reg = 0;
-	int i;
-	int inst_count = 0;
-	int param_count = 0;
-	int program_end = 0;
-
-	for (i = VERT_ATTRIB_POS; i < VERT_ATTRIB_MAX; i++) {
-		if (rmesa->state.sw_tcl_inputs[i] != -1) {
-			prog->program.body.i[program_end + 0] = PVS_OP_DST_OPERAND(VE_MULTIPLY, GL_FALSE, GL_FALSE, o_reg++, VSF_FLAG_ALL, PVS_DST_REG_OUT);
-			prog->program.body.i[program_end + 1] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_X, PVS_SRC_SELECT_Y, PVS_SRC_SELECT_Z, PVS_SRC_SELECT_W, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 2] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			prog->program.body.i[program_end + 3] = PVS_SRC_OPERAND(rmesa->state.sw_tcl_inputs[i], PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_SELECT_FORCE_1, PVS_SRC_REG_INPUT, VSF_FLAG_NONE);
-			program_end += 4;
-			i_reg++;
-		}
-	}
-
-	prog->program.length = program_end;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START,
-				       &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, i_reg, o_reg, 0);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	    (0 << R300_PVS_FIRST_INST_SHIFT) |
-	    (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	    (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	    (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	    (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	    (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-static int bit_count (int x)
-{
-    x = ((x & 0xaaaaaaaaU) >> 1) + (x & 0x55555555U);
-    x = ((x & 0xccccccccU) >> 2) + (x & 0x33333333U);
-    x = (x >> 16) + (x & 0xffff);
-    x = ((x & 0xf0f0) >> 4) + (x & 0x0f0f);
-    return (x >> 8) + (x & 0x00ff);
-}
-
-static void r300SetupRealVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_vertex_program *prog = (struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx);
-	int inst_count = 0;
-	int param_count = 0;
-
-	/* FIXME: r300SetupVertexProgramFragment */
-	R300_STATECHANGE(rmesa, vpp);
-	param_count =
-	    r300VertexProgUpdateParams(ctx,
-				       (struct r300_vertex_program_cont *)
-				       ctx->VertexProgram._Current,
-				       (float *)&rmesa->hw.vpp.
-				       cmd[R300_VPP_PARAM_0]);
-	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
-	param_count /= 4;
-
-	r300SetupVertexProgramFragment(rmesa, R300_PVS_CODE_START, &(prog->program));
-	inst_count = (prog->program.length / 4) - 1;
-
-	r300VapCntl(rmesa, bit_count(prog->key.InputsRead),
-		    bit_count(prog->key.OutputsWritten), prog->num_temporaries);
-
-	R300_STATECHANGE(rmesa, pvs);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] =
-	  (0 << R300_PVS_FIRST_INST_SHIFT) |
-	  (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
-	  (inst_count << R300_PVS_LAST_INST_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] =
-	  (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) |
-	  (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
-	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] =
-	  (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
-}
-
-static void r300SetupVertexProgram(r300ContextPtr rmesa)
-{
-	GLcontext *ctx = rmesa->radeon.glCtx;
-
-	/* Reset state, in case we don't use something */
-	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
-	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
-
-	/* Not sure why this doesnt work...
-	   0x400 area might have something to do with pixel shaders as it appears right after pfs programming.
-	   0x406 is set to { 0.0, 0.0, 1.0, 0.0 } most of the time but should change with smooth points and in other rare cases. */
-	//setup_vertex_shader_fragment(rmesa, 0x406, &unk4);
-	if (hw_tcl_on && ((struct r300_vertex_program *)CURRENT_VERTEX_SHADER(ctx))->translated) {
-		r300SetupRealVertexProgram(rmesa);
-	} else {
-		/* FIXME: This needs to be replaced by vertex shader generation code. */
-		r300SetupDefaultVertexProgram(rmesa);
-	}
-
-}
-
 /**
  * Enable/Disable states.
  *
@@ -2015,20 +1702,13 @@ static void r300SetupVertexProgram(r300ContextPtr rmesa)
  */
 static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 {
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "%s( %s = %s )\n", __FUNCTION__,
 			_mesa_lookup_enum_by_nr(cap),
 			state ? "GL_TRUE" : "GL_FALSE");
 
 	switch (cap) {
-	case GL_TEXTURE_1D:
-	case GL_TEXTURE_2D:
-	case GL_TEXTURE_3D:
-		/* empty */
-		break;
-	case GL_FOG:
-		/* empty */
-		break;
 	case GL_ALPHA_TEST:
 		r300SetAlphaState(ctx);
 		break;
@@ -2046,22 +1726,46 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 	case GL_CLIP_PLANE5:
 		r300SetClipPlaneState(ctx, cap, state);
 		break;
+	case GL_CULL_FACE:
+		r300UpdateCulling(ctx);
+		break;
 	case GL_DEPTH_TEST:
 		r300SetDepthState(ctx);
 		break;
-	case GL_STENCIL_TEST:
-		r300SetStencilState(ctx, state);
+	case GL_LINE_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_SMOOTH, ctx->Line.SmoothFlag);
 		break;
-	case GL_CULL_FACE:
-		r300UpdateCulling(ctx);
+	case GL_LINE_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_LINE_STIPPLE, ctx->Line.StippleFlag);
+		break;
+	case GL_POINT_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POINT_SMOOTH, ctx->Point.SmoothFlag);
+		break;
+	case GL_POLYGON_SMOOTH:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_SMOOTH, ctx->Polygon.SmoothFlag);
+		break;
+	case GL_POLYGON_STIPPLE:
+		if (rmesa->options.conformance_mode)
+			r300SwitchFallback(ctx, R300_FALLBACK_POLYGON_STIPPLE, ctx->Polygon.StippleFlag);
 		break;
 	case GL_POLYGON_OFFSET_POINT:
 	case GL_POLYGON_OFFSET_LINE:
 	case GL_POLYGON_OFFSET_FILL:
 		r300SetPolygonOffsetState(ctx, state);
 		break;
+	case GL_SCISSOR_TEST:
+		radeon_firevertices(&rmesa->radeon);
+		rmesa->radeon.state.scissor.enabled = state;
+		radeonUpdateScissor( ctx );
+		break;
+	case GL_STENCIL_TEST:
+		r300SetStencilState(ctx, state);
+		break;
 	default:
-		radeonEnable(ctx, cap, state);
 		break;
 	}
 }
@@ -2072,15 +1776,14 @@ static void r300Enable(GLcontext * ctx, GLenum cap, GLboolean state)
 static void r300ResetHwState(r300ContextPtr r300)
 {
 	GLcontext *ctx = r300->radeon.glCtx;
-	int has_tcl = 1;
+	int has_tcl;
 
-	if (!(r300->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		has_tcl = 0;
+	has_tcl = r300->options.hw_tcl_enabled;
 
 	if (RADEON_DEBUG & DEBUG_STATE)
 		fprintf(stderr, "%s\n", __FUNCTION__);
 
-	r300UpdateWindow(ctx);
+	radeon_firevertices(&r300->radeon);
 
 	r300ColorMask(ctx,
 		      ctx->Color.ColorMask[RCOMP],
@@ -2102,8 +1805,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300UpdateCulling(ctx);
 
-	r300UpdateTextureState(ctx);
-
 	r300SetBlendState(ctx);
 	r300SetLogicOpState(ctx);
 
@@ -2182,8 +1883,8 @@ static void r300ResetHwState(r300ContextPtr r300)
 	}
 
 	/* XXX: Enable anti-aliasing? */
-	r300->hw.gb_misc.cmd[R300_GB_MISC_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
-	r300->hw.gb_misc.cmd[R300_GB_MISC_SELECT] = 0;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_AA_CONFIG] = GB_AA_CONFIG_AA_DISABLE;
+	r300->hw.gb_misc2.cmd[R300_GB_MISC2_SELECT] = 0;
 
 	r300->hw.ga_point_s0.cmd[1] = r300PackFloat32(0.0);
 	r300->hw.ga_point_s0.cmd[2] = r300PackFloat32(0.0);
@@ -2242,20 +1943,6 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300BlendColor(ctx, ctx->Color.BlendColor);
 
-	/* Again, r300ClearBuffer uses this */
-	r300->hw.cb.cmd[R300_CB_OFFSET] =
-	    r300->radeon.state.color.drawOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-
-	if (r300->radeon.radeonScreen->cpp == 4)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-	else
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-
-	if (r300->radeon.sarea->tiling_enabled)
-		r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-
 	r300->hw.rb3d_dither_ctl.cmd[1] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[2] = 0;
 	r300->hw.rb3d_dither_ctl.cmd[3] = 0;
@@ -2268,44 +1955,18 @@ static void r300ResetHwState(r300ContextPtr r300)
 
 	r300->hw.rb3d_aaresolve_ctl.cmd[1] = 0;
 
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
-	r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
-
-	r300->hw.zb.cmd[R300_ZB_OFFSET] =
-	    r300->radeon.radeonScreen->depthOffset +
-	    r300->radeon.radeonScreen->fbLocation;
-	r300->hw.zb.cmd[R300_ZB_PITCH] = r300->radeon.radeonScreen->depthPitch;
-
-	if (r300->radeon.sarea->tiling_enabled) {
-		/* XXX: Turn off when clearing buffers ? */
-		r300->hw.zb.cmd[R300_ZB_PITCH] |= R300_DEPTHMACROTILE_ENABLE;
-
-		if (ctx->Visual.depthBits == 24)
-			r300->hw.zb.cmd[R300_ZB_PITCH] |=
-			    R300_DEPTHMICROTILE_TILED;
-	}
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[1] = 0x00000000;
+    r300->hw.rb3d_discard_src_pixel_lte_threshold.cmd[2] = 0xffffffff;
 
 	r300->hw.zb_depthclearvalue.cmd[1] = 0;
 
-	switch (ctx->Visual.depthBits) {
-	case 16:
-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_16BIT_INT_Z;
-		break;
-	case 24:
-		r300->hw.zstencil_format.cmd[1] = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-		break;
-	default:
-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n", ctx->Visual.depthBits);
-		_mesa_exit(-1);
-	}
-
 	r300->hw.zstencil_format.cmd[2] = R300_ZTOP_DISABLE;
 	r300->hw.zstencil_format.cmd[3] = 0x00000003;
 	r300->hw.zstencil_format.cmd[4] = 0x00000000;
 	r300SetEarlyZState(ctx);
 
-	r300->hw.unk4F30.cmd[1] = 0;
-	r300->hw.unk4F30.cmd[2] = 0;
+	r300->hw.zb_zmask.cmd[1] = 0;
+	r300->hw.zb_zmask.cmd[2] = 0;
 
 	r300->hw.zb_hiz_offset.cmd[1] = 0;
 
@@ -2319,49 +1980,59 @@ static void r300ResetHwState(r300ContextPtr r300)
 		r300->hw.vps.cmd[R300_VPS_ZERO_3] = 0;
 	}
 
-	r300->hw.all_dirty = GL_TRUE;
+	r300->radeon.hw.all_dirty = GL_TRUE;
 }
 
 void r300UpdateShaders(r300ContextPtr rmesa)
 {
-	GLcontext *ctx;
-	struct r300_vertex_program *vp;
-	int i;
+	GLcontext *ctx = rmesa->radeon.glCtx;
 
-	ctx = rmesa->radeon.glCtx;
+	/* should only happenen once, just after context is created */
+	/* TODO: shouldn't we fallback to sw here? */
+	if (!ctx->FragmentProgram._Current) {
+		_mesa_fprintf(stderr, "No ctx->FragmentProgram._Current!!\n");
+		return;
+	}
 
-	if (rmesa->NewGLState && hw_tcl_on) {
-		rmesa->NewGLState = 0;
+	{
+		struct r300_fragment_program *fp;
 
-		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
-			rmesa->temp_attrib[i] =
-			    TNL_CONTEXT(ctx)->vb.AttribPtr[i];
-			TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
-			    &rmesa->dummy_attrib[i];
-		}
+		fp = r300SelectFragmentShader(ctx);
+		if (!fp->translated)
+			r300TranslateFragmentShader(ctx, fp);
 
-		_tnl_UpdateFixedFunctionProgram(ctx);
+		r300SwitchFallback(ctx, R300_FALLBACK_FRAGMENT_PROGRAM, fp->error);
+	}
 
-		for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
-			TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
-			    rmesa->temp_attrib[i];
-		}
+	if (rmesa->options.hw_tcl_enabled) {
+		struct r300_vertex_program *vp;
 
-		r300SelectVertexShader(rmesa);
-		vp = (struct r300_vertex_program *)
-		    CURRENT_VERTEX_SHADER(ctx);
-		/*if (vp->translated == GL_FALSE)
-		   r300TranslateVertexShader(vp); */
-		if (vp->translated == GL_FALSE) {
-			fprintf(stderr, "Failing back to sw-tcl\n");
-			hw_tcl_on = future_hw_tcl_on = 0;
-			r300ResetHwState(rmesa);
-
-			r300UpdateStateParameters(ctx, _NEW_PROGRAM);
-			return;
+		if (rmesa->radeon.NewGLState) {
+			int i;
+			for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+				rmesa->temp_attrib[i] =
+				    TNL_CONTEXT(ctx)->vb.AttribPtr[i];
+				TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+				    &rmesa->dummy_attrib[i];
+			}
+
+			_tnl_UpdateFixedFunctionProgram(ctx);
+
+			for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) {
+				TNL_CONTEXT(ctx)->vb.AttribPtr[i] =
+				    rmesa->temp_attrib[i];
+			}
 		}
+
+		vp = r300SelectVertexShader(ctx);
+		if (!vp->translated)
+			r300TranslateVertexShader(vp);
+
+		r300SwitchFallback(ctx, R300_FALLBACK_VERTEX_PROGRAM, vp->error);
 	}
-	r300UpdateStateParameters(ctx, _NEW_PROGRAM);
+
+	r300UpdateStateParameters(ctx, _NEW_PROGRAM | _NEW_PROGRAM_CONSTANTS);
+	rmesa->radeon.NewGLState = 0;
 }
 
 static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
@@ -2385,35 +2056,23 @@ static const GLfloat *get_fragmentprogram_constant(GLcontext *ctx,
 }
 
 
-static void r300SetupPixelShader(r300ContextPtr rmesa)
+static void r300SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r300_fragment_program *fp = (struct r300_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = rmesa->selected_fp;
 	struct r300_fragment_program_code *code;
 	int i, k;
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
-	r300TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
-
-	r300SetupTextures(ctx);
+	code = &fp->code.r300;
 
 	R300_STATECHANGE(rmesa, fpi[0]);
 	R300_STATECHANGE(rmesa, fpi[1]);
 	R300_STATECHANGE(rmesa, fpi[2]);
 	R300_STATECHANGE(rmesa, fpi[3]);
-	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_INST_0, code->alu.length);
-	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_RGB_ADDR_0, code->alu.length);
-	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_INST_0, code->alu.length);
-	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
+	rmesa->hw.fpi[0].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_INST_0, code->alu.length);
+	rmesa->hw.fpi[1].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_RGB_ADDR_0, code->alu.length);
+	rmesa->hw.fpi[2].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_INST_0, code->alu.length);
+	rmesa->hw.fpi[3].cmd[R300_FPI_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_US_ALU_ALPHA_ADDR_0, code->alu.length);
 	for (i = 0; i < code->alu.length; i++) {
 		rmesa->hw.fpi[0].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst0;
 		rmesa->hw.fpi[1].cmd[R300_FPI_INSTR_0 + i] = code->alu.inst[i].inst1;
@@ -2444,10 +2103,9 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	}
 
 	R300_STATECHANGE(rmesa, fpp);
-	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(R300_PFS_PARAM_0_X, code->const_nr * 4);
+	rmesa->hw.fpp.cmd[R300_FPP_CMD_0] = cmdpacket0(rmesa->radeon.radeonScreen, R300_PFS_PARAM_0_X, code->const_nr * 4);
 	for (i = 0; i < code->const_nr; i++) {
-		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+		const GLfloat *constant = get_fragmentprogram_constant(ctx, fp->Base, code->constant[i]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat24(constant[0]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat24(constant[1]);
 		rmesa->hw.fpp.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat24(constant[2]);
@@ -2469,29 +2127,17 @@ static void r300SetupPixelShader(r300ContextPtr rmesa)
 	if(_nc>_p->r500fp.count)_p->r500fp.count=_nc;\
 } while(0)
 
-static void r500SetupPixelShader(r300ContextPtr rmesa)
+static void r500SetupPixelShader(GLcontext *ctx)
 {
-	GLcontext *ctx = rmesa->radeon.glCtx;
-	struct r500_fragment_program *fp = (struct r500_fragment_program *)
-	    (char *)ctx->FragmentProgram._Current;
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	struct r300_fragment_program *fp = rmesa->selected_fp;
 	int i;
 	struct r500_fragment_program_code *code;
 
-	if (!fp)		/* should only happenen once, just after context is created */
-		return;
-
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp.cmd)->r500fp.count = 0;
 	((drm_r300_cmd_header_t *) rmesa->hw.r500fp_const.cmd)->r500fp.count = 0;
 
-	r500TranslateFragmentShader(rmesa, fp);
-	if (!fp->translated) {
-		fprintf(stderr, "%s: No valid fragment shader, exiting\n",
-			__FUNCTION__);
-		return;
-	}
-	code = &fp->code;
-
-	r300SetupTextures(ctx);
+	code = &fp->code.r500;
 
 	R300_STATECHANGE(rmesa, fp);
 	rmesa->hw.fp.cmd[R500_FP_PIXSIZE] = code->max_temp_idx;
@@ -2520,59 +2166,93 @@ static void r500SetupPixelShader(r300ContextPtr rmesa)
 
 	R300_STATECHANGE(rmesa, r500fp_const);
 	for (i = 0; i < code->const_nr; i++) {
-		const GLfloat *constant = get_fragmentprogram_constant(ctx,
-			&fp->mesa_program.Base, code->constant[i]);
+		const GLfloat *constant = get_fragmentprogram_constant(ctx, fp->Base, code->constant[i]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 0] = r300PackFloat32(constant[0]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 1] = r300PackFloat32(constant[1]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 2] = r300PackFloat32(constant[2]);
 		rmesa->hw.r500fp_const.cmd[R300_FPP_PARAM_0 + 4 * i + 3] = r300PackFloat32(constant[3]);
 	}
 	bump_r500fp_const_count(rmesa->hw.r500fp_const.cmd, code->const_nr * 4);
-
 }
 
-void r300UpdateShaderStates(r300ContextPtr rmesa)
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten)
 {
-	GLcontext *ctx;
-	ctx = rmesa->radeon.glCtx;
+	r300ContextPtr rmesa = R300_CONTEXT( ctx );
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
+	int i, j, reg_count;
+	uint32_t *vir0 = &rmesa->hw.vir[0].cmd[1];
+	uint32_t *vir1 = &rmesa->hw.vir[1].cmd[1];
 
-	r300UpdateTextureState(ctx);
-	r300SetEarlyZState(ctx);
+	for (i = 0; i < R300_VIR_CMDSIZE-1; ++i)
+		vir0[i] = vir1[i] = 0;
 
-	/* w_fmt value is set to get best performance
-	 * see p.130 R5xx 3D acceleration guide v1.3 */
-	GLuint w_fmt, fgdepthsrc;
-	if (current_fragment_program_writes_depth(ctx)) {
-		fgdepthsrc = R300_FG_DEPTH_SRC_SHADER;
-		w_fmt = R300_W_FMT_W24 | R300_W_SRC_US;
-	} else {
-		fgdepthsrc = R300_FG_DEPTH_SRC_SCAN;
-		w_fmt = R300_W_FMT_W0 | R300_W_SRC_US;
+	for (i = 0, j = 0; i < rmesa->vbuf.num_attribs; ++i) {
+		int tmp;
+
+		tmp = attrs[i].data_type | (attrs[i].dst_loc << R300_DST_VEC_LOC_SHIFT);
+		if (attrs[i]._signed)
+			tmp |= R300_SIGNED;
+		if (attrs[i].normalize)
+			tmp |= R300_NORMALIZE;
+
+		if (i % 2 == 0) {
+			vir0[j] = tmp << R300_DATA_TYPE_0_SHIFT;
+			vir1[j] = attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT);
+		} else {
+			vir0[j] |= tmp << R300_DATA_TYPE_1_SHIFT;
+			vir1[j] |= (attrs[i].swizzle | (attrs[i].write_mask << R300_WRITE_ENA_SHIFT)) << R300_SWIZZLE1_SHIFT;
+			++j;
+		}
 	}
 
-	if (w_fmt != rmesa->hw.us_out_fmt.cmd[5]) {
-		R300_STATECHANGE(rmesa, us_out_fmt);
-		rmesa->hw.us_out_fmt.cmd[5] = w_fmt;
+	reg_count = (rmesa->vbuf.num_attribs + 1) >> 1;
+	if (rmesa->vbuf.num_attribs % 2 != 0) {
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_0_SHIFT;
+	} else {
+		vir0[reg_count-1] |= R300_LAST_VEC << R300_DATA_TYPE_1_SHIFT;
 	}
 
-	if (fgdepthsrc != rmesa->hw.fg_depth_src.cmd[1]) {
-		R300_STATECHANGE(rmesa, fg_depth_src);
-		rmesa->hw.fg_depth_src.cmd[1] = fgdepthsrc;
+	R300_STATECHANGE(rmesa, vir[0]);
+	R300_STATECHANGE(rmesa, vir[1]);
+	R300_STATECHANGE(rmesa, vof);
+	R300_STATECHANGE(rmesa, vic);
+
+	if (rmesa->radeon.radeonScreen->kernel_mm) {
+		rmesa->hw.vir[0].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[1].cmd[0] &= 0xC000FFFF;
+		rmesa->hw.vir[0].cmd[0] |= (reg_count & 0x3FFF) << 16;
+		rmesa->hw.vir[1].cmd[0] |= (reg_count & 0x3FFF) << 16;
+	} else {
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count = reg_count;
+		((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count = reg_count;
 	}
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupPixelShader(rmesa);
-	else
-		r300SetupPixelShader(rmesa);
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
+	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
+	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = r300VAPOutputCntl1(ctx, OutputsWritten);
+}
 
-	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515)
-		r500SetupRSUnit(ctx);
-	else
-		r300SetupRSUnit(ctx);
+void r300UpdateShaderStates(r300ContextPtr rmesa)
+{
+	GLcontext *ctx;
+	ctx = rmesa->radeon.glCtx;
 
-	if ((rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL))
-		r300SetupVertexProgram(rmesa);
+	/* should only happenen once, just after context is created */
+	if (!ctx->FragmentProgram._Current)
+		return;
+
+	r300SetEarlyZState(ctx);
 
+	r300SetupTextures(ctx);
+
+	rmesa->vtbl.SetupPixelShader(ctx);
+
+	rmesa->vtbl.SetupRSUnit(ctx);
+
+	if (rmesa->options.hw_tcl_enabled) {
+		r300SetupVertexProgram(rmesa);
+	}
 }
 
 /**
@@ -2586,15 +2266,16 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
 	_swsetup_InvalidateState(ctx, new_state);
 	_vbo_InvalidateState(ctx, new_state);
 	_tnl_InvalidateState(ctx, new_state);
-	_ae_invalidate_state(ctx, new_state);
 
-	if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
-		r300UpdateDrawBuffer(ctx);
-	}
+	if (new_state & _NEW_BUFFERS) {
+		_mesa_update_framebuffer(ctx);
+		/* this updates the DrawBuffer's Width/Height if it's a FBO */
+		_mesa_update_draw_buffer_bounds(ctx);
 
-	r300UpdateStateParameters(ctx, new_state);
+		R300_STATECHANGE(r300, cb);
+	}
 
-	r300->NewGLState |= new_state;
+	r300->radeon.NewGLState |= new_state;
 }
 
 /**
@@ -2604,58 +2285,12 @@ static void r300InvalidateState(GLcontext * ctx, GLuint new_state)
  */
 void r300InitState(r300ContextPtr r300)
 {
-	GLcontext *ctx = r300->radeon.glCtx;
-	GLuint depth_fmt;
-
-	radeonInitState(&r300->radeon);
-
-	switch (ctx->Visual.depthBits) {
-	case 16:
-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffff;
-		depth_fmt = R300_DEPTHFORMAT_16BIT_INT_Z;
-		break;
-	case 24:
-		r300->state.depth.scale = 1.0 / (GLfloat) 0xffffff;
-		depth_fmt = R300_DEPTHFORMAT_24BIT_INT_Z_8BIT_STENCIL;
-		break;
-	default:
-		fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
-			ctx->Visual.depthBits);
-		_mesa_exit(-1);
-	}
-
-	/* Only have hw stencil when depth buffer is 24 bits deep */
-	r300->state.stencil.hw_stencil = (ctx->Visual.stencilBits > 0 &&
-					  ctx->Visual.depthBits == 24);
-
-	memset(&(r300->state.texture), 0, sizeof(r300->state.texture));
-
 	r300ResetHwState(r300);
 }
 
 static void r300RenderMode(GLcontext * ctx, GLenum mode)
 {
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	(void)rmesa;
-	(void)mode;
-}
-
-void r300UpdateClipPlanes( GLcontext *ctx )
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	GLuint p;
-
-	for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
-		if (ctx->Transform.ClipPlanesEnabled & (1 << p)) {
-			GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
-
-			R300_STATECHANGE( rmesa, vpucp[p] );
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_X] = ip[0];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Y] = ip[1];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_Z] = ip[2];
-			rmesa->hw.vpucp[p].cmd[R300_VPUCP_W] = ip[3];
-		}
-	}
+	r300SwitchFallback(ctx, R300_FALLBACK_RENDER_MODE, ctx->RenderMode != GL_RENDER);
 }
 
 /**
@@ -2663,7 +2298,6 @@ void r300UpdateClipPlanes( GLcontext *ctx )
  */
 void r300InitStateFuncs(struct dd_function_table *functions)
 {
-	radeonInitStateFuncs(functions);
 
 	functions->UpdateState = r300InvalidateState;
 	functions->AlphaFunc = r300AlphaFunc;
@@ -2699,4 +2333,25 @@ void r300InitStateFuncs(struct dd_function_table *functions)
 	functions->RenderMode = r300RenderMode;
 
 	functions->ClipPlane = r300ClipPlane;
+	functions->Scissor = radeonScissor;
+
+	functions->DrawBuffer		= radeonDrawBuffer;
+	functions->ReadBuffer		= radeonReadBuffer;
+}
+
+void r300InitShaderFunctions(r300ContextPtr r300)
+{
+	if (r300->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+		r300->vtbl.SetupRSUnit = r500SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r500SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r500SetupFragmentShaderTextures;
+		r300->vtbl.BuildFragmentProgramHwCode = r500BuildFragmentProgramHwCode;
+		r300->vtbl.FragmentProgramDump = r500FragmentProgramDump;
+	} else {
+		r300->vtbl.SetupRSUnit = r300SetupRSUnit;
+		r300->vtbl.SetupPixelShader = r300SetupPixelShader;
+		r300->vtbl.SetupFragmentShaderTextures = r300SetupFragmentShaderTextures;
+		r300->vtbl.BuildFragmentProgramHwCode = r300BuildFragmentProgramHwCode;
+		r300->vtbl.FragmentProgramDump = r300FragmentProgramDump;
+	}
 }
diff --git a/src/mesa/drivers/dri/r300/r300_state.h b/src/mesa/drivers/dri/r300/r300_state.h
index 0589ab7cad9..d46bf9f1796 100644
--- a/src/mesa/drivers/dri/r300/r300_state.h
+++ b/src/mesa/drivers/dri/r300/r300_state.h
@@ -39,42 +39,24 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define R300_NEWPRIM( rmesa )			\
   do {						\
-    if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa );		\
+  if ( rmesa->radeon.dma.flush )			\
+    rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
   } while (0)
 
 #define R300_STATECHANGE(r300, atom) \
 	do {						\
 	  R300_NEWPRIM(r300);				\
 		r300->hw.atom.dirty = GL_TRUE;		\
-		r300->hw.is_dirty = GL_TRUE;		\
+		r300->radeon.hw.is_dirty = GL_TRUE;		\
 	} while(0)
 
-#define R300_PRINT_STATE(r300, atom) \
-		r300PrintStateAtom(r300, &r300->hw.atom)
-
-/* Fire the buffered vertices no matter what.
-   TODO: This has not been implemented yet
- */
-#define R300_FIREVERTICES( r300 )			\
-do {							\
-    \
-   if ( (r300)->cmdbuf.count_used || (r300)->dma.flush ) {	\
-      r300Flush( (r300)->radeon.glCtx );		\
-   }							\
-    \
-} while (0)
-
-// r300_state.c
-extern int future_hw_tcl_on;
-void _tnl_UpdateFixedFunctionProgram (GLcontext * ctx);
 void r300UpdateViewportOffset (GLcontext * ctx);
 void r300UpdateDrawBuffer (GLcontext * ctx);
-void r300UpdateStateParameters (GLcontext * ctx, GLuint new_state);
 void r300UpdateShaders (r300ContextPtr rmesa);
 void r300UpdateShaderStates (r300ContextPtr rmesa);
 void r300InitState (r300ContextPtr r300);
-void r300UpdateClipPlanes (GLcontext * ctx);
 void r300InitStateFuncs (struct dd_function_table *functions);
+void r300VapCntl(r300ContextPtr rmesa, GLuint input_count, GLuint output_count, GLuint temp_count);
+void r300SetupVAP(GLcontext *ctx, GLuint InputsRead, GLuint OutputsWritten);
 
 #endif				/* __R300_STATE_H__ */
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.c b/src/mesa/drivers/dri/r300/r300_swtcl.c
index ba3621b16b2..56ed519cf41 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.c
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.c
@@ -28,362 +28,231 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 /*
  * Authors:
  *   Dave Airlie <[email protected]>
+ *   Maciej Cencora <[email protected]>
  */
 
-/* derived from r200 swtcl path */
-
-
-
-#include "main/glheader.h"
-#include "main/mtypes.h"
-#include "main/colormac.h"
-#include "main/enums.h"
-#include "main/image.h"
-#include "main/imports.h"
-#include "main/light.h"
-#include "main/macros.h"
-
-#include "swrast/s_context.h"
-#include "swrast/s_fog.h"
-#include "swrast_setup/swrast_setup.h"
-#include "math/m_translate.h"
 #include "tnl/tnl.h"
-#include "tnl/t_context.h"
 #include "tnl/t_pipeline.h"
 
-#include "r300_context.h"
-#include "r300_swtcl.h"
 #include "r300_state.h"
-#include "r300_ioctl.h"
+#include "r300_swtcl.h"
 #include "r300_emit.h"
-#include "r300_mem.h"
-
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  );
+#include "r300_tex.h"
+#include "r300_render.h"
 
-
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset);
-void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr);
 #define EMIT_ATTR( ATTR, STYLE )					\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
 #define EMIT_PAD( N )							\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
+} while (0)
+
+#define ADD_ATTR(_attr, _format, _dst_loc, _swizzle, _write_mask, _normalize) \
+do { \
+	attrs[num_attrs].element = (_attr); \
+	attrs[num_attrs].data_type = (_format); \
+	attrs[num_attrs].dst_loc = (_dst_loc); \
+	attrs[num_attrs].swizzle = (_swizzle); \
+	attrs[num_attrs].write_mask = (_write_mask); \
+	attrs[num_attrs]._signed = 0; \
+	attrs[num_attrs].normalize = (_normalize); \
+	++num_attrs; \
 } while (0)
 
-static void r300SetVertexFormat( GLcontext *ctx )
+void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *_InputsRead,  GLuint *_OutputsWritten)
 {
 	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 	TNLcontext *tnl = TNL_CONTEXT(ctx);
 	struct vertex_buffer *VB = &tnl->vb;
-	DECLARE_RENDERINPUTS(index_bitset);
-	GLuint InputsRead = 0, OutputsWritten = 0;
-	int vap_fmt_1 = 0;
-	int offset = 0;
-	int vte = 0;
-	int fog_id;
-	GLint inputs[VERT_ATTRIB_MAX];
-	GLint tab[VERT_ATTRIB_MAX];
-	int swizzle[VERT_ATTRIB_MAX][4];
-	GLuint i, nr;
-	GLuint sz;
-
-	DECLARE_RENDERINPUTS(render_inputs_bitset);
-	RENDERINPUTS_COPY(render_inputs_bitset, tnl->render_inputs_bitset);
-	RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
-	RENDERINPUTS_COPY(rmesa->state.render_inputs_bitset, render_inputs_bitset);
-
-	vte = rmesa->hw.vte.cmd[1];
-	vte &= ~(R300_VTX_XY_FMT | R300_VTX_Z_FMT | R300_VTX_W0_FMT);
-	/* Important:
-	 */
-	if ( VB->NdcPtr != NULL ) {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->NdcPtr;
-		vte |= R300_VTX_XY_FMT | R300_VTX_Z_FMT;
-	}
-	else {
-		VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
-		vte |= R300_VTX_W0_FMT;
-	}
-
-	assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-	rmesa->swtcl.vertex_attr_count = 0;
-
-	/* EMIT_ATTR's must be in order as they tell t_vertex.c how to
-	 * build up a hardware vertex.
-	 */
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POS)) {
-		sz = VB->AttribPtr[VERT_ATTRIB_POS]->size;
-		InputsRead |= 1 << VERT_ATTRIB_POS;
-		OutputsWritten |= 1 << VERT_RESULT_HPOS;
-		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_1F + sz - 1 );
-		offset = sz;
-	} else {
-		offset = 4;
-		EMIT_PAD(4 * sizeof(float));
-	}
-/*
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POINTSIZE )) {
-		EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
-		offset += 1;
-	}
-*/
-	if (RENDERINPUTS_TEST(index_bitset, _TNL_ATTRIB_COLOR0)) {
-		sz = VB->AttribPtr[VERT_ATTRIB_COLOR0]->size;
-	        rmesa->swtcl.coloroffset = offset;
+	int first_free_tex = 0;
+	GLuint InputsRead = 0;
+	GLuint OutputsWritten = 0;
+	int num_attrs = 0;
+	GLuint fp_reads = rmesa->selected_fp->Base->InputsRead;
+	struct vertex_attribute *attrs = rmesa->vbuf.attribs;
+
+	rmesa->swtcl.coloroffset = rmesa->swtcl.specoffset = 0;
+	rmesa->radeon.swtcl.vertex_attr_count = 0;
+
+	/* We always want non Ndc coords format */
+	VB->AttribPtr[VERT_ATTRIB_POS] = VB->ClipPtr;
+
+	/* Always write position vector */
+	InputsRead |= 1 << VERT_ATTRIB_POS;
+	OutputsWritten |= 1 << VERT_RESULT_HPOS;
+	EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F );
+	ADD_ATTR(VERT_ATTRIB_POS, R300_DATA_TYPE_FLOAT_4, SWTCL_OVM_POS, SWIZZLE_XYZW, MASK_XYZW, 0);
+	rmesa->swtcl.coloroffset = 4;
+
+	if (fp_reads & FRAG_BIT_COL0) {
 		InputsRead |= 1 << VERT_ATTRIB_COLOR0;
 		OutputsWritten |= 1 << VERT_RESULT_COL0;
-		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_1F + sz - 1 );
-		offset += sz;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_COLOR0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR0, SWIZZLE_XYZW, MASK_XYZW, 1);
+#endif
 	}
 
-	rmesa->swtcl.specoffset = 0;
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR1 )) {
-		sz = VB->AttribPtr[VERT_ATTRIB_COLOR1]->size;
-		rmesa->swtcl.specoffset = offset;
-		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_1F + sz - 1 );
+	if (fp_reads & FRAG_BIT_COL1) {
+		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
 		InputsRead |= 1 << VERT_ATTRIB_COLOR1;
 		OutputsWritten |= 1 << VERT_RESULT_COL1;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_COLOR1, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_COLOR1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR1, swiz, MASK_XYZW, 1);
+#endif
+		rmesa->swtcl.specoffset = rmesa->swtcl.coloroffset + 1;
 	}
 
-	fog_id = -1;
-	if (RENDERINPUTS_TEST(index_bitset, _TNL_ATTRIB_FOG)) {
-		/* find first free tex coord slot */
-		if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
-			int i;
-			for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-				if (!RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-					fog_id = i;
-					break;
-				}
-			}
-		} else {
-			fog_id = 0;
+	if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
+		VB->AttribPtr[VERT_ATTRIB_GENERIC0] = VB->ColorPtr[1];
+		OutputsWritten |= 1 << VERT_RESULT_BFC0;
+#if MESA_LITTLE_ENDIAN
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_RGBA );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#else
+		EMIT_ATTR( _TNL_ATTRIB_GENERIC0, EMIT_4UB_4F_ABGR );
+		ADD_ATTR(VERT_ATTRIB_GENERIC0, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR2, SWIZZLE_XYZW, MASK_XYZW, 1);
+#endif
+		if (fp_reads & FRAG_BIT_COL1) {
+			VB->AttribPtr[VERT_ATTRIB_GENERIC1] = VB->SecondaryColorPtr[1];
+			GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+			OutputsWritten |= 1 << VERT_RESULT_BFC1;
+#if MESA_LITTLE_ENDIAN
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_RGBA );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#else
+			EMIT_ATTR( _TNL_ATTRIB_GENERIC1, EMIT_4UB_4F_ABGR );
+			ADD_ATTR(VERT_ATTRIB_GENERIC1, R300_DATA_TYPE_BYTE, SWTCL_OVM_COLOR3, swiz, MASK_XYZW, 1);
+#endif
 		}
+	}
 
-		if (fog_id == -1) {
-			fprintf(stderr, "\tout of free texcoords to do fog\n");
-			_mesa_exit(-1);
-		}
+	if (RENDERINPUTS_TEST(tnl->render_inputs_bitset, _TNL_ATTRIB_POINTSIZE )) {
+		GLuint swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO);
+		InputsRead |= 1 << VERT_ATTRIB_POINT_SIZE;
+		OutputsWritten |= 1 << VERT_RESULT_PSIZ;
+		EMIT_ATTR( _TNL_ATTRIB_POINTSIZE, EMIT_1F );
+		ADD_ATTR(VERT_ATTRIB_POINT_SIZE, R300_DATA_TYPE_FLOAT_1, SWTCL_OVM_POINT_SIZE, swiz, MASK_X, 0);
+	}
+
+	if (rmesa->selected_fp->wpos_attr != FRAG_ATTRIB_MAX) {
+		int tex_id = rmesa->selected_fp->wpos_attr - FRAG_ATTRIB_TEX0;
 
-		sz = VB->AttribPtr[VERT_ATTRIB_FOG]->size;
-		EMIT_ATTR( _TNL_ATTRIB_FOG, EMIT_1F + sz - 1);
-		InputsRead |= 1 << VERT_ATTRIB_FOG;
-		OutputsWritten |= 1 << VERT_RESULT_FOGC;
-		vap_fmt_1 |= sz << (3 * fog_id);
+		VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
+		VB->TexCoordPtr[tex_id] = VB->AttribPtr[VERT_ATTRIB_POS];
+		RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
 	}
 
-	if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
-		int i;
+	if (rmesa->selected_fp->fog_attr != FRAG_ATTRIB_MAX) {
+		int tex_id = rmesa->selected_fp->fog_attr - FRAG_ATTRIB_TEX0;
 
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-			if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-				sz = VB->TexCoordPtr[i]->size;
-				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
-				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-				EMIT_ATTR( _TNL_ATTRIB_TEX0+i, EMIT_1F + sz - 1 );
-				vap_fmt_1 |= sz << (3 * i);
-			}
-		}
+		VB->AttribPtr[VERT_ATTRIB_TEX0 + tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
+		VB->TexCoordPtr[tex_id] = VB->AttribPtr[VERT_ATTRIB_FOG];
+		RENDERINPUTS_SET(tnl->render_inputs_bitset, _TNL_ATTRIB_TEX0 + tex_id);
 	}
 
-	/* RS can't put fragment position on the pixel stack, so stuff it in texcoord if needed */
-	if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_POS) && (ctx->FragmentProgram._Current->Base.InputsRead & FRAG_BIT_WPOS)) {
-		int first_free_tex = -1;
-		if (fog_id >= 0) {
-			first_free_tex = fog_id+1;
-		} else {
-			if (RENDERINPUTS_TEST_RANGE( index_bitset, _TNL_FIRST_TEX, _TNL_LAST_TEX )) {
-				int i;
-				for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-					if (!RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_TEX(i) )) {
-						first_free_tex = i;
+	/**
+	 *  Sending only one texcoord component may lead to lock up,
+	 *  so for all textures always output 4 texcoord components to RS.
+	 */
+	{
+		int i;
+		GLuint swiz, format, hw_format;
+		for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
+			if (fp_reads & FRAG_BIT_TEX(i)) {
+				switch (VB->TexCoordPtr[i]->size) {
+					case 1:
+						format = EMIT_1F;
+						hw_format = R300_DATA_TYPE_FLOAT_1;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ONE);
+						break;
+					case 2:
+						format = EMIT_2F;
+						hw_format = R300_DATA_TYPE_FLOAT_2;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_ZERO, SWIZZLE_ONE);
+						break;
+					case 3:
+						format = EMIT_3F;
+						hw_format = R300_DATA_TYPE_FLOAT_3;
+						swiz = MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
+						break;
+					case 4:
+						format = EMIT_4F;
+						hw_format = R300_DATA_TYPE_FLOAT_4;
+						swiz = SWIZZLE_XYZW;
 						break;
-					}
+					default:
+						continue;
 				}
-			} else {
-				first_free_tex = 0;
+				InputsRead |= 1 << (VERT_ATTRIB_TEX0 + i);
+				OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
+				EMIT_ATTR(_TNL_ATTRIB_TEX(i), format);
+				ADD_ATTR(VERT_ATTRIB_TEX0 + i, hw_format, SWTCL_OVM_TEX(first_free_tex), swiz, MASK_XYZW, 0);
+				++first_free_tex;
 			}
 		}
-
-		if (first_free_tex == -1) {
-			fprintf(stderr, "\tout of free texcoords to write w pos\n");
-			_mesa_exit(-1);
-		}
-
-		sz = VB->AttribPtr[VERT_ATTRIB_POS]->size;
-		InputsRead |= 1 << (VERT_ATTRIB_TEX0 + first_free_tex);
-		OutputsWritten |= 1 << (VERT_RESULT_TEX0 + first_free_tex);
-		EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_1F + sz - 1 );
-		vap_fmt_1 |= sz << (3 * first_free_tex);
-	}
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			inputs[i] = nr++;
-		} else {
-			inputs[i] = -1;
-		}
 	}
 
-	/* Fixed, apply to vir0 only */
-	if (InputsRead & (1 << VERT_ATTRIB_POS))
-		inputs[VERT_ATTRIB_POS] = 0;
-	if (InputsRead & (1 << VERT_ATTRIB_COLOR0))
-		inputs[VERT_ATTRIB_COLOR0] = 2;
-	if (InputsRead & (1 << VERT_ATTRIB_COLOR1))
-		inputs[VERT_ATTRIB_COLOR1] = 3;
-	if (InputsRead & (1 << VERT_ATTRIB_FOG))
-		inputs[VERT_ATTRIB_FOG] = 6 + fog_id;
-	for (i = VERT_ATTRIB_TEX0; i <= VERT_ATTRIB_TEX7; i++)
-		if (InputsRead & (1 << i))
-			inputs[i] = 6 + (i - VERT_ATTRIB_TEX0);
-
-	for (i = 0, nr = 0; i < VERT_ATTRIB_MAX; i++) {
-		if (InputsRead & (1 << i)) {
-			tab[nr++] = i;
-		}
-	}
-
-	for (i = 0; i < nr; i++) {
-		int ci;
-
-		swizzle[i][0] = SWIZZLE_ZERO;
-		swizzle[i][1] = SWIZZLE_ZERO;
-		swizzle[i][2] = SWIZZLE_ZERO;
-		swizzle[i][3] = SWIZZLE_ONE;
-
-		for (ci = 0; ci < VB->AttribPtr[tab[i]]->size; ci++) {
-			swizzle[i][ci] = ci;
-		}
+	if (first_free_tex >= ctx->Const.MaxTextureUnits) {
+		fprintf(stderr, "\tout of free texcoords to write fog coordinate\n");
+		_mesa_exit(-1);
 	}
 
 	R300_NEWPRIM(rmesa);
-	R300_STATECHANGE(rmesa, vir[0]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[0].cmd)->packet0.count =
-		r300VAPInputRoute0(&rmesa->hw.vir[0].cmd[R300_VIR_CNTL_0],
-				   VB->AttribPtr, inputs, tab, nr);
-	R300_STATECHANGE(rmesa, vir[1]);
-	((drm_r300_cmd_header_t *) rmesa->hw.vir[1].cmd)->packet0.count =
-		r300VAPInputRoute1(&rmesa->hw.vir[1].cmd[R300_VIR_CNTL_0], swizzle,
-				   nr);
-
-	R300_STATECHANGE(rmesa, vic);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_0] = r300VAPInputCntl0(ctx, InputsRead);
-	rmesa->hw.vic.cmd[R300_VIC_CNTL_1] = r300VAPInputCntl1(ctx, InputsRead);
-
-	R300_STATECHANGE(rmesa, vof);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_0] = r300VAPOutputCntl0(ctx, OutputsWritten);
-	rmesa->hw.vof.cmd[R300_VOF_CNTL_1] = vap_fmt_1;
-
-	rmesa->swtcl.vertex_size =
-		_tnl_install_attrs( ctx,
-				    rmesa->swtcl.vertex_attrs,
-				    rmesa->swtcl.vertex_attr_count,
-				    NULL, 0 );
-
-	rmesa->swtcl.vertex_size /= 4;
+	rmesa->vbuf.num_attribs = num_attrs;
+	*_InputsRead = InputsRead;
+	*_OutputsWritten = OutputsWritten;
 
-	RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
-
-
-	R300_STATECHANGE(rmesa, vte);
-	rmesa->hw.vte.cmd[1] = vte;
-	rmesa->hw.vte.cmd[2] = rmesa->swtcl.vertex_size;
+	RENDERINPUTS_COPY(rmesa->render_inputs_bitset, tnl->render_inputs_bitset);
 }
 
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( r300ContextPtr rmesa  )
+static void r300PrepareVertices(GLcontext *ctx)
 {
-	if (RADEON_DEBUG & DEBUG_IOCTL)
-		fprintf(stderr, "%s\n", __FUNCTION__);
-
-	rmesa->dma.flush = NULL;
-
-	if (rmesa->dma.current.buf) {
-		struct r300_dma_region *current = &rmesa->dma.current;
-		GLuint current_offset = GET_START(current);
-
-		assert (current->start +
-			rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-			current->ptr);
-
-		if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-
-			r300EnsureCmdBufSpace( rmesa, rmesa->hw.max_state_size + (12*sizeof(int)), __FUNCTION__);
-
-			r300EmitState(rmesa);
-
-			r300EmitVertexAOS( rmesa,
-					   rmesa->swtcl.vertex_size,
-					   current_offset);
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+	GLuint InputsRead, OutputsWritten;
 
-			r300EmitVbufPrim( rmesa,
-					  rmesa->swtcl.hw_primitive,
-					  rmesa->swtcl.numverts);
+	r300ChooseSwtclVertexFormat(ctx, &InputsRead, &OutputsWritten);
+	r300SetupVAP(ctx, InputsRead, OutputsWritten);
 
-			r300EmitCacheFlush(rmesa);
-		}
+	rmesa->radeon.swtcl.vertex_size =
+		_tnl_install_attrs( ctx,
+				    rmesa->radeon.swtcl.vertex_attrs,
+				    rmesa->radeon.swtcl.vertex_attr_count,
+				    NULL, 0 );
 
-		rmesa->swtcl.numverts = 0;
-		current->start = current->ptr;
-	}
+	rmesa->radeon.swtcl.vertex_size /= 4;
 }
 
-/* Alloc space in the current dma region.
- */
-static void *
-r300AllocDmaLowVerts( r300ContextPtr rmesa, int nverts, int vsize )
-{
-	GLuint bytes = vsize * nverts;
-
-	if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end )
-		r300RefillCurrentDmaRegion( rmesa, bytes);
-
-	if (!rmesa->dma.flush) {
-		rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-		rmesa->dma.flush = flush_last_swtcl_prim;
-	}
-
-	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
-	ASSERT( rmesa->dma.flush == flush_last_swtcl_prim );
-	ASSERT( rmesa->dma.current.start +
-		rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-		rmesa->dma.current.ptr );
-
-	{
-		GLubyte *head = (GLubyte *) (rmesa->dma.current.address + rmesa->dma.current.ptr);
-		rmesa->dma.current.ptr += bytes;
-		rmesa->swtcl.numverts += nverts;
-		return head;
-	}
-}
 
 static GLuint reduced_prim[] = {
-  GL_POINTS,
-  GL_LINES,
-  GL_LINES,
-  GL_LINES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
-  GL_TRIANGLES,
+	GL_POINTS,
+	GL_LINES,
+	GL_LINES,
+	GL_LINES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
+	GL_TRIANGLES,
 };
 
 static void r300RasterPrimitive( GLcontext *ctx, GLuint prim );
-static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
-//static void r300ResetLineStipple( GLcontext *ctx );
 
 /***********************************************************************
  *                    Emit primitives as inline vertices               *
@@ -405,15 +274,13 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
 #define CTX_ARG r300ContextPtr rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) r300AllocDmaLowVerts( rmesa, n, size * 4 )
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, size * 4 )
 #define LOCAL_VARS						\
    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
-   const char *r300verts = (char *)rmesa->swtcl.verts;
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;
 #define VERT(x) (r300Vertex *)(r300verts + ((x) * vertsize * sizeof(int)))
 #define VERTEX r300Vertex
-#define DO_DEBUG_VERTS (1 && (RADEON_DEBUG & DEBUG_VERTS))
-#define PRINT_VERTEX(x)
 #undef TAG
 #define TAG(x) r300_##x
 #include "tnl_dd/t_dd_triemit.h"
@@ -433,9 +300,8 @@ static void r300RenderPrimitive( GLcontext *ctx, GLenum prim );
  *              Build render functions from dd templates               *
  ***********************************************************************/
 
-#define R300_TWOSIDE_BIT	0x01
-#define R300_UNFILLED_BIT	0x02
-#define R300_MAX_TRIFUNC	0x04
+#define R300_UNFILLED_BIT	0x01
+#define R300_MAX_TRIFUNC	0x02
 
 static struct {
    tnl_points_func	        points;
@@ -446,9 +312,9 @@ static struct {
 
 #define DO_FALLBACK  0
 #define DO_UNFILLED (IND & R300_UNFILLED_BIT)
-#define DO_TWOSIDE  (IND & R300_TWOSIDE_BIT)
+#define DO_TWOSIDE   0
 #define DO_FLAT      0
-#define DO_OFFSET     0
+#define DO_OFFSET    0
 #define DO_TRI       1
 #define DO_QUAD      1
 #define DO_LINE      1
@@ -468,33 +334,39 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + (e*rmesa->swtcl.vertex_size*sizeof(int)))
-
-/* Only used to pull back colors into vertices (ie, we know color is
- * floating point).
- */
-#define R300_COLOR( dst, src )				\
-do {							\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[3], (src)[3]);	\
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + (e*rmesa->radeon.swtcl.vertex_size*sizeof(int)))
+
+#define VERT_SET_RGBA( v, c ) \
+do { \
+   r300_color_t *color = (r300_color_t *)&((v)->ui[coloroffset]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->blue, (c)[2]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(color->alpha, (c)[3]); \
 } while (0)
 
-#define VERT_SET_RGBA( v, c )    if (coloroffset) R300_COLOR( v->ub4[coloroffset], c )
-#define VERT_COPY_RGBA( v0, v1 ) if (coloroffset) v0->ui[coloroffset] = v1->ui[coloroffset]
-#define VERT_SAVE_RGBA( idx )    if (coloroffset) color[idx] = v[idx]->ui[coloroffset]
-#define VERT_RESTORE_RGBA( idx ) if (coloroffset) v[idx]->ui[coloroffset] = color[idx]
+#define VERT_COPY_RGBA( v0, v1 ) v0->ui[coloroffset] = v1->ui[coloroffset]
 
-#define R300_SPEC( dst, src )				\
-do {							\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[0], (src)[2]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[1], (src)[1]);	\
-   UNCLAMPED_FLOAT_TO_UBYTE((dst)[2], (src)[0]);	\
+#define VERT_SET_SPEC( v0, c ) \
+do { \
+   if (specoffset) { \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.red, (c)[0]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.green, (c)[1]); \
+   UNCLAMPED_FLOAT_TO_UBYTE(v0->v.specular.blue, (c)[2]); \
+   } \
 } while (0)
 
-#define VERT_SET_SPEC( v, c )    if (specoffset) R300_SPEC( v->ub4[specoffset], c )
-#define VERT_COPY_SPEC( v0, v1 ) if (specoffset) COPY_3V(v0->ub4[specoffset], v1->ub4[specoffset])
+#define VERT_COPY_SPEC( v0, v1 ) \
+do { \
+   if (specoffset) { \
+       v0->v.specular.red = v1->v.specular.red; \
+       v0->v.specular.green = v1->v.specular.green; \
+       v0->v.specular.blue = v1->v.specular.blue; \
+   } \
+} while (0)
+
+#define VERT_SAVE_RGBA( idx )    color[idx] = v[idx]->ui[coloroffset]
+#define VERT_RESTORE_RGBA( idx ) v[idx]->ui[coloroffset] = color[idx]
 #define VERT_SAVE_SPEC( idx )    if (specoffset) spec[idx] = v[idx]->ui[specoffset]
 #define VERT_RESTORE_SPEC( idx ) if (specoffset) v[idx]->ui[specoffset] = spec[idx]
 
@@ -514,7 +386,7 @@ do {							\
  ***********************************************************************/
 
 #define RASTERIZE(x) r300RasterPrimitive( ctx, reduced_prim[x] )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -530,26 +402,15 @@ do {							\
 #define TAG(x) x
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT)
-#define TAG(x) x##_twoside
-#include "tnl_dd/t_dd_tritmp.h"
-
 #define IND (R300_UNFILLED_BIT)
 #define TAG(x) x##_unfilled
 #include "tnl_dd/t_dd_tritmp.h"
 
-#define IND (R300_TWOSIDE_BIT|R300_UNFILLED_BIT)
-#define TAG(x) x##_twoside_unfilled
-#include "tnl_dd/t_dd_tritmp.h"
-
-
 
 static void init_rast_tab( void )
 {
    init();
-   init_twoside();
    init_unfilled();
-   init_twoside_unfilled();
 }
 
 /**********************************************************************/
@@ -571,8 +432,8 @@ static void init_rast_tab( void )
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
    r300ContextPtr rmesa = R300_CONTEXT(ctx);		\
-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
-   const char *r300verts = (char *)rmesa->swtcl.verts;		\
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
+   const char *r300verts = (char *)rmesa->radeon.swtcl.verts;		\
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
    const GLboolean stipple = ctx->Line.StippleFlag;		\
    (void) elt; (void) stipple;
@@ -601,10 +462,9 @@ static void r300ChooseRenderState( GLcontext *ctx )
 	GLuint index = 0;
 	GLuint flags = ctx->_TriangleCaps;
 
-	if (flags & DD_TRI_LIGHT_TWOSIDE) index |= R300_TWOSIDE_BIT;
 	if (flags & DD_TRI_UNFILLED)      index |= R300_UNFILLED_BIT;
 
-	if (index != rmesa->swtcl.RenderIndex) {
+	if (index != rmesa->radeon.swtcl.RenderIndex) {
 		tnl->Driver.Render.Points = rast_tab[index].points;
 		tnl->Driver.Render.Line = rast_tab[index].line;
 		tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -621,30 +481,34 @@ static void r300ChooseRenderState( GLcontext *ctx )
 			tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
 		}
 
-		rmesa->swtcl.RenderIndex = index;
+		rmesa->radeon.swtcl.RenderIndex = index;
 	}
 }
 
 
-static void r300RenderStart(GLcontext *ctx)
+void r300RenderStart(GLcontext *ctx)
 {
-        r300ContextPtr rmesa = R300_CONTEXT( ctx );
+	r300ContextPtr rmesa = R300_CONTEXT( ctx );
 
 	r300ChooseRenderState(ctx);
-	r300SetVertexFormat(ctx);
 
 	r300UpdateShaders(rmesa);
+
+	r300PrepareVertices(ctx);
+
+	r300ValidateBuffers(ctx);
+
 	r300UpdateShaderStates(rmesa);
 
 	r300EmitCacheFlush(rmesa);
 
-	if (rmesa->dma.flush != 0 &&
-	    rmesa->dma.flush != flush_last_swtcl_prim)
-		rmesa->dma.flush( rmesa );
-
+	/* investigate if we can put back flush optimisation if needed */
+	if (rmesa->radeon.dma.flush != NULL) {
+		rmesa->radeon.dma.flush(ctx);
+	}
 }
 
-static void r300RenderFinish(GLcontext *ctx)
+void r300RenderFinish(GLcontext *ctx)
 {
 }
 
@@ -652,28 +516,26 @@ static void r300RasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
 
-	if (rmesa->swtcl.hw_primitive != hwprim) {
-	        R300_NEWPRIM( rmesa );
-		rmesa->swtcl.hw_primitive = hwprim;
+	if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
+		R300_NEWPRIM( rmesa );
+		rmesa->radeon.swtcl.hw_primitive = hwprim;
 	}
 }
 
-static void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
+void r300RenderPrimitive(GLcontext *ctx, GLenum prim)
 {
 
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	rmesa->swtcl.render_primitive = prim;
+	rmesa->radeon.swtcl.render_primitive = prim;
 
 	if ((prim == GL_TRIANGLES) && (ctx->_TriangleCaps & DD_TRI_UNFILLED))
-	  return;
+		return;
 
 	r300RasterPrimitive( ctx, reduced_prim[prim] );
 }
 
-static void r300ResetLineStipple(GLcontext *ctx)
+void r300ResetLineStipple(GLcontext *ctx)
 {
-
-
 }
 
 void r300InitSwtcl(GLcontext *ctx)
@@ -699,50 +561,68 @@ void r300InitSwtcl(GLcontext *ctx)
 	_tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12,
 			    48 * sizeof(GLfloat) );
 
-	rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
-	rmesa->swtcl.RenderIndex = ~0;
-	rmesa->swtcl.render_primitive = GL_TRIANGLES;
-	rmesa->swtcl.hw_primitive = 0;
+	rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+	rmesa->radeon.swtcl.RenderIndex = ~0;
+	rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+	rmesa->radeon.swtcl.hw_primitive = 0;
 
 	_tnl_invalidate_vertex_state( ctx, ~0 );
 	_tnl_invalidate_vertices( ctx, ~0 );
-	RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
 
 	_tnl_need_projected_coords( ctx, GL_FALSE );
-	r300ChooseRenderState(ctx);
 }
 
 void r300DestroySwtcl(GLcontext *ctx)
 {
 }
 
-void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, GLuint offset)
+static void r300EmitVertexAOS(r300ContextPtr rmesa, GLuint vertex_size, struct radeon_bo *bo, GLuint offset)
 {
-	int cmd_reserved = 0;
-	int cmd_written = 0;
+	BATCH_LOCALS(&rmesa->radeon);
 
-	drm_radeon_cmd_header_t *cmd = NULL;
 	if (RADEON_DEBUG & DEBUG_VERTS)
-	  fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
-		  __FUNCTION__, vertex_size, offset);
-
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2), 2);
-	e32(1);
-	e32(vertex_size | (vertex_size << 8));
-	e32(offset);
+		fprintf(stderr, "%s:  vertex_size %d, offset 0x%x \n",
+			__FUNCTION__, vertex_size, offset);
+
+	BEGIN_BATCH(7);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_LOAD_VBPNTR, 2);
+	OUT_BATCH(1);
+	OUT_BATCH(vertex_size | (vertex_size << 8));
+	OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+	END_BATCH();
 }
 
-void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
+static void r300EmitVbufPrim(r300ContextPtr rmesa, GLuint primitive, GLuint vertex_nr)
 {
-
-	int cmd_reserved = 0;
-	int cmd_written = 0;
+	BATCH_LOCALS(&rmesa->radeon);
 	int type, num_verts;
-	drm_radeon_cmd_header_t *cmd = NULL;
 
 	type = r300PrimitiveType(rmesa, primitive);
 	num_verts = r300NumVerts(rmesa, vertex_nr, primitive);
 
-	start_packet3(CP_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0), 0);
-	e32(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+	BEGIN_BATCH(3);
+	OUT_BATCH_PACKET3(R300_PACKET3_3D_DRAW_VBUF_2, 0);
+	OUT_BATCH(R300_VAP_VF_CNTL__PRIM_WALK_VERTEX_LIST | (num_verts << 16) | type);
+	END_BATCH();
+}
+
+void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
+{
+	r300ContextPtr rmesa = R300_CONTEXT(ctx);
+
+	rcommonEnsureCmdBufSpace(&rmesa->radeon,
+			   rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
+			   __FUNCTION__);
+	radeonEmitState(&rmesa->radeon);
+    r300_emit_scissor(ctx);
+	r300EmitVertexAOS(rmesa,
+			rmesa->radeon.swtcl.vertex_size,
+			rmesa->radeon.dma.current,
+			current_offset);
+
+	r300EmitVbufPrim(rmesa,
+		   rmesa->radeon.swtcl.hw_primitive,
+		   rmesa->radeon.swtcl.numverts);
+	r300EmitCacheFlush(rmesa);
+	COMMIT_BATCH();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_swtcl.h b/src/mesa/drivers/dri/r300/r300_swtcl.h
index 55df53c1adf..c271d265468 100644
--- a/src/mesa/drivers/dri/r300/r300_swtcl.h
+++ b/src/mesa/drivers/dri/r300/r300_swtcl.h
@@ -39,7 +39,27 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 #include "r300_context.h"
 
+/*
+ * Here are definitions of OVM locations of vertex attributes for non TCL hw
+ */
+#define SWTCL_OVM_POS 0
+#define SWTCL_OVM_COLOR0 2
+#define SWTCL_OVM_COLOR1 3
+#define SWTCL_OVM_COLOR2 4
+#define SWTCL_OVM_COLOR3 5
+#define SWTCL_OVM_TEX(n) ((n) + 6)
+#define SWTCL_OVM_POINT_SIZE 15
+
+extern void r300ChooseSwtclVertexFormat(GLcontext *ctx, GLuint *InputsRead,  GLuint *OutputsWritten);
+
 extern void r300InitSwtcl( GLcontext *ctx );
 extern void r300DestroySwtcl( GLcontext *ctx );
 
+extern void r300RenderStart(GLcontext *ctx);
+extern void r300RenderFinish(GLcontext *ctx);
+extern void r300RenderPrimitive(GLcontext *ctx, GLenum prim);
+extern void r300ResetLineStipple(GLcontext *ctx);
+
+extern void r300_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r300_tex.c b/src/mesa/drivers/dri/r300/r300_tex.c
index 7c699ec572c..0af5bb4f469 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.c
+++ b/src/mesa/drivers/dri/r300/r300_tex.c
@@ -38,6 +38,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/enums.h"
 #include "main/image.h"
+#include "main/mipmap.h"
 #include "main/simple_list.h"
 #include "main/texformat.h"
 #include "main/texstore.h"
@@ -49,6 +50,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 
 #include "xmlpool.h"
@@ -77,20 +79,20 @@ static unsigned int translate_wrap_mode(GLenum wrapmode)
  *
  * \param t Texture object whose wrap modes are to be set
  */
-static void r300UpdateTexWrap(r300TexObjPtr t)
+static void r300UpdateTexWrap(radeonTexObjPtr t)
 {
-	struct gl_texture_object *tObj = t->base.tObj;
+	struct gl_texture_object *tObj = &t->base;
 
-	t->filter &=
+	t->pp_txfilter &=
 	    ~(R300_TX_WRAP_S_MASK | R300_TX_WRAP_T_MASK | R300_TX_WRAP_R_MASK);
 
-	t->filter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
+	t->pp_txfilter |= translate_wrap_mode(tObj->WrapS) << R300_TX_WRAP_S_SHIFT;
 
 	if (tObj->Target != GL_TEXTURE_1D) {
-		t->filter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
+		t->pp_txfilter |= translate_wrap_mode(tObj->WrapT) << R300_TX_WRAP_T_SHIFT;
 
 		if (tObj->Target == GL_TEXTURE_3D)
-			t->filter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
+			t->pp_txfilter |= translate_wrap_mode(tObj->WrapR) << R300_TX_WRAP_R_SHIFT;
 	}
 }
 
@@ -117,10 +119,13 @@ static GLuint aniso_filter(GLfloat anisotropy)
  * \param magf Texture magnification mode
  * \param anisotropy Maximum anisotropy level
  */
-static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
+static void r300SetTexFilter(radeonTexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
 {
-	t->filter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
-	t->filter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
+	/* Force revalidation to account for switches from/to mipmapping. */
+	t->validated = GL_FALSE;
+
+	t->pp_txfilter &= ~(R300_TX_MIN_FILTER_MASK | R300_TX_MIN_FILTER_MIP_MASK | R300_TX_MAG_FILTER_MASK | R300_TX_MAX_ANISO_MASK);
+	t->pp_txfilter_1 &= ~R300_EDGE_ANISO_EDGE_ONLY;
 
 	/* Note that EXT_texture_filter_anisotropic is extremely vague about
 	 * how anisotropic filtering interacts with the "normal" filter modes.
@@ -128,7 +133,7 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 	 * filter settings completely. This includes driconf's settings.
 	 */
 	if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
-		t->filter |= R300_TX_MAG_FILTER_ANISO
+		t->pp_txfilter |= R300_TX_MAG_FILTER_ANISO
 			| R300_TX_MIN_FILTER_ANISO
 			| R300_TX_MIN_FILTER_MIP_LINEAR
 			| aniso_filter(anisotropy);
@@ -139,22 +144,22 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 
 	switch (minf) {
 	case GL_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST;
 		break;
 	case GL_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR;
 		break;
 	case GL_NEAREST_MIPMAP_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_NEAREST;
 		break;
 	case GL_NEAREST_MIPMAP_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_NEAREST|R300_TX_MIN_FILTER_MIP_LINEAR;
 		break;
 	case GL_LINEAR_MIPMAP_NEAREST:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_NEAREST;
 		break;
 	case GL_LINEAR_MIPMAP_LINEAR:
-		t->filter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
+		t->pp_txfilter |= R300_TX_MIN_FILTER_LINEAR|R300_TX_MIN_FILTER_MIP_LINEAR;
 		break;
 	}
 
@@ -163,15 +168,15 @@ static void r300SetTexFilter(r300TexObjPtr t, GLenum minf, GLenum magf, GLfloat
 	 */
 	switch (magf) {
 	case GL_NEAREST:
-		t->filter |= R300_TX_MAG_FILTER_NEAREST;
+		t->pp_txfilter |= R300_TX_MAG_FILTER_NEAREST;
 		break;
 	case GL_LINEAR:
-		t->filter |= R300_TX_MAG_FILTER_LINEAR;
+		t->pp_txfilter |= R300_TX_MAG_FILTER_LINEAR;
 		break;
 	}
 }
 
-static void r300SetTexBorderColor(r300TexObjPtr t, const GLfloat color[4])
+static void r300SetTexBorderColor(radeonTexObjPtr t, const GLfloat color[4])
 {
 	GLubyte c[4];
 	CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
@@ -182,729 +187,6 @@ static void r300SetTexBorderColor(r300TexObjPtr t, const GLfloat color[4])
 }
 
 /**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static r300TexObjPtr r300AllocTexObj(struct gl_texture_object *texObj)
-{
-	r300TexObjPtr t;
-
-	t = CALLOC_STRUCT(r300_tex_obj);
-	texObj->DriverData = t;
-	if (t != NULL) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE) {
-			fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-				(void *)texObj, (void *)t);
-		}
-
-		/* Initialize non-image-dependent parts of the state:
-		 */
-		t->base.tObj = texObj;
-		t->border_fallback = GL_FALSE;
-
-		make_empty_list(&t->base);
-
-		r300UpdateTexWrap(t);
-		r300SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
-		r300SetTexBorderColor(t, texObj->BorderColor);
-	}
-
-	return t;
-}
-
-/* try to find a format which will only need a memcopy */
-static const struct gl_texture_format *r300Choose8888TexFormat(GLenum srcFormat,
-							       GLenum srcType)
-{
-	const GLuint ui = 1;
-	const GLubyte littleEndian = *((const GLubyte *)&ui);
-
-	if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-	    (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
-		return &_mesa_texformat_rgba8888;
-	} else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
-		   (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
-		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
-		return &_mesa_texformat_rgba8888_rev;
-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
-					    srcType == GL_UNSIGNED_INT_8_8_8_8)) {
-		return &_mesa_texformat_argb8888_rev;
-	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && littleEndian) ||
-					    srcType == GL_UNSIGNED_INT_8_8_8_8_REV)) {
-		return &_mesa_texformat_argb8888;
-	} else
-		return _dri_texformat_argb8888;
-}
-
-static const struct gl_texture_format *r300ChooseTextureFormat(GLcontext * ctx,
-							       GLint
-							       internalFormat,
-							       GLenum format,
-							       GLenum type)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	const GLboolean do32bpt =
-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);
-	const GLboolean force16bpt =
-	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);
-	(void)format;
-
-#if 0
-	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
-		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
-		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
-	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
-#endif
-
-	switch (internalFormat) {
-	case 4:
-	case GL_RGBA:
-	case GL_COMPRESSED_RGBA:
-		switch (type) {
-		case GL_UNSIGNED_INT_10_10_10_2:
-		case GL_UNSIGNED_INT_2_10_10_10_REV:
-			return do32bpt ? _dri_texformat_argb8888 :
-			    _dri_texformat_argb1555;
-		case GL_UNSIGNED_SHORT_4_4_4_4:
-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-			return _dri_texformat_argb4444;
-		case GL_UNSIGNED_SHORT_5_5_5_1:
-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-			return _dri_texformat_argb1555;
-		default:
-			return do32bpt ? r300Choose8888TexFormat(format, type) :
-			    _dri_texformat_argb4444;
-		}
-
-	case 3:
-	case GL_RGB:
-	case GL_COMPRESSED_RGB:
-		switch (type) {
-		case GL_UNSIGNED_SHORT_4_4_4_4:
-		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-			return _dri_texformat_argb4444;
-		case GL_UNSIGNED_SHORT_5_5_5_1:
-		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-			return _dri_texformat_argb1555;
-		case GL_UNSIGNED_SHORT_5_6_5:
-		case GL_UNSIGNED_SHORT_5_6_5_REV:
-			return _dri_texformat_rgb565;
-		default:
-			return do32bpt ? _dri_texformat_argb8888 :
-			    _dri_texformat_rgb565;
-		}
-
-	case GL_RGBA8:
-	case GL_RGB10_A2:
-	case GL_RGBA12:
-	case GL_RGBA16:
-		return !force16bpt ?
-		    r300Choose8888TexFormat(format,
-					    type) : _dri_texformat_argb4444;
-
-	case GL_RGBA4:
-	case GL_RGBA2:
-		return _dri_texformat_argb4444;
-
-	case GL_RGB5_A1:
-		return _dri_texformat_argb1555;
-
-	case GL_RGB8:
-	case GL_RGB10:
-	case GL_RGB12:
-	case GL_RGB16:
-		return !force16bpt ? _dri_texformat_argb8888 :
-		    _dri_texformat_rgb565;
-
-	case GL_RGB5:
-	case GL_RGB4:
-	case GL_R3_G3_B2:
-		return _dri_texformat_rgb565;
-
-	case GL_ALPHA:
-	case GL_ALPHA4:
-	case GL_ALPHA8:
-	case GL_ALPHA12:
-	case GL_ALPHA16:
-	case GL_COMPRESSED_ALPHA:
-		return _dri_texformat_a8;
-
-	case 1:
-	case GL_LUMINANCE:
-	case GL_LUMINANCE4:
-	case GL_LUMINANCE8:
-	case GL_LUMINANCE12:
-	case GL_LUMINANCE16:
-	case GL_COMPRESSED_LUMINANCE:
-		return _dri_texformat_l8;
-
-	case 2:
-	case GL_LUMINANCE_ALPHA:
-	case GL_LUMINANCE4_ALPHA4:
-	case GL_LUMINANCE6_ALPHA2:
-	case GL_LUMINANCE8_ALPHA8:
-	case GL_LUMINANCE12_ALPHA4:
-	case GL_LUMINANCE12_ALPHA12:
-	case GL_LUMINANCE16_ALPHA16:
-	case GL_COMPRESSED_LUMINANCE_ALPHA:
-		return _dri_texformat_al88;
-
-	case GL_INTENSITY:
-	case GL_INTENSITY4:
-	case GL_INTENSITY8:
-	case GL_INTENSITY12:
-	case GL_INTENSITY16:
-	case GL_COMPRESSED_INTENSITY:
-		return _dri_texformat_i8;
-
-	case GL_YCBCR_MESA:
-		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-		    type == GL_UNSIGNED_BYTE)
-			return &_mesa_texformat_ycbcr;
-		else
-			return &_mesa_texformat_ycbcr_rev;
-
-	case GL_RGB_S3TC:
-	case GL_RGB4_S3TC:
-	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgb_dxt1;
-
-	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-		return &_mesa_texformat_rgba_dxt1;
-
-	case GL_RGBA_S3TC:
-	case GL_RGBA4_S3TC:
-	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-		return &_mesa_texformat_rgba_dxt3;
-
-	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-		return &_mesa_texformat_rgba_dxt5;
-
-	case GL_ALPHA16F_ARB:
-		return &_mesa_texformat_alpha_float16;
-	case GL_ALPHA32F_ARB:
-		return &_mesa_texformat_alpha_float32;
-	case GL_LUMINANCE16F_ARB:
-		return &_mesa_texformat_luminance_float16;
-	case GL_LUMINANCE32F_ARB:
-		return &_mesa_texformat_luminance_float32;
-	case GL_LUMINANCE_ALPHA16F_ARB:
-		return &_mesa_texformat_luminance_alpha_float16;
-	case GL_LUMINANCE_ALPHA32F_ARB:
-		return &_mesa_texformat_luminance_alpha_float32;
-	case GL_INTENSITY16F_ARB:
-		return &_mesa_texformat_intensity_float16;
-	case GL_INTENSITY32F_ARB:
-		return &_mesa_texformat_intensity_float32;
-	case GL_RGB16F_ARB:
-		return &_mesa_texformat_rgba_float16;
-	case GL_RGB32F_ARB:
-		return &_mesa_texformat_rgba_float32;
-	case GL_RGBA16F_ARB:
-		return &_mesa_texformat_rgba_float16;
-	case GL_RGBA32F_ARB:
-		return &_mesa_texformat_rgba_float32;
-
-	case GL_DEPTH_COMPONENT:
-	case GL_DEPTH_COMPONENT16:
-	case GL_DEPTH_COMPONENT24:
-	case GL_DEPTH_COMPONENT32:
-#if 0
-		switch (type) {
-		case GL_UNSIGNED_BYTE:
-		case GL_UNSIGNED_SHORT:
-			return &_mesa_texformat_z16;
-		case GL_UNSIGNED_INT:
-			return &_mesa_texformat_z32;
-		case GL_UNSIGNED_INT_24_8_EXT:
-		default:
-			return &_mesa_texformat_z24_s8;
-		}
-#else
-		return &_mesa_texformat_z16;
-#endif
-
-	default:
-		_mesa_problem(ctx,
-			      "unexpected internalFormat 0x%x in r300ChooseTextureFormat",
-			      (int)internalFormat);
-		return NULL;
-	}
-
-	return NULL;		/* never get here */
-}
-
-static GLboolean
-r300ValidateClientStorage(GLcontext * ctx, GLenum target,
-			  GLint internalFormat,
-			  GLint srcWidth, GLint srcHeight,
-			  GLenum format, GLenum type, const void *pixels,
-			  const struct gl_pixelstore_attrib *packing,
-			  struct gl_texture_object *texObj,
-			  struct gl_texture_image *texImage)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "intformat %s format %s type %s\n",
-			_mesa_lookup_enum_by_nr(internalFormat),
-			_mesa_lookup_enum_by_nr(format),
-			_mesa_lookup_enum_by_nr(type));
-
-	if (!ctx->Unpack.ClientStorage)
-		return 0;
-
-	if (ctx->_ImageTransferState ||
-	    texImage->IsCompressed || texObj->GenerateMipmap)
-		return 0;
-
-	/* This list is incomplete, may be different on ppc???
-	 */
-	switch (internalFormat) {
-	case GL_RGBA:
-		if (format == GL_BGRA && type == GL_UNSIGNED_INT_8_8_8_8_REV) {
-			texImage->TexFormat = _dri_texformat_argb8888;
-		} else
-			return 0;
-		break;
-
-	case GL_RGB:
-		if (format == GL_RGB && type == GL_UNSIGNED_SHORT_5_6_5) {
-			texImage->TexFormat = _dri_texformat_rgb565;
-		} else
-			return 0;
-		break;
-
-	case GL_YCBCR_MESA:
-		if (format == GL_YCBCR_MESA &&
-		    type == GL_UNSIGNED_SHORT_8_8_REV_APPLE) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr_rev;
-		} else if (format == GL_YCBCR_MESA &&
-			   (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-			    type == GL_UNSIGNED_BYTE)) {
-			texImage->TexFormat = &_mesa_texformat_ycbcr;
-		} else
-			return 0;
-		break;
-
-	default:
-		return 0;
-	}
-
-	/* Could deal with these packing issues, but currently don't:
-	 */
-	if (packing->SkipPixels ||
-	    packing->SkipRows || packing->SwapBytes || packing->LsbFirst) {
-		return 0;
-	}
-
-	GLint srcRowStride = _mesa_image_row_stride(packing, srcWidth,
-						    format, type);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: srcRowStride %d/%x\n",
-			__FUNCTION__, srcRowStride, srcRowStride);
-
-	/* Could check this later in upload, pitch restrictions could be
-	 * relaxed, but would need to store the image pitch somewhere,
-	 * as packing details might change before image is uploaded:
-	 */
-	if (!r300IsGartMemory(rmesa, pixels, srcHeight * srcRowStride)
-	    || (srcRowStride & 63))
-		return 0;
-
-	/* Have validated that _mesa_transfer_teximage would be a straight
-	 * memcpy at this point.  NOTE: future calls to TexSubImage will
-	 * overwrite the client data.  This is explicitly mentioned in the
-	 * extension spec.
-	 */
-	texImage->Data = (void *)pixels;
-	texImage->IsClientData = GL_TRUE;
-	texImage->RowStride = srcRowStride / texImage->TexFormat->TexelBytes;
-
-	return 1;
-}
-
-static void r300TexImage1D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-			return;
-		}
-	}
-
-	/* Note, this will call ChooseTextureFormat */
-	_mesa_store_teximage1d(ctx, target, level, internalFormat,
-			       width, border, format, type, pixels,
-			       &ctx->Unpack, texObj, texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-static void r300TexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset,
-			      GLsizei width,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-			return;
-		}
-	}
-
-	_mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-				  format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-static void r300TexImage2D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint height, GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage2d(ctx, target, level, internalFormat,
-				       width, height, border, format, type,
-				       pixels, &ctx->Unpack, texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
-	}
-}
-
-static void r300TexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
-			      GLint xoffset, GLint yoffset,
-			      GLsizei width, GLsizei height,
-			      GLenum format, GLenum type,
-			      const GLvoid * pixels,
-			      const struct gl_pixelstore_attrib *packing,
-			      struct gl_texture_object *texObj,
-			      struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-			return;
-		}
-	}
-
-	_mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-				  height, format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[face] |= (1 << level);
-}
-
-static void r300CompressedTexImage2D(GLcontext * ctx, GLenum target,
-				     GLint level, GLint internalFormat,
-				     GLint width, GLint height, GLint border,
-				     GLsizei imageSize, const GLvoid * data,
-				     struct gl_texture_object *texObj,
-				     struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	if (t != NULL) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexImage2D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-	/* can't call this, different parameters. Would never evaluate to true anyway currently */
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_compressed_teximage2d(ctx, target, level,
-						  internalFormat, width, height,
-						  border, imageSize, data,
-						  texObj, texImage);
-
-		t->dirty_images[face] |= (1 << level);
-	}
-}
-
-static void r300CompressedTexSubImage2D(GLcontext * ctx, GLenum target,
-					GLint level, GLint xoffset,
-					GLint yoffset, GLsizei width,
-					GLsizei height, GLenum format,
-					GLsizei imageSize, const GLvoid * data,
-					struct gl_texture_object *texObj,
-					struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-	GLuint face;
-
-	/* which cube face or ordinary 2D image */
-	switch (target) {
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-		face =
-		    (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-		ASSERT(face < 6);
-		break;
-	default:
-		face = 0;
-	}
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY,
-				    "glCompressedTexSubImage3D");
-			return;
-		}
-	}
-
-	_mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset,
-					     yoffset, width, height, format,
-					     imageSize, data, texObj, texImage);
-
-	t->dirty_images[face] |= (1 << level);
-}
-
-static void r300TexImage3D(GLcontext * ctx, GLenum target, GLint level,
-			   GLint internalFormat,
-			   GLint width, GLint height, GLint depth,
-			   GLint border,
-			   GLenum format, GLenum type, const GLvoid * pixels,
-			   const struct gl_pixelstore_attrib *packing,
-			   struct gl_texture_object *texObj,
-			   struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage3D");
-			return;
-		}
-	}
-
-	texImage->IsClientData = GL_FALSE;
-
-#if 0
-	if (r300ValidateClientStorage(ctx, target,
-				      internalFormat,
-				      width, height,
-				      format, type, pixels,
-				      packing, texObj, texImage)) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using client storage\n",
-				__FUNCTION__);
-	} else
-#endif
-	{
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: Using normal storage\n",
-				__FUNCTION__);
-
-		/* Normal path: copy (to cached memory) and eventually upload
-		 * via another copy to GART memory and then a blit...  Could
-		 * eliminate one copy by going straight to (permanent) GART.
-		 *
-		 * Note, this will call r300ChooseTextureFormat.
-		 */
-		_mesa_store_teximage3d(ctx, target, level, internalFormat,
-				       width, height, depth, border,
-				       format, type, pixels,
-				       &ctx->Unpack, texObj, texImage);
-
-		t->dirty_images[0] |= (1 << level);
-	}
-}
-
-static void
-r300TexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
-		  GLint xoffset, GLint yoffset, GLint zoffset,
-		  GLsizei width, GLsizei height, GLsizei depth,
-		  GLenum format, GLenum type,
-		  const GLvoid * pixels,
-		  const struct gl_pixelstore_attrib *packing,
-		  struct gl_texture_object *texObj,
-		  struct gl_texture_image *texImage)
-{
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
-
-/*     fprintf(stderr, "%s\n", __FUNCTION__); */
-
-	assert(t);		/* this _should_ be true */
-	if (t) {
-		driSwapOutTextureObject(t);
-	} else {
-		t = (driTextureObject *) r300AllocTexObj(texObj);
-		if (!t) {
-			_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage3D");
-			return;
-		}
-		texObj->DriverData = t;
-	}
-
-	_mesa_store_texsubimage3d(ctx, target, level, xoffset, yoffset, zoffset,
-				  width, height, depth,
-				  format, type, pixels, packing, texObj,
-				  texImage);
-
-	t->dirty_images[0] |= (1 << level);
-}
-
-/**
  * Changes variables and flags for a state update, which will happen at the
  * next UpdateTextureState
  */
@@ -913,7 +195,7 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 			     struct gl_texture_object *texObj,
 			     GLenum pname, const GLfloat * params)
 {
-	r300TexObjPtr t = (r300TexObjPtr) texObj->DriverData;
+	radeonTexObj* t = radeon_tex_obj(texObj);
 
 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
 		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
@@ -946,7 +228,11 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 		 * we just have to rely on loading the right subset of mipmap levels
 		 * to simulate a clamped LOD.
 		 */
-		driSwapOutTextureObject((driTextureObject *) t);
+		if (t->mt) {
+			radeon_miptree_unreference(t->mt);
+			t->mt = 0;
+			t->validated = GL_FALSE;
+		}
 		break;
 
 	case GL_DEPTH_TEXTURE_MODE:
@@ -969,27 +255,10 @@ static void r300TexParameter(GLcontext * ctx, GLenum target,
 	}
 }
 
-static void r300BindTexture(GLcontext * ctx, GLenum target,
-			    struct gl_texture_object *texObj)
-{
-	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
-		fprintf(stderr, "%s( %p ) unit=%d\n", __FUNCTION__,
-			(void *)texObj, ctx->Texture.CurrentUnit);
-	}
-
-	if ((target == GL_TEXTURE_1D)
-	    || (target == GL_TEXTURE_2D)
-	    || (target == GL_TEXTURE_3D)
-	    || (target == GL_TEXTURE_CUBE_MAP)
-	    || (target == GL_TEXTURE_RECTANGLE_NV)) {
-		assert(texObj->DriverData != NULL);
-	}
-}
-
 static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	driTextureObject *t = (driTextureObject *) texObj->DriverData;
+	radeonTexObj* t = radeon_tex_obj(texObj);
 
 	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
 		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
@@ -997,14 +266,24 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
 			_mesa_lookup_enum_by_nr(texObj->Target));
 	}
 
-	if (t != NULL) {
-		if (rmesa) {
-			R300_FIREVERTICES(rmesa);
-		}
+	if (rmesa) {
+		int i;
+		radeon_firevertices(&rmesa->radeon);
+
+		for(i = 0; i < R300_MAX_TEXTURE_UNITS; ++i)
+			if (rmesa->hw.textures[i] == t)
+				rmesa->hw.textures[i] = 0;
+	}
 
-		driDestroyTextureObject(t);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = 0;
 	}
-	/* Free mipmap images and the texture object itself */
 	_mesa_delete_texture_object(ctx, texObj);
 }
 
@@ -1013,8 +292,6 @@ static void r300DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
  * Called via ctx->Driver.NewTextureObject.
  * Note: this function will be called during context creation to
  * allocate the default texture objects.
- * Note: we could use containment here to 'derive' the driver-specific
- * texture object from the core mesa gl_texture_object.  Not done at this time.
  * Fixup MaxAnisotropy according to user preference.
  */
 static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
@@ -1022,14 +299,23 @@ static struct gl_texture_object *r300NewTextureObject(GLcontext * ctx,
 						      GLenum target)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_object *obj;
-	obj = _mesa_new_texture_object(ctx, name, target);
-	if (!obj)
-		return NULL;
-	obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
+	radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
 
-	r300AllocTexObj(obj);
-	return obj;
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			t, _mesa_lookup_enum_by_nr(target));
+	}
+
+	_mesa_initialize_texture_object(&t->base, name, target);
+	t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+	/* Initialize hardware state */
+	r300UpdateTexWrap(t);
+	r300SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
+	r300SetTexBorderColor(t, t->base.BorderColor);
+
+	return &t->base;
 }
 
 void r300InitTextureFuncs(struct dd_function_table *functions)
@@ -1037,22 +323,30 @@ void r300InitTextureFuncs(struct dd_function_table *functions)
 	/* Note: we only plug in the functions we implement in the driver
 	 * since _mesa_init_driver_functions() was already called.
 	 */
-	functions->ChooseTextureFormat = r300ChooseTextureFormat;
-	functions->TexImage1D = r300TexImage1D;
-	functions->TexImage2D = r300TexImage2D;
-	functions->TexImage3D = r300TexImage3D;
-	functions->TexSubImage1D = r300TexSubImage1D;
-	functions->TexSubImage2D = r300TexSubImage2D;
-	functions->TexSubImage3D = r300TexSubImage3D;
+	functions->NewTextureImage = radeonNewTextureImage;
+	functions->FreeTexImageData = radeonFreeTexImageData;
+	functions->MapTexture = radeonMapTexture;
+	functions->UnmapTexture = radeonUnmapTexture;
+
+	functions->ChooseTextureFormat = radeonChooseTextureFormat_mesa;
+	functions->TexImage1D = radeonTexImage1D;
+	functions->TexImage2D = radeonTexImage2D;
+	functions->TexImage3D = radeonTexImage3D;
+	functions->TexSubImage1D = radeonTexSubImage1D;
+	functions->TexSubImage2D = radeonTexSubImage2D;
+	functions->TexSubImage3D = radeonTexSubImage3D;
+	functions->GetTexImage = radeonGetTexImage;
+	functions->GetCompressedTexImage = radeonGetCompressedTexImage;
 	functions->NewTextureObject = r300NewTextureObject;
-	functions->BindTexture = r300BindTexture;
 	functions->DeleteTexture = r300DeleteTexture;
 	functions->IsTextureResident = driIsTextureResident;
 
 	functions->TexParameter = r300TexParameter;
 
-	functions->CompressedTexImage2D = r300CompressedTexImage2D;
-	functions->CompressedTexSubImage2D = r300CompressedTexSubImage2D;
+	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
+	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
+
+	functions->GenerateMipmap = radeonGenerateMipmap;
 
 	driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/r300/r300_tex.h b/src/mesa/drivers/dri/r300/r300_tex.h
index b86d45bfe05..8a653ea2d11 100644
--- a/src/mesa/drivers/dri/r300/r300_tex.h
+++ b/src/mesa/drivers/dri/r300/r300_tex.h
@@ -37,16 +37,17 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 extern void r300SetDepthTexMode(struct gl_texture_object *tObj);
 
+extern void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target,
+			     __DRIdrawable *dPriv);
+
+extern void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+			      GLint format, __DRIdrawable *dPriv);
+
 extern void r300SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
 			     unsigned long long offset, GLint depth,
 			     GLuint pitch);
 
-extern void r300UpdateTextureState(GLcontext * ctx);
-
-extern int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLuint face);
-
-extern void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t);
+extern GLboolean r300ValidateBuffers(GLcontext * ctx);
 
 extern void r300InitTextureFuncs(struct dd_function_table *functions);
 
diff --git a/src/mesa/drivers/dri/r300/r300_texmem.c b/src/mesa/drivers/dri/r300/r300_texmem.c
deleted file mode 100644
index a89ab83d948..00000000000
--- a/src/mesa/drivers/dri/r300/r300_texmem.c
+++ /dev/null
@@ -1,568 +0,0 @@
-/**************************************************************************
-
-Copyright (C) Tungsten Graphics 2002.  All Rights Reserved.
-The Weather Channel, Inc. funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86
-license. This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/**
- * \file
- *
- * \author Gareth Hughes <[email protected]>
- *
- * \author Kevin E. Martin <[email protected]>
- */
-
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/colormac.h"
-#include "main/macros.h"
-#include "main/simple_list.h"
-#include "main/texobj.h"
-#include "radeon_reg.h"		/* gets definition for usleep */
-#include "r300_context.h"
-#include "r300_state.h"
-#include "r300_cmdbuf.h"
-#include "radeon_ioctl.h"
-#include "r300_tex.h"
-#include "r300_ioctl.h"
-#include <unistd.h>		/* for usleep() */
-
-#ifdef USER_BUFFERS
-#include "r300_mem.h"
-#endif
-
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void r300DestroyTexObj(r300ContextPtr rmesa, r300TexObjPtr t)
-{
-	int i;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr, "%s( %p, %p )\n", __FUNCTION__,
-			(void *)t, (void *)t->base.tObj);
-	}
-
-	for (i = 0; i < rmesa->radeon.glCtx->Const.MaxTextureUnits; i++) {
-		if (rmesa->state.texture.unit[i].texobj == t->base.tObj) {
-			_mesa_reference_texobj(&rmesa->state.texture.unit[i].texobj, NULL);
-		}
-	}
-}
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-static void r300UploadGARTClientSubImage(r300ContextPtr rmesa,
-					 r300TexObjPtr t,
-					 struct gl_texture_image *texImage,
-					 GLint hwlevel,
-					 GLint x, GLint y,
-					 GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	GLuint srcPitch, dstPitch;
-	int blit_format;
-	int srcOffset;
-
-	/*
-	 * XXX it appears that we always upload the full image, not a subimage.
-	 * I.e. x==0, y==0, width=texWidth, height=texWidth.  If this is ever
-	 * changed, the src pitch will have to change.
-	 */
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		srcPitch = t->image[0][0].width * texFormat->TexelBytes;
-		dstPitch = t->image[0][0].width * texFormat->TexelBytes;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][hwlevel].data = texImage->Data;
-	srcOffset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-
-	assert(srcOffset != ~0);
-
-	/* Don't currently need to cope with small pitches?
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	r300EmitWait(rmesa, R300_WAIT_3D);
-
-	r300EmitBlit(rmesa, blit_format,
-		     srcPitch,
-		     srcOffset,
-		     dstPitch,
-		     t->bufAddr,
-		     x,
-		     y,
-		     t->image[0][hwlevel].x + x,
-		     t->image[0][hwlevel].y + y, width, height);
-
-	r300EmitWait(rmesa, R300_WAIT_2D);
-}
-
-static void r300UploadRectSubImage(r300ContextPtr rmesa,
-				   r300TexObjPtr t,
-				   struct gl_texture_image *texImage,
-				   GLint x, GLint y, GLint width, GLint height)
-{
-	const struct gl_texture_format *texFormat = texImage->TexFormat;
-	int blit_format, dstPitch, done;
-
-	switch (texFormat->TexelBytes) {
-	case 1:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	case 2:
-		blit_format = R300_CP_COLOR_FORMAT_RGB565;
-		break;
-	case 4:
-		blit_format = R300_CP_COLOR_FORMAT_ARGB8888;
-		break;
-	case 8:
-	case 16:
-		blit_format = R300_CP_COLOR_FORMAT_CI8;
-		break;
-	default:
-		return;
-	}
-
-	t->image[0][0].data = texImage->Data;
-
-	/* Currently don't need to cope with small pitches.
-	 */
-	width = texImage->Width;
-	height = texImage->Height;
-	dstPitch = t->pitch;
-
-	if (texFormat->TexelBytes > 4) {
-		width *= texFormat->TexelBytes;
-	}
-
-	if (rmesa->prefer_gart_client_texturing && texImage->IsClientData) {
-		/* In this case, could also use GART texturing.  This is
-		 * currently disabled, but has been tested & works.
-		 */
-		t->offset = r300GartOffsetFromVirtual(rmesa, texImage->Data);
-		t->pitch = texImage->RowStride * texFormat->TexelBytes - 32;
-
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"Using GART texturing for rectangular client texture\n");
-
-		/* Release FB memory allocated for this image:
-		 */
-		/* FIXME This may not be correct as driSwapOutTextureObject sets
-		 * FIXME dirty_images.  It may be fine, though.
-		 */
-		if (t->base.memBlock) {
-			driSwapOutTextureObject((driTextureObject *) t);
-		}
-	} else if (texImage->IsClientData) {
-		/* Data already in GART memory, with usable pitch.
-		 */
-		GLuint srcPitch;
-		srcPitch = texImage->RowStride * texFormat->TexelBytes;
-		r300EmitBlit(rmesa,
-			     blit_format,
-			     srcPitch,
-			     r300GartOffsetFromVirtual(rmesa, texImage->Data),
-			     dstPitch, t->bufAddr, 0, 0, 0, 0, width, height);
-	} else {
-		/* Data not in GART memory, or bad pitch.
-		 */
-		for (done = 0; done < height;) {
-			struct r300_dma_region region;
-			int lines =
-			    MIN2(height - done, RADEON_BUFFER_SIZE / dstPitch);
-			int src_pitch;
-			char *tex;
-
-			src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-			tex = (char *)texImage->Data + done * src_pitch;
-
-			memset(&region, 0, sizeof(region));
-			r300AllocDmaRegion(rmesa, &region, lines * dstPitch,
-					   1024);
-
-			/* Copy texdata to dma:
-			 */
-			if (RADEON_DEBUG & DEBUG_TEXTURE)
-				fprintf(stderr,
-					"%s: src_pitch %d dst_pitch %d\n",
-					__FUNCTION__, src_pitch, dstPitch);
-
-			if (src_pitch == dstPitch) {
-				memcpy(region.address + region.start, tex,
-				       lines * src_pitch);
-			} else {
-				char *buf = region.address + region.start;
-				int i;
-				for (i = 0; i < lines; i++) {
-					memcpy(buf, tex, src_pitch);
-					buf += dstPitch;
-					tex += src_pitch;
-				}
-			}
-
-			r300EmitWait(rmesa, R300_WAIT_3D);
-
-			/* Blit to framebuffer
-			 */
-			r300EmitBlit(rmesa,
-				     blit_format,
-				     dstPitch, GET_START(&region),
-				     dstPitch | (t->tile_bits >> 16),
-				     t->bufAddr, 0, 0, 0, done, width, lines);
-
-			r300EmitWait(rmesa, R300_WAIT_2D);
-#ifdef USER_BUFFERS
-			r300_mem_use(rmesa, region.buf->id);
-#endif
-
-			r300ReleaseDmaRegion(rmesa, &region, __FUNCTION__);
-			done += lines;
-		}
-	}
-}
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void r300UploadSubImage(r300ContextPtr rmesa, r300TexObjPtr t,
-			       GLint hwlevel,
-			       GLint x, GLint y, GLint width, GLint height,
-			       GLuint face)
-{
-	struct gl_texture_image *texImage = NULL;
-	GLuint offset;
-	GLint imageWidth, imageHeight;
-	GLint ret;
-	drm_radeon_texture_t tex;
-	drm_radeon_tex_image_t tmp;
-	const int level = hwlevel + t->base.firstLevel;
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE) {
-		fprintf(stderr,
-			"%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n",
-			__FUNCTION__, (void *)t, (void *)t->base.tObj, level,
-			width, height, face);
-	}
-
-	ASSERT(face < 6);
-
-	/* Ensure we have a valid texture to upload */
-	if ((hwlevel < 0) || (hwlevel >= R300_MAX_TEXTURE_LEVELS)) {
-		_mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-		return;
-	}
-
-	texImage = t->base.tObj->Image[face][level];
-
-	if (!texImage) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: texImage %d is NULL!\n",
-				__FUNCTION__, level);
-		return;
-	}
-	if (!texImage->Data) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is NULL!\n",
-				__FUNCTION__);
-		return;
-	}
-
-	if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		assert(level == 0);
-		assert(hwlevel == 0);
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr, "%s: image data is rectangular\n",
-				__FUNCTION__);
-		r300UploadRectSubImage(rmesa, t, texImage, x, y, width, height);
-		return;
-	} else if (texImage->IsClientData) {
-		if (RADEON_DEBUG & DEBUG_TEXTURE)
-			fprintf(stderr,
-				"%s: image data is in GART client storage\n",
-				__FUNCTION__);
-		r300UploadGARTClientSubImage(rmesa, t, texImage, hwlevel, x, y,
-					     width, height);
-		return;
-	} else if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "%s: image data is in normal memory\n",
-			__FUNCTION__);
-
-	imageWidth = texImage->Width;
-	imageHeight = texImage->Height;
-
-	offset = t->bufAddr;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		GLint imageX = 0;
-		GLint imageY = 0;
-		GLint blitX = t->image[face][hwlevel].x;
-		GLint blitY = t->image[face][hwlevel].y;
-		GLint blitWidth = t->image[face][hwlevel].width;
-		GLint blitHeight = t->image[face][hwlevel].height;
-		fprintf(stderr, "   upload image: %d,%d at %d,%d\n",
-			imageWidth, imageHeight, imageX, imageY);
-		fprintf(stderr, "   upload  blit: %d,%d at %d,%d\n",
-			blitWidth, blitHeight, blitX, blitY);
-		fprintf(stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-			(GLuint) offset, hwlevel, level);
-	}
-
-	t->image[face][hwlevel].data = texImage->Data;
-
-	/* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-	 * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-	 * We used to use 1, 2 and 4-byte texels and used to use the texture
-	 * width to dictate the blit width - but that won't work for compressed
-	 * textures. (Brian)
-	 * NOTE: can't do that with texture tiling. (sroland)
-	 */
-	tex.offset = offset;
-	tex.image = &tmp;
-	/* copy (x,y,width,height,data) */
-	memcpy(&tmp, &t->image[face][hwlevel], sizeof(tmp));
-
-	if (texImage->TexFormat->TexelBytes > 4) {
-		const int log2TexelBytes =
-		    (3 + (texImage->TexFormat->TexelBytes >> 4));
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.height = imageHeight;
-		tex.width = imageWidth << log2TexelBytes;
-		tex.offset += (tmp.x << log2TexelBytes) & ~1023;
-		tmp.x = tmp.x % (1024 >> log2TexelBytes);
-		tmp.width = tmp.width << log2TexelBytes;
-	} else if (texImage->TexFormat->TexelBytes) {
-		/* use multi-byte upload scheme */
-		tex.height = imageHeight;
-		tex.width = imageWidth;
-		switch (texImage->TexFormat->TexelBytes) {
-		case 1:
-			tex.format = RADEON_TXFORMAT_I8;
-			break;
-		case 2:
-			tex.format = RADEON_TXFORMAT_AI88;
-			break;
-		case 4:
-			tex.format = RADEON_TXFORMAT_ARGB8888;
-			break;
-		}
-		tex.pitch =
-		    MAX2((texImage->Width * texImage->TexFormat->TexelBytes) /
-			 64, 1);
-		tex.offset += tmp.x & ~1023;
-		tmp.x = tmp.x % 1024;
-
-		if (t->tile_bits & R300_TXO_MICRO_TILE) {
-			/* need something like "tiled coordinates" ? */
-			tmp.y = tmp.x / (tex.pitch * 128) * 2;
-			tmp.x =
-			    tmp.x % (tex.pitch * 128) / 2 /
-			    texImage->TexFormat->TexelBytes;
-			tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-		} else {
-			tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-		}
-#if 1
-		if ((t->tile_bits & R300_TXO_MACRO_TILE) &&
-		    (texImage->Width * texImage->TexFormat->TexelBytes >= 256)
-		    && ((!(t->tile_bits & R300_TXO_MICRO_TILE)
-			 && (texImage->Height >= 8))
-			|| (texImage->Height >= 16))) {
-			/* weird: R200 disables macro tiling if mip width is smaller than 256 bytes,
-			   OR if height is smaller than 8 automatically, but if micro tiling is active
-			   the limit is height 16 instead ? */
-			tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-		}
-#endif
-	} else {
-		/* In case of for instance 8x8 texture (2x2 dxt blocks),
-		   padding after the first two blocks is needed (only
-		   with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-		/* set tex.height to 1/4 since 1 "macropixel" (dxt-block)
-		   has 4 real pixels. Needed so the kernel module reads
-		   the right amount of data. */
-		tex.format = RADEON_TXFORMAT_I8;	/* any 1-byte texel format */
-		tex.pitch = (R300_BLIT_WIDTH_BYTES / 64);
-		tex.height = (imageHeight + 3) / 4;
-		tex.width = (imageWidth + 3) / 4;
-		if ((t->format & R300_TX_FORMAT_DXT1) == R300_TX_FORMAT_DXT1) {
-			tex.width *= 8;
-		} else {
-			tex.width *= 16;
-		}
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-	do {
-		ret =
-		    drmCommandWriteRead(rmesa->radeon.dri.fd,
-					DRM_RADEON_TEXTURE, &tex,
-					sizeof(drm_radeon_texture_t));
-		if (ret) {
-			if (RADEON_DEBUG & DEBUG_IOCTL)
-				fprintf(stderr,
-					"DRM_RADEON_TEXTURE:  again!\n");
-			usleep(1);
-		}
-	} while (ret == -EAGAIN);
-
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	if (ret) {
-		fprintf(stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret);
-		fprintf(stderr, "   offset=0x%08x\n", offset);
-		fprintf(stderr, "   image width=%d height=%d\n",
-			imageWidth, imageHeight);
-		fprintf(stderr, "    blit width=%d height=%d data=%p\n",
-			t->image[face][hwlevel].width,
-			t->image[face][hwlevel].height,
-			t->image[face][hwlevel].data);
-		_mesa_exit(-1);
-	}
-}
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- *
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int r300UploadTexImages(r300ContextPtr rmesa, r300TexObjPtr t, GLuint face)
-{
-	const int numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-	if (t->image_override)
-		return 0;
-
-	if (RADEON_DEBUG & (DEBUG_TEXTURE | DEBUG_IOCTL)) {
-		fprintf(stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-			(void *)rmesa->radeon.glCtx, (void *)t->base.tObj,
-			t->base.totalSize, t->base.firstLevel,
-			t->base.lastLevel);
-	}
-
-	if (t->base.totalSize == 0)
-		return 0;
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	LOCK_HARDWARE(&rmesa->radeon);
-
-	if (t->base.memBlock == NULL) {
-		int heap;
-
-		heap = driAllocateTexture(rmesa->texture_heaps, rmesa->nr_heaps,
-					  (driTextureObject *) t);
-		if (heap == -1) {
-			UNLOCK_HARDWARE(&rmesa->radeon);
-			return -1;
-		}
-
-		/* Set the base offset of the texture image */
-		t->bufAddr = rmesa->radeon.radeonScreen->texOffset[heap]
-		    + t->base.memBlock->ofs;
-		t->offset = t->bufAddr;
-
-		if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-			/* hope it's safe to add that here... */
-			t->offset |= t->tile_bits;
-		}
-	}
-
-	/* Let the world know we've used this memory recently.
-	 */
-	driUpdateTextureLRU((driTextureObject *) t);
-	UNLOCK_HARDWARE(&rmesa->radeon);
-
-	/* Upload any images that are new */
-	if (t->base.dirty_images[face]) {
-		int i;
-		for (i = 0; i < numLevels; i++) {
-			if ((t->base.
-			     dirty_images[face] & (1 <<
-						   (i + t->base.firstLevel))) !=
-			    0) {
-				r300UploadSubImage(rmesa, t, i, 0, 0,
-						   t->image[face][i].width,
-						   t->image[face][i].height,
-						   face);
-			}
-		}
-		t->base.dirty_images[face] = 0;
-	}
-
-	if (RADEON_DEBUG & DEBUG_SYNC) {
-		fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-		radeonFinish(rmesa->radeon.glCtx);
-	}
-
-	return 0;
-}
diff --git a/src/mesa/drivers/dri/r300/r300_texstate.c b/src/mesa/drivers/dri/r300/r300_texstate.c
index f6ae4b675b8..6f489ace7bf 100644
--- a/src/mesa/drivers/dri/r300/r300_texstate.c
+++ b/src/mesa/drivers/dri/r300/r300_texstate.c
@@ -47,7 +47,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "r300_context.h"
 #include "r300_state.h"
 #include "r300_ioctl.h"
-#include "radeon_ioctl.h"
+#include "radeon_mipmap_tree.h"
 #include "r300_tex.h"
 #include "r300_reg.h"
 
@@ -117,7 +117,12 @@ static const struct tx_table {
 	_ASSIGN(INTENSITY_FLOAT16, R300_EASY_TX_FORMAT(X, X, X, X, FL_I16)),
 	_ASSIGN(Z16, R300_EASY_TX_FORMAT(X, X, X, X, X16)),
 	_ASSIGN(Z24_S8, R300_EASY_TX_FORMAT(X, X, X, X, X24_Y8)),
+	_ASSIGN(S8_Z24, R300_EASY_TX_FORMAT(Y, Y, Y, Y, X24_Y8)),
 	_ASSIGN(Z32, R300_EASY_TX_FORMAT(X, X, X, X, X32)),
+	/* EXT_texture_sRGB */
+	_ASSIGN(SRGBA8, R300_EASY_TX_FORMAT(Y, Z, W, X, W8Z8Y8X8) | R300_TX_FORMAT_GAMMA),
+	_ASSIGN(SLA8, R300_EASY_TX_FORMAT(X, X, X, Y, Y8X8) | R300_TX_FORMAT_GAMMA),
+	_ASSIGN(SL8, R300_EASY_TX_FORMAT(X, X, X, ONE, X8) | R300_TX_FORMAT_GAMMA),
 	/* *INDENT-ON* */
 };
 
@@ -143,13 +148,12 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 		},
 	};
 	const GLuint *format;
-	r300TexObjPtr t;
+	radeonTexObjPtr t;
 
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
+	t = radeon_tex_obj(tObj);
 
 	switch (tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat) {
 	case MESA_FORMAT_Z16:
@@ -171,13 +175,13 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 	switch (tObj->DepthMode) {
 	case GL_LUMINANCE:
-		t->format = format[0];
+		t->pp_txformat = format[0];
 		break;
 	case GL_INTENSITY:
-		t->format = format[1];
+		t->pp_txformat = format[1];
 		break;
 	case GL_ALPHA:
-		t->format = format[2];
+		t->pp_txformat = format[2];
 		break;
 	default:
 		/* Error...which should have already been caught by higher
@@ -190,486 +194,307 @@ void r300SetDepthTexMode(struct gl_texture_object *tObj)
 
 
 /**
- * Compute sizes and fill in offset and blit information for the given
- * image (determined by \p face and \p level).
- *
- * \param curOffset points to the offset at which the image is to be stored
- * and is updated by this function according to the size of the image.
- */
-static void compute_tex_image_offset(
-	struct gl_texture_object *tObj,
-	GLuint face,
-	GLint level,
-	GLint* curOffset)
-{
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image* texImage;
-	GLuint blitWidth = R300_BLIT_WIDTH_BYTES;
-	GLuint texelBytes;
-	GLuint size;
-
-	texImage = tObj->Image[0][level + t->base.firstLevel];
-	if (!texImage)
-		return;
-
-	texelBytes = texImage->TexFormat->TexelBytes;
-
-	/* find image size in bytes */
-	if (texImage->IsCompressed) {
-		if ((t->format & R300_TX_FORMAT_DXT1) ==
-			R300_TX_FORMAT_DXT1) {
-			// fprintf(stderr,"DXT 1 %d %08X\n", texImage->Width, t->format);
-			if ((texImage->Width + 3) < 8)	/* width one block */
-				size = texImage->CompressedSize * 4;
-			else if ((texImage->Width + 3) < 16)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		} else {
-			/* DXT3/5, 16 bytes per block */
-			WARN_ONCE
-				("DXT 3/5 suffers from multitexturing problems!\n");
-			// fprintf(stderr,"DXT 3/5 %d\n", texImage->Width);
-			if ((texImage->Width + 3) < 8)
-				size = texImage->CompressedSize * 2;
-			else
-				size = texImage->CompressedSize;
-		}
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		size =
-			((texImage->Width * texelBytes +
-			63) & ~63) * texImage->Height;
-		blitWidth = 64 / texelBytes;
-	} else if (t->tile_bits & R300_TXO_MICRO_TILE) {
-		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-			though the actual offset may be different (if texture is less than
-			32 bytes width) to the untiled case */
-		int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-		size =
-			(w * ((texImage->Height + 1) / 2)) *
-			texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	} else {
-		int w = (texImage->Width * texelBytes + 31) & ~31;
-		size = w * texImage->Height * texImage->Depth;
-		blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-	}
-	assert(size > 0);
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr, "w=%d h=%d d=%d tb=%d intFormat=%d\n",
-			texImage->Width, texImage->Height,
-			texImage->Depth,
-			texImage->TexFormat->TexelBytes,
-			texImage->InternalFormat);
-
-	/* All images are aligned to a 32-byte offset */
-	*curOffset = (*curOffset + 0x1f) & ~0x1f;
-
-	if (texelBytes) {
-		/* fix x and y coords up later together with offset */
-		t->image[face][level].x = *curOffset;
-		t->image[face][level].y = 0;
-		t->image[face][level].width =
-			MIN2(size / texelBytes, blitWidth);
-		t->image[face][level].height =
-			(size / texelBytes) / t->image[face][level].width;
-	} else {
-		t->image[face][level].x = *curOffset % R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].y = *curOffset / R300_BLIT_WIDTH_BYTES;
-		t->image[face][level].width =
-			MIN2(size, R300_BLIT_WIDTH_BYTES);
-		t->image[face][level].height = size / t->image[face][level].width;
-	}
-
-	if (RADEON_DEBUG & DEBUG_TEXTURE)
-		fprintf(stderr,
-			"level %d, face %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-			level, face, texImage->Width, texImage->Height,
-			t->image[face][level].x, t->image[face][level].y,
-			t->image[face][level].width, t->image[face][level].height,
-			size, *curOffset);
-
-	*curOffset += size;
-}
-
-
-
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c filter, \c format, etc. will be set here
- * too.
+ * Compute the cached hardware register values for the given texture object.
  *
  * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
+ * \param t the r300 texture object
  */
-static void r300SetTexImages(r300ContextPtr rmesa,
-			     struct gl_texture_object *tObj)
+static void setup_hardware_state(r300ContextPtr rmesa, radeonTexObj *t)
 {
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	const struct gl_texture_image *baseImage =
-	    tObj->Image[0][tObj->BaseLevel];
-	GLint curOffset;
-	GLint i, texelBytes;
-	GLint numLevels;
-	GLint log2Width, log2Height, log2Depth;
-
-	/* Set the hardware texture format
-	 */
+	const struct gl_texture_image *firstImage;
+	int firstlevel = t->mt ? t->mt->firstLevel : 0;
+	    
+	firstImage = t->base.Image[0][firstlevel];
+
 	if (!t->image_override
-	    && VALID_FORMAT(baseImage->TexFormat->MesaFormat)) {
-		if (baseImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
-			r300SetDepthTexMode(tObj);
+	    && VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+		if (firstImage->TexFormat->BaseFormat == GL_DEPTH_COMPONENT) {
+			r300SetDepthTexMode(&t->base);
 		} else {
-			t->format = tx_table[baseImage->TexFormat->MesaFormat].format;
+			t->pp_txformat = tx_table[firstImage->TexFormat->MesaFormat].format;
 		}
 
-		t->filter |= tx_table[baseImage->TexFormat->MesaFormat].filter;
+		t->pp_txfilter |= tx_table[firstImage->TexFormat->MesaFormat].filter;
 	} else if (!t->image_override) {
 		_mesa_problem(NULL, "unexpected texture format in %s",
 			      __FUNCTION__);
 		return;
 	}
 
-	texelBytes = baseImage->TexFormat->TexelBytes;
-
-	/* Compute which mipmap levels we really want to send to the hardware.
-	 */
-	driCalculateTextureFirstLastLevel((driTextureObject *) t);
-	log2Width = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-	log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-	log2Depth = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-	numLevels = t->base.lastLevel - t->base.firstLevel + 1;
+	if (t->image_override && t->bo)
+		return;
 
-	assert(numLevels <= R300_MAX_TEXTURE_LEVELS);
+	t->pp_txsize = (((firstImage->Width - 1) << R300_TX_WIDTHMASK_SHIFT)
+			| ((firstImage->Height - 1) << R300_TX_HEIGHTMASK_SHIFT)
+			| ((firstImage->DepthLog2) << R300_TX_DEPTHMASK_SHIFT)
+			| ((t->mt->lastLevel - t->mt->firstLevel) << R300_TX_MAX_MIP_LEVEL_SHIFT));
 
-	/* Calculate mipmap offsets and dimensions for blitting (uploading)
-	 * The idea is that we lay out the mipmap levels within a block of
-	 * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-	 */
 	t->tile_bits = 0;
 
-	/* figure out if this texture is suitable for tiling. */
-#if 0				/* Disabled for now */
-	if (texelBytes) {
-		if ((tObj->Target != GL_TEXTURE_RECTANGLE_NV) &&
-		    /* texrect might be able to use micro tiling too in theory? */
-		    (baseImage->Height > 1)) {
-
-			/* allow 32 (bytes) x 1 mip (which will use two times the space
-			   the non-tiled version would use) max if base texture is large enough */
-			if ((numLevels == 1) ||
-			    (((baseImage->Width * texelBytes /
-			       baseImage->Height) <= 32)
-			     && (baseImage->Width * texelBytes > 64))
-			    ||
-			    ((baseImage->Width * texelBytes /
-			      baseImage->Height) <= 16)) {
-				t->tile_bits |= R300_TXO_MICRO_TILE;
-			}
-		}
-
-		if (tObj->Target != GL_TEXTURE_RECTANGLE_NV) {
-			/* we can set macro tiling even for small textures, they will be untiled anyway */
-			t->tile_bits |= R300_TXO_MACRO_TILE;
-		}
-	}
-#endif
-
-	curOffset = 0;
+	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
+		t->pp_txformat |= R300_TX_FORMAT_CUBIC_MAP;
+	if (t->base.Target == GL_TEXTURE_3D)
+		t->pp_txformat |= R300_TX_FORMAT_3D;
 
-	if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-		ASSERT(log2Width == log2Height);
-		t->format |= R300_TX_FORMAT_CUBIC_MAP;
 
-		for(i = 0; i < numLevels; i++) {
-			GLuint face;
-			for(face = 0; face < 6; face++)
-				compute_tex_image_offset(tObj, face, i, &curOffset);
-		}
-	} else {
-		if (tObj->Target == GL_TEXTURE_3D)
-                	t->format |= R300_TX_FORMAT_3D;
-
-		for (i = 0; i < numLevels; i++)
-			compute_tex_image_offset(tObj, 0, i, &curOffset);
-	}
-
-	/* Align the total size of texture memory block.
-	 */
-	t->base.totalSize =
-	    (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-	t->size =
-	    (((tObj->Image[0][t->base.firstLevel]->Width -
-	       1) << R300_TX_WIDTHMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->Height - 1) <<
-		R300_TX_HEIGHTMASK_SHIFT)
-	     | ((tObj->Image[0][t->base.firstLevel]->DepthLog2) <<
-		R300_TX_DEPTHMASK_SHIFT))
-	    | ((numLevels - 1) << R300_TX_MAX_MIP_LEVEL_SHIFT);
-
-	t->pitch = 0;
-
-	/* Only need to round to nearest 32 for textures, but the blitter
-	 * requires 64-byte aligned pitches, and we may/may not need the
-	 * blitter.   NPOT only!
-	 */
-	if (baseImage->IsCompressed) {
-		t->pitch |=
-		    (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-	} else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-		unsigned int align = (64 / texelBytes) - 1;
-		t->pitch |= ((tObj->Image[0][t->base.firstLevel]->Width *
-			     texelBytes) + 63) & ~(63);
-		t->size |= R300_TX_SIZE_TXPITCH_EN;
+	if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+		unsigned int align = (64 / t->mt->bpp) - 1;
+		t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
 		if (!t->image_override)
-			t->pitch_reg =
-			    (((tObj->Image[0][t->base.firstLevel]->Width) +
-			      align) & ~align) - 1;
-	} else {
-		t->pitch |=
-		    ((tObj->Image[0][t->base.firstLevel]->Width *
-		      texelBytes) + 63) & ~(63);
+			t->pp_txpitch = ((firstImage->Width + align) & ~align) - 1;
 	}
 
 	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
-	    if (tObj->Image[0][t->base.firstLevel]->Width > 2048)
-		t->pitch_reg |= R500_TXWIDTH_BIT11;
-	    if (tObj->Image[0][t->base.firstLevel]->Height > 2048)
-		t->pitch_reg |= R500_TXHEIGHT_BIT11;
+	    if (firstImage->Width > 2048)
+		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+	    if (firstImage->Height > 2048)
+		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
 	}
 }
 
-/* ================================================================
- * Texture unit state management
+/**
+ * Ensure the given texture is ready for rendering.
+ *
+ * Mostly this means populating the texture object's mipmap tree.
  */
-
-static GLboolean r300EnableTexture2D(GLcontext * ctx, int unit)
+static GLboolean r300_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	radeonTexObj *t = radeon_tex_obj(texObj);
 
-	ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
-
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
+	if (!radeon_validate_texture_miptree(ctx, texObj))
+		return GL_FALSE;
 
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override)
-			return GL_FALSE;
-	}
+	/* Configure the hardware registers (more precisely, the cached version
+	 * of the hardware registers). */
+	setup_hardware_state(rmesa, t);
 
+	t->validated = GL_TRUE;
 	return GL_TRUE;
 }
 
-static GLboolean r300EnableTexture3D(GLcontext * ctx, int unit)
+/**
+ * Ensure all enabled and complete textures are uploaded along with any buffers being used.
+ */
+GLboolean r300ValidateBuffers(GLcontext * ctx)
 {
 	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
+	struct radeon_renderbuffer *rrb;
+	int i;
+	int ret;
 
-	ASSERT(tObj->Target == GL_TEXTURE_3D);
+	radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
 
-	/* r300 does not support mipmaps for 3D textures. */
-	if ((tObj->MinFilter != GL_NEAREST) && (tObj->MinFilter != GL_LINEAR)) {
-		return GL_FALSE;
+	rrb = radeon_get_colorbuffer(&rmesa->radeon);
+	/* color buffer */
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
 	}
 
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock)
-			return GL_FALSE;
+	/* depth buffer */
+	rrb = radeon_get_depthbuffer(&rmesa->radeon);
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
 	}
+	
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+		radeonTexObj *t;
 
-	return GL_TRUE;
-}
-
-static GLboolean r300EnableTextureCube(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-	GLuint face;
-
-	ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
-
-	if (t->base.dirty_images[0] || t->base.dirty_images[1] ||
-	    t->base.dirty_images[2] || t->base.dirty_images[3] ||
-	    t->base.dirty_images[4] || t->base.dirty_images[5]) {
-		/* flush */
-		R300_FIREVERTICES(rmesa);
-		/* layout memory space, once for all faces */
-		r300SetTexImages(rmesa, tObj);
-	}
+		if (!ctx->Texture.Unit[i]._ReallyEnabled)
+			continue;
 
-	/* upload (per face) */
-	for (face = 0; face < 6; face++) {
-		if (t->base.dirty_images[face]) {
-			r300UploadTexImages(rmesa,
-					    (r300TexObjPtr) tObj->DriverData,
-					    face);
+		if (!r300_validate_texture(ctx, ctx->Texture.Unit[i]._Current)) {
+			_mesa_warning(ctx,
+				      "failed to validate texture for unit %d.\n",
+				      i);
 		}
+		t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+		if (t->image_override && t->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+		else if (t->mt->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->mt->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
 	}
 
-	if (!t->base.memBlock) {
-		/* texmem alloc failed, use s/w fallback */
+	ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, rmesa->radeon.dma.current, RADEON_GEM_DOMAIN_GTT, 0);
+	if (ret)
 		return GL_FALSE;
-	}
-
 	return GL_TRUE;
 }
 
-static GLboolean r300EnableTextureRect(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_Current;
-	r300TexObjPtr t = (r300TexObjPtr) tObj->DriverData;
-
-	ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
-
-	if (t->base.dirty_images[0]) {
-		R300_FIREVERTICES(rmesa);
-
-		r300SetTexImages(rmesa, tObj);
-		r300UploadTexImages(rmesa, (r300TexObjPtr) tObj->DriverData, 0);
-		if (!t->base.memBlock && !t->image_override &&
-		    !rmesa->prefer_gart_client_texturing)
-			return GL_FALSE;
-	}
-
-	return GL_TRUE;
-}
-
-static GLboolean r300UpdateTexture(GLcontext * ctx, int unit)
-{
-	r300ContextPtr rmesa = R300_CONTEXT(ctx);
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-	struct gl_texture_object *tObj = texUnit->_ReallyEnabled ?
-		texUnit->_Current : NULL;
-	r300TexObjPtr t = tObj ? (r300TexObjPtr) tObj->DriverData : NULL;
-
-	/* Fallback if there's a texture border */
-	if (tObj && tObj->Image[0][tObj->BaseLevel]->Border > 0) {
-		tObj = NULL;
-		t = NULL;
-	}
-
-	/* Update state if this is a different texture object to last
-	 * time.
-	 */
-	if (rmesa->state.texture.unit[unit].texobj != tObj) {
-		if (rmesa->state.texture.unit[unit].texobj != NULL) {
-			r300TexObjPtr t_old = (r300TexObjPtr) rmesa->state.texture.unit[unit].texobj->DriverData;
-
-			/* The old texture is no longer bound to this texture unit.
-			 * Mark it as such.
-			 */
-
-			t_old->base.bound &= ~(1 << unit);
-		}
-
-		_mesa_reference_texobj(&rmesa->state.texture.unit[unit].texobj, tObj);
-
-		if (t) {
-			t->base.bound |= (1 << unit);
-			driUpdateTextureLRU(&t->base);	/* XXX: should be locked! */
-		}
-	}
-
-	return !t || !t->border_fallback;
-}
-
 void r300SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 		      unsigned long long offset, GLint depth, GLuint pitch)
 {
 	r300ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
 	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
-	r300TexObjPtr t;
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
 	uint32_t pitch_val;
 
 	if (!tObj)
 		return;
 
-	t = (r300TexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
 
-	t->offset = offset;
-	t->pitch_reg &= (1 << 13) -1;
+	t->bo = NULL;
+	t->override_offset = offset;
+	t->pp_txpitch &= (1 << 13) -1;
 	pitch_val = pitch;
 
 	switch (depth) {
 	case 32:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
-		t->filter |= tx_table[2].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[2].filter;
 		pitch_val /= 4;
 		break;
 	case 24:
 	default:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
-		t->filter |= tx_table[4].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[4].filter;
 		pitch_val /= 4;
 		break;
 	case 16:
-		t->format = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
-		t->filter |= tx_table[5].filter;
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		t->pp_txfilter |= tx_table[5].filter;
 		pitch_val /= 2;
 		break;
 	}
 	pitch_val--;
 
-	t->pitch_reg |= pitch_val;
+	t->pp_txpitch |= pitch_val;
 }
 
-static GLboolean r300UpdateTextureUnit(GLcontext * ctx, int unit)
+void r300SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format, __DRIdrawable *dPriv)
 {
-	struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-
-	if (texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT)) {
-		return (r300EnableTextureRect(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT)) {
-		return (r300EnableTexture2D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_3D_BIT)) {
-		return (r300EnableTexture3D(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT)) {
-		return (r300EnableTextureCube(ctx, unit) &&
-			r300UpdateTexture(ctx, unit));
-	} else if (texUnit->_ReallyEnabled) {
-		return GL_FALSE;
-	} else {
-		return r300UpdateTexture(ctx, unit);
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	r300ContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+
+	type = GL_BGRA;
+	format = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate;
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* back & depth buffer are useless free them right away */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
 	}
+	rb = rfb->color_rb[0];
+	if (rb->bo == NULL) {
+		/* Failed to BO for the buffer */
+		return;
+	}
+	
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->tile_bits = 0;
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		else
+			t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, W, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[2].filter;
+		pitch_val /= 4;
+		break;
+	case 3:
+	default:
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, W8Z8Y8X8);
+		t->pp_txfilter |= tx_table[4].filter;
+		pitch_val /= 4;
+		break;
+	case 2:
+		t->pp_txformat = R300_EASY_TX_FORMAT(X, Y, Z, ONE, Z5Y6X5);
+		t->pp_txfilter |= tx_table[5].filter;
+		pitch_val /= 2;
+		break;
+	}
+	pitch_val--;
+	t->pp_txsize = ((rb->base.Width - 1) << R300_TX_WIDTHMASK_SHIFT) |
+              ((rb->base.Height - 1) << R300_TX_HEIGHTMASK_SHIFT);
+	t->pp_txsize |= R300_TX_SIZE_TXPITCH_EN;
+	t->pp_txpitch |= pitch_val;
+
+	if (rmesa->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV515) {
+	    if (rb->base.Width > 2048)
+		t->pp_txpitch |= R500_TXWIDTH_BIT11;
+	    if (rb->base.Height > 2048)
+		t->pp_txpitch |= R500_TXHEIGHT_BIT11;
+	}
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
 }
 
-void r300UpdateTextureState(GLcontext * ctx)
+void r300SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
 {
-	int i;
-
-	for (i = 0; i < 8; i++) {
-		if (!r300UpdateTextureUnit(ctx, i)) {
-			_mesa_warning(ctx,
-				      "failed to update texture state for unit %d.\n",
-				      i);
-		}
-	}
+        r300SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
 }
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.c b/src/mesa/drivers/dri/r300/r300_vertprog.c
index 146daa367cd..de32013032f 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.c
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.c
@@ -32,12 +32,17 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/macros.h"
 #include "main/enums.h"
 #include "shader/program.h"
+#include "shader/programopt.h"
 #include "shader/prog_instruction.h"
+#include "shader/prog_optimize.h"
 #include "shader/prog_parameter.h"
+#include "shader/prog_print.h"
 #include "shader/prog_statevars.h"
 #include "tnl/tnl.h"
 
+#include "radeon_nqssadce.h"
 #include "r300_context.h"
+#include "r300_state.h"
 
 /* TODO: Get rid of t_src_class call */
 #define CMP_SRCS(a, b) ((a.RelAddr != b.RelAddr) || (a.Index != b.Index && \
@@ -64,20 +69,18 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 		int u_temp_used = (VSF_MAX_FRAGMENT_TEMPS - 1) - u_temp_i; \
 		if((vp->num_temporaries + u_temp_used) > VSF_MAX_FRAGMENT_TEMPS) { \
 			WARN_ONCE("Ran out of temps, num temps %d, us %d\n", vp->num_temporaries, u_temp_used); \
-			vp->native = GL_FALSE; \
+			vp->error = GL_TRUE; \
 		} \
 		u_temp_i=VSF_MAX_FRAGMENT_TEMPS-1; \
 	} while (0)
 
-int r300VertexProgUpdateParams(GLcontext * ctx,
-			       struct r300_vertex_program_cont *vp, float *dst)
+static int r300VertexProgUpdateParams(GLcontext * ctx, struct gl_vertex_program *vp, float *dst)
 {
 	int pi;
-	struct gl_vertex_program *mesa_vp = &vp->mesa_program;
 	float *dst_o = dst;
 	struct gl_program_parameter_list *paramList;
 
-	if (mesa_vp->IsNVProgram) {
+	if (vp->IsNVProgram) {
 		_mesa_load_tracked_matrices(ctx);
 
 		for (pi = 0; pi < MAX_NV_VERTEX_PROGRAM_PARAMS; pi++) {
@@ -89,16 +92,18 @@ int r300VertexProgUpdateParams(GLcontext * ctx,
 		return dst - dst_o;
 	}
 
-	assert(mesa_vp->Base.Parameters);
-	_mesa_load_state_parameters(ctx, mesa_vp->Base.Parameters);
+	if (!vp->Base.Parameters)
+		return 0;
 
-	if (mesa_vp->Base.Parameters->NumParameters * 4 >
+	_mesa_load_state_parameters(ctx, vp->Base.Parameters);
+
+	if (vp->Base.Parameters->NumParameters * 4 >
 	    VSF_MAX_FRAGMENT_LENGTH) {
 		fprintf(stderr, "%s:Params exhausted\n", __FUNCTION__);
 		_mesa_exit(-1);
 	}
 
-	paramList = mesa_vp->Base.Parameters;
+	paramList = vp->Base.Parameters;
 	for (pi = 0; pi < paramList->NumParameters; pi++) {
 		switch (paramList->Parameters[pi].Type) {
 		case PROGRAM_STATE_VAR:
@@ -214,21 +219,8 @@ static void vp_dump_inputs(struct r300_vertex_program *vp, char *caller)
 static unsigned long t_src_index(struct r300_vertex_program *vp,
 				 struct prog_src_register *src)
 {
-	int i;
-	int max_reg = -1;
-
 	if (src->File == PROGRAM_INPUT) {
-		if (vp->inputs[src->Index] != -1)
-			return vp->inputs[src->Index];
-
-		for (i = 0; i < VERT_ATTRIB_MAX; i++)
-			if (vp->inputs[i] > max_reg)
-				max_reg = vp->inputs[i];
-
-		vp->inputs[src->Index] = max_reg + 1;
-
-		//vp_dump_inputs(vp, __FUNCTION__);
-
+		assert(vp->inputs[src->Index] != -1);
 		return vp->inputs[src->Index];
 	} else {
 		if (src->Index < 0) {
@@ -943,60 +935,80 @@ static GLuint *r300TranslateOpcodeXPD(struct r300_vertex_program *vp,
 static void t_inputs_outputs(struct r300_vertex_program *vp)
 {
 	int i;
-	int cur_reg = 0;
+	int cur_reg;
+	GLuint OutputsWritten, InputsRead;
 
-	for (i = 0; i < VERT_ATTRIB_MAX; i++)
-		vp->inputs[i] = -1;
+	OutputsWritten = vp->Base->Base.OutputsWritten;
+	InputsRead = vp->Base->Base.InputsRead;
+
+	cur_reg = -1;
+	for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+		if (InputsRead & (1 << i))
+			vp->inputs[i] = ++cur_reg;
+		else
+			vp->inputs[i] = -1;
+	}
 
+	cur_reg = 0;
 	for (i = 0; i < VERT_RESULT_MAX; i++)
 		vp->outputs[i] = -1;
 
-	assert(vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS));
+	assert(OutputsWritten & (1 << VERT_RESULT_HPOS));
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_HPOS)) {
+	if (OutputsWritten & (1 << VERT_RESULT_HPOS)) {
 		vp->outputs[VERT_RESULT_HPOS] = cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
+	if (OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
 		vp->outputs[VERT_RESULT_PSIZ] = cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL0)) {
+	/* If we're writing back facing colors we need to send
+	 * four colors to make front/back face colors selection work.
+	 * If the vertex program doesn't write all 4 colors, lets
+	 * pretend it does by skipping output index reg so the colors
+	 * get written into appropriate output vectors.
+	 */
+	if (OutputsWritten & (1 << VERT_RESULT_COL0)) {
 		vp->outputs[VERT_RESULT_COL0] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_COL1)) {
-		vp->outputs[VERT_RESULT_COL1] =
-		    vp->outputs[VERT_RESULT_COL0] + 1;
-		cur_reg = vp->outputs[VERT_RESULT_COL1] + 1;
+	if (OutputsWritten & (1 << VERT_RESULT_COL1)) {
+		vp->outputs[VERT_RESULT_COL1] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC0) ||
+		OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC0)) {
-		vp->outputs[VERT_RESULT_BFC0] =
-		    vp->outputs[VERT_RESULT_COL0] + 2;
-		cur_reg = vp->outputs[VERT_RESULT_BFC0] + 2;
+	if (OutputsWritten & (1 << VERT_RESULT_BFC0)) {
+		vp->outputs[VERT_RESULT_BFC0] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		cur_reg++;
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_BFC1)) {
-		vp->outputs[VERT_RESULT_BFC1] =
-		    vp->outputs[VERT_RESULT_COL0] + 3;
-		cur_reg = vp->outputs[VERT_RESULT_BFC1] + 1;
+	if (OutputsWritten & (1 << VERT_RESULT_BFC1)) {
+		vp->outputs[VERT_RESULT_BFC1] = cur_reg++;
+	} else if (OutputsWritten & (1 << VERT_RESULT_BFC0)) {
+		cur_reg++;
 	}
 
 	for (i = VERT_RESULT_TEX0; i <= VERT_RESULT_TEX7; i++) {
-		if (vp->key.OutputsWritten & (1 << i)) {
+		if (OutputsWritten & (1 << i)) {
 			vp->outputs[i] = cur_reg++;
 		}
 	}
 
-	if (vp->key.OutputsWritten & (1 << VERT_RESULT_FOGC)) {
+	if (OutputsWritten & (1 << VERT_RESULT_FOGC)) {
 		vp->outputs[VERT_RESULT_FOGC] = cur_reg++;
 	}
 }
 
-static void r300TranslateVertexShader(struct r300_vertex_program *vp,
-				      struct prog_instruction *vpi)
+void r300TranslateVertexShader(struct r300_vertex_program *vp)
 {
+	struct prog_instruction *vpi = vp->Base->Base.Instructions;
 	int i;
 	GLuint *inst;
 	unsigned long num_operands;
@@ -1007,14 +1019,13 @@ static void r300TranslateVertexShader(struct r300_vertex_program *vp,
 	struct prog_src_register src[3];
 
 	vp->pos_end = 0;	/* Not supported yet */
-	vp->program.length = 0;
-	/*vp->num_temporaries=mesa_vp->Base.NumTemporaries; */
+	vp->hw_code.length = 0;
 	vp->translated = GL_TRUE;
-	vp->native = GL_TRUE;
+	vp->error = GL_FALSE;
 
 	t_inputs_outputs(vp);
 
-	for (inst = vp->program.body.i; vpi->Opcode != OPCODE_END;
+	for (inst = vp->hw_code.body.d; vpi->Opcode != OPCODE_END;
 	     vpi++, inst += 4) {
 
 		FREE_TEMPS();
@@ -1176,293 +1187,541 @@ static void r300TranslateVertexShader(struct r300_vertex_program *vp,
 						      &u_temp_i);
 			break;
 		default:
-			assert(0);
+			vp->error = GL_TRUE;
 			break;
 		}
 	}
 
-	/* Some outputs may be artificially added, to match the inputs
-	   of the fragment program. Blank the outputs here. */
-	for (i = 0; i < VERT_RESULT_MAX; i++) {
-		if (vp->key.OutputsAdded & (1 << i)) {
-			inst[0] = PVS_OP_DST_OPERAND(VE_ADD,
-						     GL_FALSE,
-						     GL_FALSE,
-						     vp->outputs[i],
-						     VSF_FLAG_ALL,
-						     PVS_DST_REG_OUT);
-			inst[1] = __CONST(0, SWIZZLE_ZERO);
-			inst[2] = __CONST(0, SWIZZLE_ZERO);
-			inst[3] = __CONST(0, SWIZZLE_ZERO);
-			inst += 4;
-		}
-	}
-
-	vp->program.length = (inst - vp->program.body.i);
-	if (vp->program.length >= VSF_MAX_FRAGMENT_LENGTH) {
-		vp->program.length = 0;
-		vp->native = GL_FALSE;
+	vp->hw_code.length = (inst - vp->hw_code.body.d);
+	if (vp->hw_code.length >= VSF_MAX_FRAGMENT_LENGTH) {
+		vp->error = GL_TRUE;
 	}
-#if 0
-	fprintf(stderr, "hw program:\n");
-	for (i = 0; i < vp->program.length; i++)
-		fprintf(stderr, "%08x\n", vp->program.body.d[i]);
-#endif
 }
 
-/* DP4 version seems to trigger some hw peculiarity */
-//#define PREFER_DP4
+static void insert_wpos(struct gl_program *prog, GLuint temp_index, int tex_id)
+{
+	struct prog_instruction *vpi;
+
+	_mesa_insert_instructions(prog, prog->NumInstructions - 1, 2);
+
+	vpi = &prog->Instructions[prog->NumInstructions - 3];
+
+	vpi->Opcode = OPCODE_MOV;
+
+	vpi->DstReg.File = PROGRAM_OUTPUT;
+	vpi->DstReg.Index = VERT_RESULT_HPOS;
+	vpi->DstReg.WriteMask = WRITEMASK_XYZW;
+	vpi->DstReg.CondMask = COND_TR;
 
-static void position_invariant(struct gl_program *prog)
+	vpi->SrcReg[0].File = PROGRAM_TEMPORARY;
+	vpi->SrcReg[0].Index = temp_index;
+	vpi->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+	++vpi;
+
+	vpi->Opcode = OPCODE_MOV;
+
+	vpi->DstReg.File = PROGRAM_OUTPUT;
+	vpi->DstReg.Index = VERT_RESULT_TEX0 + tex_id;
+	vpi->DstReg.WriteMask = WRITEMASK_XYZW;
+	vpi->DstReg.CondMask = COND_TR;
+
+	vpi->SrcReg[0].File = PROGRAM_TEMPORARY;
+	vpi->SrcReg[0].Index = temp_index;
+	vpi->SrcReg[0].Swizzle = SWIZZLE_XYZW;
+
+	++vpi;
+
+	vpi->Opcode = OPCODE_END;
+}
+
+static void pos_as_texcoord(struct gl_program *prog, int tex_id)
 {
 	struct prog_instruction *vpi;
-	struct gl_program_parameter_list *paramList;
-	int i;
+	GLuint tempregi = prog->NumTemporaries;
 
-	gl_state_index tokens[STATE_LENGTH] = { STATE_MVP_MATRIX, 0, 0, 0, 0 };
+	prog->NumTemporaries++;
 
-	/* tokens[4] = matrix modifier */
-#ifdef PREFER_DP4
-	tokens[4] = 0;		/* not transposed or inverted */
-#else
-	tokens[4] = STATE_MATRIX_TRANSPOSE;
-#endif
-	paramList = prog->Parameters;
-
-	vpi = _mesa_alloc_instructions(prog->NumInstructions + 4);
-	_mesa_init_instructions(vpi, prog->NumInstructions + 4);
-
-	for (i = 0; i < 4; i++) {
-		GLint idx;
-		tokens[2] = tokens[3] = i;	/* matrix row[i]..row[i] */
-		idx = _mesa_add_state_reference(paramList, tokens);
-#ifdef PREFER_DP4
-		vpi[i].Opcode = OPCODE_DP4;
-		vpi[i].StringPos = 0;
-		vpi[i].Data = 0;
-
-		vpi[i].DstReg.File = PROGRAM_OUTPUT;
-		vpi[i].DstReg.Index = VERT_RESULT_HPOS;
-		vpi[i].DstReg.WriteMask = 1 << i;
-		vpi[i].DstReg.CondMask = COND_TR;
-
-		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
-		vpi[i].SrcReg[0].Index = idx;
-		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
-		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
-		vpi[i].SrcReg[1].Swizzle = SWIZZLE_XYZW;
-#else
-		if (i == 0)
-			vpi[i].Opcode = OPCODE_MUL;
-		else
-			vpi[i].Opcode = OPCODE_MAD;
+	for (vpi = prog->Instructions; vpi->Opcode != OPCODE_END; vpi++) {
+		if (vpi->DstReg.File == PROGRAM_OUTPUT && vpi->DstReg.Index == VERT_RESULT_HPOS) {
+			vpi->DstReg.File = PROGRAM_TEMPORARY;
+			vpi->DstReg.Index = tempregi;
+		}
+	}
 
-		vpi[i].Data = 0;
+	insert_wpos(prog, tempregi, tex_id);
 
-		if (i == 3)
-			vpi[i].DstReg.File = PROGRAM_OUTPUT;
-		else
-			vpi[i].DstReg.File = PROGRAM_TEMPORARY;
-		vpi[i].DstReg.Index = 0;
-		vpi[i].DstReg.WriteMask = 0xf;
-		vpi[i].DstReg.CondMask = COND_TR;
-
-		vpi[i].SrcReg[0].File = PROGRAM_STATE_VAR;
-		vpi[i].SrcReg[0].Index = idx;
-		vpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-		vpi[i].SrcReg[1].File = PROGRAM_INPUT;
-		vpi[i].SrcReg[1].Index = VERT_ATTRIB_POS;
-		vpi[i].SrcReg[1].Swizzle = MAKE_SWIZZLE4(i, i, i, i);
-
-		if (i > 0) {
-			vpi[i].SrcReg[2].File = PROGRAM_TEMPORARY;
-			vpi[i].SrcReg[2].Index = 0;
-			vpi[i].SrcReg[2].Swizzle = SWIZZLE_XYZW;
+	prog->OutputsWritten |= 1 << (VERT_RESULT_TEX0 + tex_id);
+}
+
+/**
+ * The fogcoord attribute is special in that only the first component
+ * is relevant, and the remaining components are always fixed (when read
+ * from by the fragment program) to yield an X001 pattern.
+ *
+ * We need to enforce this either in the vertex program or in the fragment
+ * program, and this code chooses not to enforce it in the vertex program.
+ * This is slightly cheaper, as long as the fragment program does not use
+ * weird swizzles.
+ *
+ * And it seems that usually, weird swizzles are not used, so...
+ *
+ * See also the counterpart rewriting for fragment programs.
+ */
+static void fog_as_texcoord(struct gl_program *prog, int tex_id)
+{
+	struct prog_instruction *vpi;
+
+	vpi = prog->Instructions;
+	while (vpi->Opcode != OPCODE_END) {
+		if (vpi->DstReg.File == PROGRAM_OUTPUT && vpi->DstReg.Index == VERT_RESULT_FOGC) {
+			vpi->DstReg.Index = VERT_RESULT_TEX0 + tex_id;
+			vpi->DstReg.WriteMask = WRITEMASK_X;
 		}
-#endif
+
+		++vpi;
 	}
 
-	_mesa_copy_instructions(&vpi[i], prog->Instructions,
-				prog->NumInstructions);
+	prog->OutputsWritten &= ~(1 << VERT_RESULT_FOGC);
+	prog->OutputsWritten |= 1 << (VERT_RESULT_TEX0 + tex_id);
+}
 
-	free(prog->Instructions);
+static int translateABS(struct gl_program *prog, int pos)
+{
+	struct prog_instruction *inst;
 
-	prog->Instructions = vpi;
+	inst = &prog->Instructions[pos];
 
-	prog->NumInstructions += 4;
-	vpi = &prog->Instructions[prog->NumInstructions - 1];
+	inst->Opcode = OPCODE_MAX;
+	inst->SrcReg[1] = inst->SrcReg[0];
+	inst->SrcReg[1].Negate ^= NEGATE_XYZW;
 
-	assert(vpi->Opcode == OPCODE_END);
+	return 0;
 }
 
-static void insert_wpos(struct r300_vertex_program *vp, struct gl_program *prog,
-			GLuint temp_index)
+static int translateDP3(struct gl_program *prog, int pos)
 {
-	struct prog_instruction *vpi;
-	struct prog_instruction *vpi_insert;
-	int i = 0;
+	struct prog_instruction *inst;
 
-	vpi = _mesa_alloc_instructions(prog->NumInstructions + 2);
-	_mesa_init_instructions(vpi, prog->NumInstructions + 2);
-	/* all but END */
-	_mesa_copy_instructions(vpi, prog->Instructions,
-				prog->NumInstructions - 1);
-	/* END */
-	_mesa_copy_instructions(&vpi[prog->NumInstructions + 1],
-				&prog->Instructions[prog->NumInstructions - 1],
-				1);
-	vpi_insert = &vpi[prog->NumInstructions - 1];
+	inst = &prog->Instructions[pos];
 
-	vpi_insert[i].Opcode = OPCODE_MOV;
+	inst->Opcode = OPCODE_DP4;
+	inst->SrcReg[0].Swizzle = combine_swizzles4(inst->SrcReg[0].Swizzle, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
 
-	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
-	vpi_insert[i].DstReg.Index = VERT_RESULT_HPOS;
-	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
-	vpi_insert[i].DstReg.CondMask = COND_TR;
+	return 0;
+}
 
-	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	vpi_insert[i].SrcReg[0].Index = temp_index;
-	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-	i++;
+static int translateDPH(struct gl_program *prog, int pos)
+{
+	struct prog_instruction *inst;
 
-	vpi_insert[i].Opcode = OPCODE_MOV;
+	inst = &prog->Instructions[pos];
 
-	vpi_insert[i].DstReg.File = PROGRAM_OUTPUT;
-	vpi_insert[i].DstReg.Index = VERT_RESULT_TEX0 + vp->wpos_idx;
-	vpi_insert[i].DstReg.WriteMask = WRITEMASK_XYZW;
-	vpi_insert[i].DstReg.CondMask = COND_TR;
+	inst->Opcode = OPCODE_DP4;
+	inst->SrcReg[0].Swizzle = combine_swizzles4(inst->SrcReg[0].Swizzle, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ONE);
 
-	vpi_insert[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	vpi_insert[i].SrcReg[0].Index = temp_index;
-	vpi_insert[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-	i++;
+	return 0;
+}
+
+static int translateFLR(struct gl_program *prog, int pos)
+{
+	struct prog_instruction *inst;
+	struct prog_dst_register dst;
+	int tmp_idx;
 
-	free(prog->Instructions);
+	tmp_idx = prog->NumTemporaries++;
 
-	prog->Instructions = vpi;
+	_mesa_insert_instructions(prog, pos + 1, 1);
 
-	prog->NumInstructions += i;
-	vpi = &prog->Instructions[prog->NumInstructions - 1];
+	inst = &prog->Instructions[pos];
+	dst = inst->DstReg;
 
-	assert(vpi->Opcode == OPCODE_END);
+	inst->Opcode = OPCODE_FRC;
+	inst->DstReg.File = PROGRAM_TEMPORARY;
+	inst->DstReg.Index = tmp_idx;
+	++inst;
+
+	inst->Opcode = OPCODE_ADD;
+	inst->DstReg = dst;
+	inst->SrcReg[0] = (inst-1)->SrcReg[0];
+	inst->SrcReg[1].File = PROGRAM_TEMPORARY;
+	inst->SrcReg[1].Index = tmp_idx;
+	inst->SrcReg[1].Negate = NEGATE_XYZW;
+
+	return 1;
 }
 
-static void pos_as_texcoord(struct r300_vertex_program *vp,
-			    struct gl_program *prog)
+static int translateSUB(struct gl_program *prog, int pos)
 {
-	struct prog_instruction *vpi;
-	GLuint tempregi = prog->NumTemporaries;
-	/* should do something else if no temps left... */
-	prog->NumTemporaries++;
+	struct prog_instruction *inst;
 
-	for (vpi = prog->Instructions; vpi->Opcode != OPCODE_END; vpi++) {
-		if (vpi->DstReg.File == PROGRAM_OUTPUT
-		    && vpi->DstReg.Index == VERT_RESULT_HPOS) {
-			vpi->DstReg.File = PROGRAM_TEMPORARY;
-			vpi->DstReg.Index = tempregi;
+	inst = &prog->Instructions[pos];
+
+	inst->Opcode = OPCODE_ADD;
+	inst->SrcReg[1].Negate ^= NEGATE_XYZW;
+
+	return 0;
+}
+
+static int translateSWZ(struct gl_program *prog, int pos)
+{
+	prog->Instructions[pos].Opcode = OPCODE_MOV;
+
+	return 0;
+}
+
+static int translateXPD(struct gl_program *prog, int pos)
+{
+	struct prog_instruction *inst;
+	int tmp_idx;
+
+	tmp_idx = prog->NumTemporaries++;
+
+	_mesa_insert_instructions(prog, pos + 1, 1);
+
+	inst = &prog->Instructions[pos];
+
+	*(inst+1) = *inst;
+
+	inst->Opcode = OPCODE_MUL;
+	inst->DstReg.File = PROGRAM_TEMPORARY;
+	inst->DstReg.Index = tmp_idx;
+	inst->SrcReg[0].Swizzle = combine_swizzles4(inst->SrcReg[0].Swizzle, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W);
+	inst->SrcReg[1].Swizzle = combine_swizzles4(inst->SrcReg[1].Swizzle, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W);
+	++inst;
+
+	inst->Opcode = OPCODE_MAD;
+	inst->SrcReg[0].Swizzle = combine_swizzles4(inst->SrcReg[0].Swizzle, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_Y, SWIZZLE_W);
+	inst->SrcReg[1].Swizzle = combine_swizzles4(inst->SrcReg[1].Swizzle, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_X, SWIZZLE_W);
+	inst->SrcReg[1].Negate ^= NEGATE_XYZW;
+	inst->SrcReg[2].File = PROGRAM_TEMPORARY;
+	inst->SrcReg[2].Index = tmp_idx;
+
+	return 1;
+}
+
+static void translateInsts(struct gl_program *prog)
+{
+	struct prog_instruction *inst;
+	int i;
+
+	for (i = 0; i < prog->NumInstructions; ++i) {
+		inst = &prog->Instructions[i];
+
+		switch (inst->Opcode) {
+			case OPCODE_ABS:
+				i += translateABS(prog, i);
+				break;
+			case OPCODE_DP3:
+				i += translateDP3(prog, i);
+				break;
+			case OPCODE_DPH:
+				i += translateDPH(prog, i);
+				break;
+			case OPCODE_FLR:
+				i += translateFLR(prog, i);
+				break;
+			case OPCODE_SUB:
+				i += translateSUB(prog, i);
+				break;
+			case OPCODE_SWZ:
+				i += translateSWZ(prog, i);
+				break;
+			case OPCODE_XPD:
+				i += translateXPD(prog, i);
+				break;
+			default:
+				break;
 		}
 	}
-	insert_wpos(vp, prog, tempregi);
 }
 
-static struct r300_vertex_program *build_program(struct r300_vertex_program_key
-						 *wanted_key, struct gl_vertex_program
-						 *mesa_vp, GLint wpos_idx)
+#define ADD_OUTPUT(fp_attr, vp_result) \
+	do { \
+		if ((FpReads & (1 << (fp_attr))) && !(prog->OutputsWritten & (1 << (vp_result)))) { \
+			OutputsAdded |= 1 << (vp_result); \
+			count++; \
+		} \
+	} while (0)
+
+static void addArtificialOutputs(GLcontext *ctx, struct gl_program *prog)
 {
-	struct r300_vertex_program *vp;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	GLuint OutputsAdded, FpReads;
+	int i, count;
 
-	vp = _mesa_calloc(sizeof(*vp));
-	_mesa_memcpy(&vp->key, wanted_key, sizeof(vp->key));
-	vp->wpos_idx = wpos_idx;
+	OutputsAdded = 0;
+	count = 0;
+	FpReads = r300->selected_fp->Base->InputsRead;
+
+	ADD_OUTPUT(FRAG_ATTRIB_COL0, VERT_RESULT_COL0);
+	ADD_OUTPUT(FRAG_ATTRIB_COL1, VERT_RESULT_COL1);
+
+	for (i = 0; i < 7; ++i) {
+		ADD_OUTPUT(FRAG_ATTRIB_TEX0 + i, VERT_RESULT_TEX0 + i);
+	}
+
+	/* Some outputs may be artificially added, to match the inputs of the fragment program.
+	 * Issue 16 of vertex program spec says that all vertex attributes that are unwritten by
+	 * vertex program are undefined, so just use MOV [vertex_result], CONST[0]
+	 */
+	if (count > 0) {
+		struct prog_instruction *inst;
+
+		_mesa_insert_instructions(prog, prog->NumInstructions - 1, count);
+		inst = &prog->Instructions[prog->NumInstructions - 1 - count];
+
+		for (i = 0; i < VERT_RESULT_MAX; ++i) {
+			if (OutputsAdded & (1 << i)) {
+				inst->Opcode = OPCODE_MOV;
+
+				inst->DstReg.File = PROGRAM_OUTPUT;
+				inst->DstReg.Index = i;
+				inst->DstReg.WriteMask = WRITEMASK_XYZW;
+				inst->DstReg.CondMask = COND_TR;
+
+				inst->SrcReg[0].File = PROGRAM_CONSTANT;
+				inst->SrcReg[0].Index = 0;
+				inst->SrcReg[0].Swizzle = SWIZZLE_XYZW;
 
-	if (mesa_vp->IsPositionInvariant) {
-		position_invariant(&mesa_vp->Base);
+				++inst;
+			}
+		}
+
+		prog->OutputsWritten |= OutputsAdded;
 	}
+}
+
+#undef ADD_OUTPUT
+
+static void nqssadceInit(struct nqssadce_state* s)
+{
+	r300ContextPtr r300 = R300_CONTEXT(s->Ctx);
+	GLuint fp_reads;
+
+	fp_reads = r300->selected_fp->Base->InputsRead;
+	{
+		if (fp_reads & FRAG_BIT_COL0) {
+				s->Outputs[VERT_RESULT_COL0].Sourced = WRITEMASK_XYZW;
+				s->Outputs[VERT_RESULT_BFC0].Sourced = WRITEMASK_XYZW;
+		}
 
-	if (wpos_idx > -1) {
-		pos_as_texcoord(vp, &mesa_vp->Base);
+		if (fp_reads & FRAG_BIT_COL1) {
+				s->Outputs[VERT_RESULT_COL1].Sourced = WRITEMASK_XYZW;
+				s->Outputs[VERT_RESULT_BFC1].Sourced = WRITEMASK_XYZW;
+		}
 	}
 
-	assert(mesa_vp->Base.NumInstructions);
-	vp->num_temporaries = mesa_vp->Base.NumTemporaries;
-	r300TranslateVertexShader(vp, mesa_vp->Base.Instructions);
+	{
+		int i;
+		for (i = 0; i < 8; ++i) {
+			if (fp_reads & FRAG_BIT_TEX(i)) {
+				s->Outputs[VERT_RESULT_TEX0 + i].Sourced = WRITEMASK_XYZW;
+			}
+		}
+	}
 
-	return vp;
+	s->Outputs[VERT_RESULT_HPOS].Sourced = WRITEMASK_XYZW;
+	if (s->Program->OutputsWritten & (1 << VERT_RESULT_PSIZ))
+		s->Outputs[VERT_RESULT_PSIZ].Sourced = WRITEMASK_X;
 }
 
-static void add_outputs(struct r300_vertex_program_key *key, GLint vert)
+static GLboolean swizzleIsNative(GLuint opcode, struct prog_src_register reg)
 {
-	if (key->OutputsWritten & (1 << vert))
-		return;
+	(void) opcode;
+	(void) reg;
 
-	key->OutputsWritten |= 1 << vert;
-	key->OutputsAdded |= 1 << vert;
+	return GL_TRUE;
 }
 
-void r300SelectVertexShader(r300ContextPtr r300)
+static struct r300_vertex_program *build_program(GLcontext *ctx,
+						 struct r300_vertex_program_key *wanted_key,
+						 const struct gl_vertex_program *mesa_vp)
 {
-	GLcontext *ctx = ctx = r300->radeon.glCtx;
-	GLuint InputsRead;
-	struct r300_vertex_program_key wanted_key = { 0 };
-	GLint i;
-	struct r300_vertex_program_cont *vpc;
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
 	struct r300_vertex_program *vp;
-	GLint wpos_idx;
+	struct gl_program *prog;
 
-	vpc = (struct r300_vertex_program_cont *)ctx->VertexProgram._Current;
-	wanted_key.InputsRead = vpc->mesa_program.Base.InputsRead;
-	wanted_key.OutputsWritten = vpc->mesa_program.Base.OutputsWritten;
-	InputsRead = ctx->FragmentProgram._Current->Base.InputsRead;
-
-	wpos_idx = -1;
-	if (InputsRead & FRAG_BIT_WPOS) {
-		for (i = 0; i < ctx->Const.MaxTextureUnits; i++)
-			if (!(InputsRead & (FRAG_BIT_TEX0 << i)))
-				break;
+	vp = _mesa_calloc(sizeof(*vp));
+	vp->Base = (struct gl_vertex_program *) _mesa_clone_program(ctx, &mesa_vp->Base);
+	_mesa_memcpy(&vp->key, wanted_key, sizeof(vp->key));
 
-		if (i == ctx->Const.MaxTextureUnits) {
-			fprintf(stderr, "\tno free texcoord found\n");
-			_mesa_exit(-1);
-		}
+	prog = &vp->Base->Base;
 
-		wanted_key.OutputsWritten |= 1 << (VERT_RESULT_TEX0 + i);
-		wpos_idx = i;
+	if (RADEON_DEBUG & DEBUG_VERTS) {
+		fprintf(stderr, "Initial vertex program:\n");
+		_mesa_print_program(prog);
+		fflush(stdout);
 	}
 
-	add_outputs(&wanted_key, VERT_RESULT_HPOS);
+	if (vp->Base->IsPositionInvariant) {
+		_mesa_insert_mvp_code(ctx, vp->Base);
+	}
+
+	if (r300->selected_fp->wpos_attr != FRAG_ATTRIB_MAX) {
+		pos_as_texcoord(&vp->Base->Base, r300->selected_fp->wpos_attr - FRAG_ATTRIB_TEX0);
+	}
 
-	if (InputsRead & FRAG_BIT_COL0) {
-		add_outputs(&wanted_key, VERT_RESULT_COL0);
+	if (r300->selected_fp->fog_attr != FRAG_ATTRIB_MAX) {
+		fog_as_texcoord(&vp->Base->Base, r300->selected_fp->fog_attr - FRAG_ATTRIB_TEX0);
 	}
 
-	if (InputsRead & FRAG_BIT_COL1) {
-		add_outputs(&wanted_key, VERT_RESULT_COL1);
+	addArtificialOutputs(ctx, prog);
+
+	translateInsts(prog);
+
+	if (RADEON_DEBUG & DEBUG_VERTS) {
+		fprintf(stderr, "Vertex program after native rewrite:\n");
+		_mesa_print_program(prog);
+		fflush(stdout);
 	}
 
-	for (i = 0; i < ctx->Const.MaxTextureUnits; i++) {
-		if (InputsRead & (FRAG_BIT_TEX0 << i)) {
-			add_outputs(&wanted_key, VERT_RESULT_TEX0 + i);
+	{
+		struct radeon_nqssadce_descr nqssadce = {
+			.Init = &nqssadceInit,
+			.IsNativeSwizzle = &swizzleIsNative,
+			.BuildSwizzle = NULL
+		};
+		radeonNqssaDce(ctx, prog, &nqssadce);
+
+		/* We need this step for reusing temporary registers */
+		_mesa_optimize_program(ctx, prog);
+
+		if (RADEON_DEBUG & DEBUG_VERTS) {
+			fprintf(stderr, "Vertex program after NQSSADCE:\n");
+			_mesa_print_program(prog);
+			fflush(stdout);
 		}
 	}
 
-	if (vpc->mesa_program.IsPositionInvariant) {
-		/* we wan't position don't we ? */
-		wanted_key.InputsRead |= (1 << VERT_ATTRIB_POS);
+	assert(prog->NumInstructions);
+	{
+		struct prog_instruction *inst;
+		int max, i, tmp;
+
+		inst = prog->Instructions;
+		max = -1;
+		while (inst->Opcode != OPCODE_END) {
+			tmp = _mesa_num_inst_src_regs(inst->Opcode);
+			for (i = 0; i < tmp; ++i) {
+				if (inst->SrcReg[i].File == PROGRAM_TEMPORARY) {
+					if ((int) inst->SrcReg[i].Index > max) {
+						max = inst->SrcReg[i].Index;
+					}
+				}
+			}
+
+			if (_mesa_num_inst_dst_regs(inst->Opcode)) {
+				if (inst->DstReg.File == PROGRAM_TEMPORARY) {
+					if ((int) inst->DstReg.Index > max) {
+						max = inst->DstReg.Index;
+					}
+				}
+			}
+			++inst;
+		}
+
+		/* We actually want highest index of used temporary register,
+		 * not the number of temporaries used.
+		 * These values aren't always the same.
+		 */
+		vp->num_temporaries = max + 1;
 	}
 
-	for (vp = vpc->progs; vp; vp = vp->next)
+	return vp;
+}
+
+struct r300_vertex_program * r300SelectVertexShader(GLcontext *ctx)
+{
+	r300ContextPtr r300 = R300_CONTEXT(ctx);
+	struct r300_vertex_program_key wanted_key = { 0 };
+	struct r300_vertex_program_cont *vpc;
+	struct r300_vertex_program *vp;
+
+	vpc = (struct r300_vertex_program_cont *)ctx->VertexProgram._Current;
+	wanted_key.FpReads = r300->selected_fp->Base->InputsRead;
+	wanted_key.FogAttr = r300->selected_fp->fog_attr;
+	wanted_key.WPosAttr = r300->selected_fp->wpos_attr;
+
+	for (vp = vpc->progs; vp; vp = vp->next) {
 		if (_mesa_memcmp(&vp->key, &wanted_key, sizeof(wanted_key))
 		    == 0) {
-			r300->selected_vp = vp;
-			return;
+			return r300->selected_vp = vp;
 		}
-	//_mesa_print_program(&vpc->mesa_program.Base);
+	}
 
-	vp = build_program(&wanted_key, &vpc->mesa_program, wpos_idx);
+	vp = build_program(ctx, &wanted_key, &vpc->mesa_program);
 	vp->next = vpc->progs;
 	vpc->progs = vp;
-	r300->selected_vp = vp;
+
+	return r300->selected_vp = vp;
+}
+
+#define bump_vpu_count(ptr, new_count)   do { \
+		drm_r300_cmd_header_t* _p=((drm_r300_cmd_header_t*)(ptr)); \
+		int _nc=(new_count)/4; \
+		assert(_nc < 256); \
+		if(_nc>_p->vpu.count)_p->vpu.count=_nc; \
+	} while(0)
+
+static void r300EmitVertexProgram(r300ContextPtr r300, int dest, struct r300_vertex_shader_hw_code *code)
+{
+	int i;
+
+	assert((code->length > 0) && (code->length % 4 == 0));
+
+	switch ((dest >> 8) & 0xf) {
+		case 0:
+			R300_STATECHANGE(r300, vpi);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpi.cmd[R300_VPI_INSTR_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpi.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 2:
+			R300_STATECHANGE(r300, vpp);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vpp.cmd[R300_VPP_PARAM_0 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vpp.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		case 4:
+			R300_STATECHANGE(r300, vps);
+			for (i = 0; i < code->length; i++)
+				r300->hw.vps.cmd[1 + i + 4 * (dest & 0xff)] = (code->body.d[i]);
+			bump_vpu_count(r300->hw.vps.cmd, code->length + 4 * (dest & 0xff));
+			break;
+		default:
+			fprintf(stderr, "%s:%s don't know how to handle dest %04x\n", __FILE__, __FUNCTION__, dest);
+			_mesa_exit(-1);
+	}
+}
+
+void r300SetupVertexProgram(r300ContextPtr rmesa)
+{
+	GLcontext *ctx = rmesa->radeon.glCtx;
+	struct r300_vertex_program *prog = rmesa->selected_vp;
+	int inst_count = 0;
+	int param_count = 0;
+
+	/* Reset state, in case we don't use something */
+	((drm_r300_cmd_header_t *) rmesa->hw.vpp.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vpi.cmd)->vpu.count = 0;
+	((drm_r300_cmd_header_t *) rmesa->hw.vps.cmd)->vpu.count = 0;
+
+	R300_STATECHANGE(rmesa, vpp);
+	param_count = r300VertexProgUpdateParams(ctx, prog->Base, (float *)&rmesa->hw.vpp.cmd[R300_VPP_PARAM_0]);
+	bump_vpu_count(rmesa->hw.vpp.cmd, param_count);
+	param_count /= 4;
+
+	r300EmitVertexProgram(rmesa, R300_PVS_CODE_START, &(prog->hw_code));
+	inst_count = (prog->hw_code.length / 4) - 1;
+
+	r300VapCntl(rmesa, _mesa_bitcount(prog->Base->Base.InputsRead),
+				 _mesa_bitcount(prog->Base->Base.OutputsWritten), prog->num_temporaries);
+
+	R300_STATECHANGE(rmesa, pvs);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_1] = (0 << R300_PVS_FIRST_INST_SHIFT) | (inst_count << R300_PVS_XYZW_VALID_INST_SHIFT) |
+				(inst_count << R300_PVS_LAST_INST_SHIFT);
+
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_2] = (0 << R300_PVS_CONST_BASE_OFFSET_SHIFT) | (param_count << R300_PVS_MAX_CONST_ADDR_SHIFT);
+	rmesa->hw.pvs.cmd[R300_PVS_CNTL_3] = (inst_count << R300_PVS_LAST_VTX_SRC_INST_SHIFT);
 }
diff --git a/src/mesa/drivers/dri/r300/r300_vertprog.h b/src/mesa/drivers/dri/r300/r300_vertprog.h
index 2f35f02bc84..2dab11c3378 100644
--- a/src/mesa/drivers/dri/r300/r300_vertprog.h
+++ b/src/mesa/drivers/dri/r300/r300_vertprog.h
@@ -32,4 +32,10 @@
 
 #endif
 
+void r300SetupVertexProgram(r300ContextPtr rmesa);
+
+struct r300_vertex_program * r300SelectVertexShader(GLcontext *ctx);
+
+void r300TranslateVertexShader(struct r300_vertex_program *vp);
+
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.c b/src/mesa/drivers/dri/r300/r500_fragprog.c
index 292573de893..4d58cf21622 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.c
@@ -27,10 +27,6 @@
 
 #include "r500_fragprog.h"
 
-#include "radeon_nqssadce.h"
-#include "radeon_program_alu.h"
-
-
 static void reset_srcreg(struct prog_src_register* reg)
 {
 	_mesa_bzero(reg, sizeof(*reg));
@@ -58,12 +54,12 @@ static struct prog_src_register shadow_ambient(struct gl_program *program, int t
  *  - introduce a temporary register when write masks are needed
  *
  */
-static GLboolean transform_TEX(
+GLboolean r500_transform_TEX(
 	struct radeon_transform_context *t,
 	struct prog_instruction* orig_inst, void* data)
 {
-	struct r500_fragment_program_compiler *compiler =
-		(struct r500_fragment_program_compiler*)data;
+	struct r300_fragment_program_compiler *compiler =
+		(struct r300_fragment_program_compiler*)data;
 	struct prog_instruction inst = *orig_inst;
 	struct prog_instruction* tgt;
 	GLboolean destredirect = GL_FALSE;
@@ -188,121 +184,7 @@ static GLboolean transform_TEX(
 	return GL_TRUE;
 }
 
-
-static void update_params(r300ContextPtr r300, struct r500_fragment_program *fp)
-{
-	struct gl_fragment_program *mp = &fp->mesa_program;
-
-	/* Ask Mesa nicely to fill in ParameterValues for us */
-	if (mp->Base.Parameters)
-		_mesa_load_state_parameters(r300->radeon.glCtx, mp->Base.Parameters);
-}
-
-
-/**
- * Transform the program to support fragment.position.
- *
- * Introduce a small fragment at the start of the program that will be
- * the only code that directly reads the FRAG_ATTRIB_WPOS input.
- * All other code pieces that reference that input will be rewritten
- * to read from a newly allocated temporary.
- *
- * \todo if/when r5xx supports the radeon_program architecture, this is a
- * likely candidate for code sharing.
- */
-static void insert_WPOS_trailer(struct r500_fragment_program_compiler *compiler)
-{
-	GLuint InputsRead = compiler->fp->mesa_program.Base.InputsRead;
-
-	if (!(InputsRead & FRAG_BIT_WPOS))
-		return;
-
-	static gl_state_index tokens[STATE_LENGTH] = {
-		STATE_INTERNAL, STATE_R300_WINDOW_DIMENSION, 0, 0, 0
-	};
-	struct prog_instruction *fpi;
-	GLuint window_index;
-	int i = 0;
-	GLuint tempregi = _mesa_find_free_register(compiler->program, PROGRAM_TEMPORARY);
-
-	_mesa_insert_instructions(compiler->program, 0, 3);
-	fpi = compiler->program->Instructions;
-
-	/* perspective divide */
-	fpi[i].Opcode = OPCODE_RCP;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_W;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	fpi[i].Opcode = OPCODE_MUL;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_INPUT;
-	fpi[i].SrcReg[0].Index = FRAG_ATTRIB_WPOS;
-	fpi[i].SrcReg[0].Swizzle = SWIZZLE_XYZW;
-
-	fpi[i].SrcReg[1].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[1].Index = tempregi;
-	fpi[i].SrcReg[1].Swizzle = SWIZZLE_WWWW;
-	i++;
-
-	/* viewport transformation */
-	window_index = _mesa_add_state_reference(compiler->program->Parameters, tokens);
-
-	fpi[i].Opcode = OPCODE_MAD;
-
-	fpi[i].DstReg.File = PROGRAM_TEMPORARY;
-	fpi[i].DstReg.Index = tempregi;
-	fpi[i].DstReg.WriteMask = WRITEMASK_XYZ;
-	fpi[i].DstReg.CondMask = COND_TR;
-
-	fpi[i].SrcReg[0].File = PROGRAM_TEMPORARY;
-	fpi[i].SrcReg[0].Index = tempregi;
-	fpi[i].SrcReg[0].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[1].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[1].Index = window_index;
-	fpi[i].SrcReg[1].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-
-	fpi[i].SrcReg[2].File = PROGRAM_STATE_VAR;
-	fpi[i].SrcReg[2].Index = window_index;
-	fpi[i].SrcReg[2].Swizzle =
-	    MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_ZERO);
-	i++;
-
-	for (; i < compiler->program->NumInstructions; ++i) {
-		int reg;
-		for (reg = 0; reg < 3; reg++) {
-			if (fpi[i].SrcReg[reg].File == PROGRAM_INPUT &&
-			    fpi[i].SrcReg[reg].Index == FRAG_ATTRIB_WPOS) {
-				fpi[i].SrcReg[reg].File = PROGRAM_TEMPORARY;
-				fpi[i].SrcReg[reg].Index = tempregi;
-			}
-		}
-	}
-}
-
-
-static void nqssadce_init(struct nqssadce_state* s)
-{
-	s->Outputs[FRAG_RESULT_COLOR].Sourced = WRITEMASK_XYZW;
-	s->Outputs[FRAG_RESULT_DEPTH].Sourced = WRITEMASK_W;
-}
-
-static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
+GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg)
 {
 	GLuint relevant;
 	int i;
@@ -314,22 +196,20 @@ static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
 		if (reg.Abs)
 			return GL_FALSE;
 
+		if (opcode == OPCODE_KIL && (reg.Swizzle != SWIZZLE_NOOP || reg.Negate != NEGATE_NONE))
+			return GL_FALSE;
+
 		if (reg.Negate)
 			reg.Negate ^= NEGATE_XYZW;
 
-		if (opcode == OPCODE_KIL) {
-			if (reg.Swizzle != SWIZZLE_NOOP)
-				return GL_FALSE;
-		} else {
-			for(i = 0; i < 4; ++i) {
-				GLuint swz = GET_SWZ(reg.Swizzle, i);
-				if (swz == SWIZZLE_NIL) {
-					reg.Negate &= ~(1 << i);
-					continue;
-				}
-				if (swz >= 4)
-					return GL_FALSE;
+		for(i = 0; i < 4; ++i) {
+			GLuint swz = GET_SWZ(reg.Swizzle, i);
+			if (swz == SWIZZLE_NIL) {
+				reg.Negate &= ~(1 << i);
+				continue;
 			}
+			if (swz >= 4)
+				return GL_FALSE;
 		}
 
 		if (reg.Negate)
@@ -367,8 +247,7 @@ static GLboolean is_native_swizzle(GLuint opcode, struct prog_src_register reg)
  * The only thing we *cannot* do in an ALU instruction is per-component
  * negation. Therefore, we split the MOV into two instructions when necessary.
  */
-static void nqssadce_build_swizzle(struct nqssadce_state *s,
-	struct prog_dst_register dst, struct prog_src_register src)
+void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src)
 {
 	struct prog_instruction *inst;
 	GLuint negatebase[2] = { 0, 0 };
@@ -392,129 +271,12 @@ static void nqssadce_build_swizzle(struct nqssadce_state *s,
 		inst->DstReg = dst;
 		inst->DstReg.WriteMask = negatebase[i];
 		inst->SrcReg[0] = src;
+		inst->SrcReg[0].Negate = (i == 0) ? NEGATE_NONE : NEGATE_XYZW;
 		inst++;
 		s->IP++;
 	}
 }
 
-static GLuint build_dtm(GLuint depthmode)
-{
-	switch(depthmode) {
-	default:
-	case GL_LUMINANCE: return 0;
-	case GL_INTENSITY: return 1;
-	case GL_ALPHA: return 2;
-	}
-}
-
-static GLuint build_func(GLuint comparefunc)
-{
-	return comparefunc - GL_NEVER;
-}
-
-
-/**
- * Collect all external state that is relevant for compiling the given
- * fragment program.
- */
-static void build_state(
-	r300ContextPtr r300,
-	struct r500_fragment_program *fp,
-	struct r500_fragment_program_external_state *state)
-{
-	int unit;
-
-	_mesa_bzero(state, sizeof(*state));
-
-	for(unit = 0; unit < 16; ++unit) {
-		if (fp->mesa_program.Base.ShadowSamplers & (1 << unit)) {
-			struct gl_texture_object* tex = r300->radeon.glCtx->Texture.Unit[unit]._Current;
-
-			state->unit[unit].depth_texture_mode = build_dtm(tex->DepthMode);
-			state->unit[unit].texture_compare_func = build_func(tex->CompareFunc);
-		}
-	}
-}
-
-static void dump_program(struct r500_fragment_program_code *code);
-
-void r500TranslateFragmentShader(r300ContextPtr r300,
-				 struct r500_fragment_program *fp)
-{
-	struct r500_fragment_program_external_state state;
-
-	build_state(r300, fp, &state);
-	if (_mesa_memcmp(&fp->state, &state, sizeof(state))) {
-		/* TODO: cache compiled programs */
-		fp->translated = GL_FALSE;
-		_mesa_memcpy(&fp->state, &state, sizeof(state));
-	}
-
-	if (!fp->translated) {
-		struct r500_fragment_program_compiler compiler;
-
-		compiler.r300 = r300;
-		compiler.fp = fp;
-		compiler.code = &fp->code;
-		compiler.program = _mesa_clone_program(r300->radeon.glCtx, &fp->mesa_program.Base);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: Initial program:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		insert_WPOS_trailer(&compiler);
-
-		struct radeon_program_transformation transformations[] = {
-			{ &transform_TEX, &compiler },
-			{ &radeonTransformALU, 0 },
-			{ &radeonTransformDeriv, 0 },
-			{ &radeonTransformTrigScale, 0 }
-		};
-		radeonLocalTransform(r300->radeon.glCtx, compiler.program,
-			4, transformations);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after native rewrite:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		struct radeon_nqssadce_descr nqssadce = {
-			.Init = &nqssadce_init,
-			.IsNativeSwizzle = &is_native_swizzle,
-			.BuildSwizzle = &nqssadce_build_swizzle,
-			.RewriteDepthOut = GL_TRUE
-		};
-		radeonNqssaDce(r300->radeon.glCtx, compiler.program, &nqssadce);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			_mesa_printf("Compiler: after NqSSA-DCE:\n");
-			_mesa_print_program(compiler.program);
-		}
-
-		fp->translated = r500FragmentProgramEmit(&compiler);
-
-		/* Subtle: Rescue any parameters that have been added during transformations */
-		_mesa_free_parameter_list(fp->mesa_program.Base.Parameters);
-		fp->mesa_program.Base.Parameters = compiler.program->Parameters;
-		compiler.program->Parameters = 0;
-
-		_mesa_reference_program(r300->radeon.glCtx, &compiler.program, 0);
-
-		r300UpdateStateParameters(r300->radeon.glCtx, _NEW_PROGRAM);
-
-		if (RADEON_DEBUG & DEBUG_PIXEL) {
-			if (fp->translated) {
-				_mesa_printf("Machine-readable code:\n");
-				dump_program(&fp->code);
-			}
-		}
-
-	}
-
-	update_params(r300, fp);
-
-}
 
 static char *toswiz(int swiz_val) {
   switch(swiz_val) {
@@ -613,9 +375,9 @@ static char *to_texop(int val)
   return NULL;
 }
 
-static void dump_program(struct r500_fragment_program_code *code)
+void r500FragmentProgramDump(union rX00_fragment_program_code *c)
 {
-
+  struct r500_fragment_program_code *code = &c->r500;
   fprintf(stderr, "R500 Fragment Program:\n--------\n");
 
   int n;
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog.h b/src/mesa/drivers/dri/r300/r500_fragprog.h
index 1e45538f807..1179bf66073 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog.h
+++ b/src/mesa/drivers/dri/r300/r500_fragprog.h
@@ -33,30 +33,20 @@
 #ifndef __R500_FRAGPROG_H_
 #define __R500_FRAGPROG_H_
 
-#include "main/glheader.h"
-#include "main/macros.h"
-#include "main/enums.h"
 #include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/program.h"
 #include "shader/prog_instruction.h"
 
 #include "r300_context.h"
-#include "r300_state.h"
-#include "radeon_program.h"
+#include "radeon_nqssadce.h"
 
-struct r500_fragment_program;
+extern GLboolean r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler);
 
-extern void r500TranslateFragmentShader(r300ContextPtr r300,
-					struct r500_fragment_program *fp);
+extern void r500FragmentProgramDump(union rX00_fragment_program_code *c);
 
-struct r500_fragment_program_compiler {
-	r300ContextPtr r300;
-	struct r500_fragment_program *fp;
-	struct r500_fragment_program_code *code;
-	struct gl_program *program;
-};
+extern GLboolean r500FPIsNativeSwizzle(GLuint opcode, struct prog_src_register reg);
 
-extern GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler);
+extern void r500FPBuildSwizzle(struct nqssadce_state *s, struct prog_dst_register dst, struct prog_src_register src);
+
+extern GLboolean r500_transform_TEX(struct radeon_transform_context *t, struct prog_instruction* orig_inst, void* data);
 
 #endif
diff --git a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
index 4631235f0d3..30f4514897e 100644
--- a/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
+++ b/src/mesa/drivers/dri/r300/r500_fragprog_emit.c
@@ -49,8 +49,8 @@
 
 
 #define PROG_CODE \
-	struct r500_fragment_program_compiler *c = (struct r500_fragment_program_compiler*)data; \
-	struct r500_fragment_program_code *code = c->code
+	struct r300_fragment_program_compiler *c = (struct r300_fragment_program_compiler*)data; \
+	struct r500_fragment_program_code *code = &c->code->r500
 
 #define error(fmt, args...) do {			\
 		fprintf(stderr, "%s::%s(): " fmt "\n",	\
@@ -72,7 +72,7 @@ static GLboolean emit_const(void *data, GLuint file, GLuint idx, GLuint *hwindex
 	}
 
 	if (*hwindex >= code->const_nr) {
-		if (*hwindex >= PFS_NUM_CONST_REGS) {
+		if (*hwindex >= R500_PFS_NUM_CONST_REGS) {
 			error("Out of hw constants!\n");
 			return GL_FALSE;
 		}
@@ -299,9 +299,9 @@ static const struct radeon_pair_handler pair_handler = {
 	.MaxHwTemps = 128
 };
 
-GLboolean r500FragmentProgramEmit(struct r500_fragment_program_compiler *compiler)
+GLboolean r500BuildFragmentProgramHwCode(struct r300_fragment_program_compiler *compiler)
 {
-	struct r500_fragment_program_code *code = compiler->code;
+	struct r500_fragment_program_code *code = &compiler->code->r500;
 
 	_mesa_bzero(code, sizeof(*code));
 	code->max_temp_idx = 1;
diff --git a/src/mesa/drivers/dri/r300/radeon_context.c b/src/mesa/drivers/dri/r300/radeon_context.c
deleted file mode 100644
index 5267fe9a774..00000000000
--- a/src/mesa/drivers/dri/r300/radeon_context.c
+++ /dev/null
@@ -1,330 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/**
- * \file radeon_context.c
- * Common context initialization.
- *
- * \author Keith Whitwell <[email protected]>
- */
-
-#include <dlfcn.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/state.h"
-#include "main/matrix.h"
-#include "main/framebuffer.h"
-
-#include "drivers/common/driverfuncs.h"
-#include "swrast/swrast.h"
-
-#include "radeon_screen.h"
-#include "radeon_ioctl.h"
-#include "radeon_macros.h"
-#include "radeon_reg.h"
-
-#include "radeon_state.h"
-#include "r300_state.h"
-
-#include "utils.h"
-#include "vblank.h"
-#include "xmlpool.h"		/* for symbolic values of enum-type options */
-
-#define DRIVER_DATE "20060815"
-
-
-/* Return various strings for glGetString().
- */
-static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-	static char buffer[128];
-
-	switch (name) {
-	case GL_VENDOR:
-		if (IS_R300_CLASS(radeon->radeonScreen))
-			return (GLubyte *) "DRI R300 Project";
-		else
-			return (GLubyte *) "Tungsten Graphics, Inc.";
-
-	case GL_RENDERER:
-	{
-		unsigned offset;
-		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
-			radeon->radeonScreen->AGPMode;
-		const char* chipname;
-
-		if (IS_R300_CLASS(radeon->radeonScreen))
-			chipname = "R300";
-		else
-			chipname = "R200";
-
-		offset = driGetRendererString(buffer, chipname, DRIVER_DATE,
-					      agp_mode);
-
-		if (IS_R300_CLASS(radeon->radeonScreen)) {
-		sprintf(&buffer[offset], " %sTCL",
-			(radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
-			? "" : "NO-");
-		} else {
-			sprintf(&buffer[offset], " %sTCL",
-			!(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
-			? "" : "NO-");
-		}
-
-		return (GLubyte *) buffer;
-	}
-
-	default:
-		return NULL;
-	}
-}
-
-/* Initialize the driver's misc functions.
- */
-static void radeonInitDriverFuncs(struct dd_function_table *functions)
-{
-	functions->GetString = radeonGetString;
-}
-
-
-/**
- * Create and initialize all common fields of the context,
- * including the Mesa context itself.
- */
-GLboolean radeonInitContext(radeonContextPtr radeon,
-			    struct dd_function_table* functions,
-			    const __GLcontextModes * glVisual,
-			    __DRIcontextPrivate * driContextPriv,
-			    void *sharedContextPrivate)
-{
-	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
-	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
-	GLcontext* ctx;
-	GLcontext* shareCtx;
-	int fthrottle_mode;
-
-	/* Fill in additional standard functions. */
-	radeonInitDriverFuncs(functions);
-
-	radeon->radeonScreen = screen;
-	/* Allocate and initialize the Mesa context */
-	if (sharedContextPrivate)
-		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
-	else
-		shareCtx = NULL;
-	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
-					    functions, (void *)radeon);
-	if (!radeon->glCtx)
-		return GL_FALSE;
-
-	ctx = radeon->glCtx;
-	driContextPriv->driverPrivate = radeon;
-
-	/* DRI fields */
-	radeon->dri.context = driContextPriv;
-	radeon->dri.screen = sPriv;
-	radeon->dri.drawable = NULL;
-	radeon->dri.readable = NULL;
-	radeon->dri.hwContext = driContextPriv->hHWContext;
-	radeon->dri.hwLock = &sPriv->pSAREA->lock;
-	radeon->dri.fd = sPriv->fd;
-	radeon->dri.drmMinor = sPriv->drm_version.minor;
-
-	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
-					       screen->sarea_priv_offset);
-
-	/* Setup IRQs */
-	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
-	radeon->iw.irq_seq = -1;
-	radeon->irqsEmitted = 0;
-	radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
-			  radeon->radeonScreen->irq);
-
-	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
-
-	if (!radeon->do_irqs)
-		fprintf(stderr,
-			"IRQ's not enabled, falling back to %s: %d %d\n",
-			radeon->do_usleeps ? "usleeps" : "busy waits",
-			fthrottle_mode, radeon->radeonScreen->irq);
-
-	(*sPriv->systemTime->getUST) (&radeon->swap_ust);
-
-	return GL_TRUE;
-}
-
-
-/**
- * Cleanup common context fields.
- * Called by r200DestroyContext/r300DestroyContext
- */
-void radeonCleanupContext(radeonContextPtr radeon)
-{
-	/* _mesa_destroy_context() might result in calls to functions that
-	 * depend on the DriverCtx, so don't set it to NULL before.
-	 *
-	 * radeon->glCtx->DriverCtx = NULL;
-	 */
-
-	/* free the Mesa context */
-	_mesa_destroy_context(radeon->glCtx);
-
-	if (radeon->state.scissor.pClipRects) {
-		FREE(radeon->state.scissor.pClipRects);
-		radeon->state.scissor.pClipRects = 0;
-	}
-}
-
-
-/**
- * Swap front and back buffer.
- */
-void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
-{
-	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-		radeonContextPtr radeon;
-		GLcontext *ctx;
-
-		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-		ctx = radeon->glCtx;
-
-		if (ctx->Visual.doubleBufferMode) {
-			_mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-			if (radeon->doPageFlip) {
-				radeonPageFlip(dPriv);
-			} else {
-			    radeonCopyBuffer(dPriv, NULL);
-			}
-		}
-	} else {
-		/* XXX this shouldn't be an error but we can't handle it for now */
-		_mesa_problem(NULL, "%s: drawable has no context!",
-			      __FUNCTION__);
-	}
-}
-
-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-			 int x, int y, int w, int h )
-{
-    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-	radeonContextPtr radeon;
-	GLcontext *ctx;
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-	ctx = radeon->glCtx;
-
-	if (ctx->Visual.doubleBufferMode) {
-	    drm_clip_rect_t rect;
-	    rect.x1 = x + dPriv->x;
-	    rect.y1 = (dPriv->h - y - h) + dPriv->y;
-	    rect.x2 = rect.x1 + w;
-	    rect.y2 = rect.y1 + h;
-	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-	    radeonCopyBuffer(dPriv, &rect);
-	}
-    } else {
-	/* XXX this shouldn't be an error but we can't handle it for now */
-	_mesa_problem(NULL, "%s: drawable has no context!",
-		      __FUNCTION__);
-    }
-}
-
-/* Force the context `c' to be the current context and associate with it
- * buffer `b'.
- */
-GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-			    __DRIdrawablePrivate * driDrawPriv,
-			    __DRIdrawablePrivate * driReadPriv)
-{
-	if (driContextPriv) {
-		radeonContextPtr radeon =
-			(radeonContextPtr) driContextPriv->driverPrivate;
-
-		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
-				radeon->glCtx);
-
-		if (radeon->dri.drawable != driDrawPriv) {
-			if (driDrawPriv->swap_interval == (unsigned)-1) {
-				driDrawPriv->vblFlags =
-					(radeon->radeonScreen->irq != 0)
-					? driGetDefaultVBlankFlags(&radeon->
-								   optionCache)
-					: VBLANK_FLAG_NO_IRQ;
-
-				driDrawableInitVBlank(driDrawPriv);
-			}
-		}
-
-		radeon->dri.readable = driReadPriv;
-
-		if (radeon->dri.drawable != driDrawPriv ||
-		    radeon->lastStamp != driDrawPriv->lastStamp) {
-			radeon->dri.drawable = driDrawPriv;
-
-			radeonSetCliprects(radeon);
-			r300UpdateViewportOffset(radeon->glCtx);
-		}
-
-		_mesa_make_current(radeon->glCtx,
-				    (GLframebuffer *) driDrawPriv->
-				    driverPrivate,
-				    (GLframebuffer *) driReadPriv->
-				    driverPrivate);
-
-		_mesa_update_state(radeon->glCtx);		
-
-		radeonUpdatePageFlipping(radeon);
-	} else {
-		if (RADEON_DEBUG & DEBUG_DRI)
-			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-		_mesa_make_current(0, 0, 0);
-	}
-
-	if (RADEON_DEBUG & DEBUG_DRI)
-		fprintf(stderr, "End %s\n", __FUNCTION__);
-	return GL_TRUE;
-}
-
-/* Force the context `c' to be unbound from its buffer.
- */
-GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
-{
-	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_DRI)
-		fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
-			radeon->glCtx);
-
-	return GL_TRUE;
-}
-
diff --git a/src/mesa/drivers/dri/r300/radeon_context.h b/src/mesa/drivers/dri/r300/radeon_context.h
index 47cbc22a725..250570f6b89 100644
--- a/src/mesa/drivers/dri/r300/radeon_context.h
+++ b/src/mesa/drivers/dri/r300/radeon_context.h
@@ -49,20 +49,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "dri_util.h"
 
-struct radeon_context;
-typedef struct radeon_context radeonContextRec;
-typedef struct radeon_context *radeonContextPtr;
-
-/* Rasterizing fallbacks */
-/* See correponding strings in r200_swtcl.c */
-#define RADEON_FALLBACK_TEXTURE		0x0001
-#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
-#define RADEON_FALLBACK_STENCIL		0x0004
-#define RADEON_FALLBACK_RENDER_MODE	0x0008
-#define RADEON_FALLBACK_BLEND_EQ	0x0010
-#define RADEON_FALLBACK_BLEND_FUNC	0x0020
-#define RADEON_FALLBACK_DISABLE		0x0040
-#define RADEON_FALLBACK_BORDER_MODE	0x0080
+#include "radeon_screen.h"
 
 #if R200_MERGED
 extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
@@ -79,155 +66,11 @@ extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 /* TCL fallbacks */
 extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 
-#define RADEON_TCL_FALLBACK_RASTER		0x0001	/* rasterization */
-#define RADEON_TCL_FALLBACK_UNFILLED		0x0002	/* unfilled tris */
-#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE	0x0004	/* twoside tris */
-#define RADEON_TCL_FALLBACK_MATERIAL		0x0008	/* material in vb */
-#define RADEON_TCL_FALLBACK_TEXGEN_0		0x0010	/* texgen, unit 0 */
-#define RADEON_TCL_FALLBACK_TEXGEN_1		0x0020	/* texgen, unit 1 */
-#define RADEON_TCL_FALLBACK_TEXGEN_2		0x0040	/* texgen, unit 2 */
-#define RADEON_TCL_FALLBACK_TEXGEN_3		0x0080	/* texgen, unit 3 */
-#define RADEON_TCL_FALLBACK_TEXGEN_4		0x0100	/* texgen, unit 4 */
-#define RADEON_TCL_FALLBACK_TEXGEN_5		0x0200	/* texgen, unit 5 */
-#define RADEON_TCL_FALLBACK_TCL_DISABLE		0x0400	/* user disable */
-#define RADEON_TCL_FALLBACK_BITMAP		0x0800	/* draw bitmap with points */
-#define RADEON_TCL_FALLBACK_VERTEX_PROGRAM	0x1000	/* vertex program active */
-
 #if R200_MERGED
 #define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
 #else
 #define TCL_FALLBACK( ctx, bit, mode )	;
 #endif
 
-struct radeon_dri_mirror {
-	__DRIcontextPrivate *context;	/* DRI context */
-	__DRIscreenPrivate *screen;	/* DRI screen */
-	/**
-	 * DRI drawable bound to this context for drawing.
-	 */
-	__DRIdrawablePrivate *drawable;
-
-	/**
-	 * DRI drawable bound to this context for reading.
-	 */
-	__DRIdrawablePrivate *readable;
-
-	drm_context_t hwContext;
-	drm_hw_lock_t *hwLock;
-	int fd;
-	int drmMinor;
-};
-
-/**
- * Derived state for internal purposes.
- */
-struct radeon_scissor_state {
-	drm_clip_rect_t rect;
-	GLboolean enabled;
-
-	GLuint numClipRects;	/* Cliprects active */
-	GLuint numAllocedClipRects;	/* Cliprects available */
-	drm_clip_rect_t *pClipRects;
-};
-
-struct radeon_colorbuffer_state {
-	GLuint clear;
-	GLint drawOffset, drawPitch;
-};
-
-struct radeon_state {
-	struct radeon_colorbuffer_state color;
-	struct radeon_scissor_state scissor;
-};
-
-/**
- * Common per-context variables shared by R200 and R300.
- * R200- and R300-specific code "derive" their own context from this
- * structure.
- */
-struct radeon_context {
-	GLcontext *glCtx;	/* Mesa context */
-	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
-
-	/* Fallback state */
-	GLuint Fallback;
-	GLuint TclFallback;
-
-	/* Page flipping */
-	GLuint doPageFlip;
-
-	/* Drawable, cliprect and scissor information */
-	GLuint numClipRects;	/* Cliprects for the draw buffer */
-	drm_clip_rect_t *pClipRects;
-	unsigned int lastStamp;
-	GLboolean lost_context;
-	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
-
-	/* Mirrors of some DRI state */
-	struct radeon_dri_mirror dri;
-
-	/* Busy waiting */
-	GLuint do_usleeps;
-	GLuint do_irqs;
-	GLuint irqsEmitted;
-	drm_radeon_irq_wait_t iw;
-
-	/* buffer swap */
-	int64_t swap_ust;
-	int64_t swap_missed_ust;
-
-	GLuint swap_count;
-	GLuint swap_missed_count;
-
-	/* Derived state */
-	struct radeon_state state;
-
-	/* Configuration cache
-	 */
-	driOptionCache optionCache;
-};
-
-#define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
-
-extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
-extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-				int x, int y, int w, int h);
-extern GLboolean radeonInitContext(radeonContextPtr radeon,
-				   struct dd_function_table *functions,
-				   const __GLcontextModes * glVisual,
-				   __DRIcontextPrivate * driContextPriv,
-				   void *sharedContextPrivate);
-extern void radeonCleanupContext(radeonContextPtr radeon);
-extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-				   __DRIdrawablePrivate * driDrawPriv,
-				   __DRIdrawablePrivate * driReadPriv);
-extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
-
-/* ================================================================
- * Debugging:
- */
-#define DO_DEBUG		1
-
-#if DO_DEBUG
-extern int RADEON_DEBUG;
-#else
-#define RADEON_DEBUG		0
-#endif
-
-#define DEBUG_TEXTURE	0x0001
-#define DEBUG_STATE	0x0002
-#define DEBUG_IOCTL	0x0004
-#define DEBUG_PRIMS	0x0008
-#define DEBUG_VERTS	0x0010
-#define DEBUG_FALLBACKS	0x0020
-#define DEBUG_VFMT	0x0040
-#define DEBUG_CODEGEN	0x0080
-#define DEBUG_VERBOSE	0x0100
-#define DEBUG_DRI       0x0200
-#define DEBUG_DMA       0x0400
-#define DEBUG_SANITY    0x0800
-#define DEBUG_SYNC      0x1000
-#define DEBUG_PIXEL     0x2000
-#define DEBUG_MEMORY    0x4000
 
 #endif				/* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.c b/src/mesa/drivers/dri/r300/radeon_ioctl.c
deleted file mode 100644
index f042a7b943a..00000000000
--- a/src/mesa/drivers/dri/r300/radeon_ioctl.c
+++ /dev/null
@@ -1,396 +0,0 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <[email protected]>
- */
-
-#include <sched.h>
-#include <errno.h>
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/macros.h"
-#include "main/context.h"
-#include "swrast/swrast.h"
-#include "r300_context.h"
-#include "radeon_ioctl.h"
-#include "r300_ioctl.h"
-#include "r300_state.h"
-#include "radeon_reg.h"
-
-#include "drirenderbuffer.h"
-#include "vblank.h"
-
-static void radeonWaitForIdle(radeonContextPtr radeon);
-
-/* ================================================================
- * SwapBuffers with client-side throttling
- */
-
-static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
-{
-	drm_radeon_getparam_t gp;
-	int ret;
-	uint32_t frame = 0;
-
-	gp.param = RADEON_PARAM_LAST_FRAME;
-	gp.value = (int *)&frame;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
-				  &gp, sizeof(gp));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-
-	return frame;
-}
-
-uint32_t radeonGetAge(radeonContextPtr radeon)
-{
-	drm_radeon_getparam_t gp;
-	int ret;
-	uint32_t age = 0;
-
-	gp.param = RADEON_PARAM_LAST_CLEAR;
-	gp.value = (int *)&age;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
-				  &gp, sizeof(gp));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-
-	return age;
-}
-
-static void radeonEmitIrqLocked(radeonContextPtr radeon)
-{
-	drm_radeon_irq_emit_t ie;
-	int ret;
-
-	ie.irq_seq = &radeon->iw.irq_seq;
-	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
-				  &ie, sizeof(ie));
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-}
-
-static void radeonWaitIrq(radeonContextPtr radeon)
-{
-	int ret;
-
-	do {
-		ret = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
-				      &radeon->iw, sizeof(radeon->iw));
-	} while (ret && (errno == EINTR || errno == EBUSY));
-
-	if (ret) {
-		fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__,
-			ret);
-		exit(1);
-	}
-}
-
-static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
-{
-	drm_radeon_sarea_t *sarea = radeon->sarea;
-
-	if (radeon->do_irqs) {
-		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
-			if (!radeon->irqsEmitted) {
-				while (radeonGetLastFrame(radeon) <
-				       sarea->last_frame) ;
-			} else {
-				UNLOCK_HARDWARE(radeon);
-				radeonWaitIrq(radeon);
-				LOCK_HARDWARE(radeon);
-			}
-			radeon->irqsEmitted = 10;
-		}
-
-		if (radeon->irqsEmitted) {
-			radeonEmitIrqLocked(radeon);
-			radeon->irqsEmitted--;
-		}
-	} else {
-		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
-			UNLOCK_HARDWARE(radeon);
-			if (radeon->do_usleeps)
-				DO_USLEEP(1);
-			LOCK_HARDWARE(radeon);
-		}
-	}
-}
-
-/* Copy the back color buffer to the front color buffer.
- */
-void radeonCopyBuffer(__DRIdrawablePrivate * dPriv,
-		      const drm_clip_rect_t	 * rect)
-{
-	radeonContextPtr radeon;
-	GLint nbox, i, ret;
-	GLboolean missed_target;
-	int64_t ust;
-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-	assert(dPriv);
-	assert(dPriv->driContextPriv);
-	assert(dPriv->driContextPriv->driverPrivate);
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "\n%s( %p )\n\n", __FUNCTION__,
-			(void *)radeon->glCtx);
-	}
-
-	r300Flush(radeon->glCtx);
-
-	LOCK_HARDWARE(radeon);
-
-	/* Throttle the frame rate -- only allow one pending swap buffers
-	 * request at a time.
-	 */
-	radeonWaitForFrameCompletion(radeon);
-	if (!rect)
-	{
-	    UNLOCK_HARDWARE(radeon);
-	    driWaitForVBlank(dPriv, &missed_target);
-	    LOCK_HARDWARE(radeon);
-	}
-
-	nbox = dPriv->numClipRects;	/* must be in locked region */
-
-	for (i = 0; i < nbox;) {
-		GLint nr = MIN2(i + RADEON_NR_SAREA_CLIPRECTS, nbox);
-		drm_clip_rect_t *box = dPriv->pClipRects;
-		drm_clip_rect_t *b = radeon->sarea->boxes;
-		GLint n = 0;
-
-		for ( ; i < nr ; i++ ) {
-
-		    *b = box[i];
-
-		    if (rect)
-		    {
-			if (rect->x1 > b->x1)
-			    b->x1 = rect->x1;
-			if (rect->y1 > b->y1)
-			    b->y1 = rect->y1;
-			if (rect->x2 < b->x2)
-			    b->x2 = rect->x2;
-			if (rect->y2 < b->y2)
-			    b->y2 = rect->y2;
-
-			if (b->x1 >= b->x2 || b->y1 >= b->y2)
-			    continue;
-		    }
-
-		    b++;
-		    n++;
-		}
-		radeon->sarea->nbox = n;
-
-		if (!n)
-		   continue;
-
-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_SWAP);
-
-		if (ret) {
-			fprintf(stderr, "DRM_RADEON_SWAP: return = %d\n",
-				ret);
-			UNLOCK_HARDWARE(radeon);
-			exit(1);
-		}
-	}
-
-	UNLOCK_HARDWARE(radeon);
-	if (!rect)
-	{
-	    ((r300ContextPtr)radeon)->hw.all_dirty = GL_TRUE;
-
-	    radeon->swap_count++;
-	    (*psp->systemTime->getUST) (&ust);
-	    if (missed_target) {
-		radeon->swap_missed_count++;
-		radeon->swap_missed_ust = ust - radeon->swap_ust;
-	    }
-
-	    radeon->swap_ust = ust;
-
-	    sched_yield();
-	}
-}
-
-void radeonPageFlip(__DRIdrawablePrivate * dPriv)
-{
-	radeonContextPtr radeon;
-	GLint ret;
-	GLboolean missed_target;
-	__DRIscreenPrivate *psp = dPriv->driScreenPriv;
-
-	assert(dPriv);
-	assert(dPriv->driContextPriv);
-	assert(dPriv->driContextPriv->driverPrivate);
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-	if (RADEON_DEBUG & DEBUG_IOCTL) {
-		fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
-			radeon->sarea->pfCurrentPage);
-	}
-
-	r300Flush(radeon->glCtx);
-	LOCK_HARDWARE(radeon);
-
-	if (!dPriv->numClipRects) {
-		UNLOCK_HARDWARE(radeon);
-		usleep(10000);	/* throttle invisible client 10ms */
-		return;
-	}
-
-	/* Need to do this for the perf box placement:
-	 */
-	{
-		drm_clip_rect_t *box = dPriv->pClipRects;
-		drm_clip_rect_t *b = radeon->sarea->boxes;
-		b[0] = box[0];
-		radeon->sarea->nbox = 1;
-	}
-
-	/* Throttle the frame rate -- only allow a few pending swap buffers
-	 * request at a time.
-	 */
-	radeonWaitForFrameCompletion(radeon);
-	UNLOCK_HARDWARE(radeon);
-	driWaitForVBlank(dPriv, &missed_target);
-	if (missed_target) {
-		radeon->swap_missed_count++;
-		(void)(*psp->systemTime->getUST) (&radeon->swap_missed_ust);
-	}
-	LOCK_HARDWARE(radeon);
-
-	ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_FLIP);
-
-	UNLOCK_HARDWARE(radeon);
-
-	if (ret) {
-		fprintf(stderr, "DRM_RADEON_FLIP: return = %d\n", ret);
-		exit(1);
-	}
-
-	radeon->swap_count++;
-	(void)(*psp->systemTime->getUST) (&radeon->swap_ust);
-
-        driFlipRenderbuffers(radeon->glCtx->WinSysDrawBuffer, 
-                             radeon->sarea->pfCurrentPage);
-
-	if (radeon->sarea->pfCurrentPage == 1) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	}
-
-	if (IS_R300_CLASS(radeon->radeonScreen)) {
-		r300ContextPtr r300 = (r300ContextPtr)radeon;
-		R300_STATECHANGE(r300, cb);
-		r300->hw.cb.cmd[R300_CB_OFFSET] = r300->radeon.state.color.drawOffset + 
-						r300->radeon.radeonScreen->fbLocation;
-		r300->hw.cb.cmd[R300_CB_PITCH] = r300->radeon.state.color.drawPitch;
-		
-		if (r300->radeon.radeonScreen->cpp == 4)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_ARGB8888;
-		else
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_FORMAT_RGB565;
-	
-		if (r300->radeon.sarea->tiling_enabled)
-			r300->hw.cb.cmd[R300_CB_PITCH] |= R300_COLOR_TILE_ENABLE;
-	}
-}
-
-void radeonWaitForIdleLocked(radeonContextPtr radeon)
-{
-	int ret;
-	int i = 0;
-
-	do {
-		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
-		if (ret)
-			DO_USLEEP(1);
-	} while (ret && ++i < 100);
-
-	if (ret < 0) {
-		UNLOCK_HARDWARE(radeon);
-		fprintf(stderr, "Error: R300 timed out... exiting\n");
-		exit(-1);
-	}
-}
-
-static void radeonWaitForIdle(radeonContextPtr radeon)
-{
-	LOCK_HARDWARE(radeon);
-	radeonWaitForIdleLocked(radeon);
-	UNLOCK_HARDWARE(radeon);
-}
-
-void radeonFlush(GLcontext * ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	if (IS_R300_CLASS(radeon->radeonScreen))
-		r300Flush(ctx);
-}
-
-
-/* Make sure all commands have been sent to the hardware and have
- * completed processing.
- */
-void radeonFinish(GLcontext * ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	radeonFlush(ctx);
-
-	if (radeon->do_irqs) {
-		LOCK_HARDWARE(radeon);
-		radeonEmitIrqLocked(radeon);
-		UNLOCK_HARDWARE(radeon);
-		radeonWaitIrq(radeon);
-	} else
-		radeonWaitForIdle(radeon);
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_lock.c b/src/mesa/drivers/dri/r300/radeon_lock.c
deleted file mode 100644
index 4f47afd5dc6..00000000000
--- a/src/mesa/drivers/dri/r300/radeon_lock.c
+++ /dev/null
@@ -1,137 +0,0 @@
-/**************************************************************************
-
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Gareth Hughes <[email protected]>
- *   Keith Whitwell <[email protected]>
- *   Kevin E. Martin <[email protected]>
- */
-
-#include "radeon_lock.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
-#include "r300_context.h"
-#include "r300_state.h"
-
-#include "main/framebuffer.h"
-
-#include "drirenderbuffer.h"
-
-#if DEBUG_LOCKING
-char *prevLockFile = NULL;
-int prevLockLine = 0;
-#endif
-
-/* Turn on/off page flipping according to the flags in the sarea:
- */
-void radeonUpdatePageFlipping(radeonContextPtr rmesa)
-{
-	int use_back;
-
-	rmesa->doPageFlip = rmesa->sarea->pfState;
-	if (rmesa->glCtx->WinSysDrawBuffer) {
-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-				     rmesa->sarea->pfCurrentPage);
-		r300UpdateDrawBuffer(rmesa->glCtx);
-	}
-
-	use_back = rmesa->glCtx->DrawBuffer ?
-	    (rmesa->glCtx->DrawBuffer->_ColorDrawBufferIndexes[0] ==
-	     BUFFER_BACK_LEFT) : 1;
-	use_back ^= (rmesa->sarea->pfCurrentPage == 1);
-
-	if (use_back) {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->backOffset;
-		rmesa->state.color.drawPitch = rmesa->radeonScreen->backPitch;
-	} else {
-		rmesa->state.color.drawOffset =
-		    rmesa->radeonScreen->frontOffset;
-		rmesa->state.color.drawPitch =
-		    rmesa->radeonScreen->frontPitch;
-	}
-}
-
-/* Update the hardware state.  This is called if another context has
- * grabbed the hardware lock, which includes the X server.  This
- * function also updates the driver's window state after the X server
- * moves, resizes or restacks a window -- the change will be reflected
- * in the drawable position and clip rects.  Since the X server grabs
- * the hardware lock when it changes the window state, this routine will
- * automatically be called after such a change.
- */
-void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
-{
-	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
-	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
-	drm_radeon_sarea_t *sarea = rmesa->sarea;
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-
-	assert(drawable != NULL);
-
-	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
-
-	/* The window might have moved, so we might need to get new clip
-	 * rects.
-	 *
-	 * NOTE: This releases and regrabs the hw lock to allow the X server
-	 * to respond to the DRI protocol request for new drawable info.
-	 * Since the hardware state depends on having the latest drawable
-	 * clip rects, all state checking must be done _after_ this call.
-	 */
-	DRI_VALIDATE_DRAWABLE_INFO(sPriv, drawable);
-	if (drawable != readable) {
-		DRI_VALIDATE_DRAWABLE_INFO(sPriv, readable);
-	}
-
-	if (rmesa->lastStamp != drawable->lastStamp) {
-		radeonUpdatePageFlipping(rmesa);
-		radeonSetCliprects(rmesa);
-		r300UpdateViewportOffset(rmesa->glCtx);
-		driUpdateFramebufferSize(rmesa->glCtx, drawable);
-	}
-
-	if (sarea->ctx_owner != rmesa->dri.hwContext) {
-		int i;
-
-		sarea->ctx_owner = rmesa->dri.hwContext;
-		for (i = 0; i < r300->nr_heaps; i++) {
-			DRI_AGE_TEXTURES(r300->texture_heaps[i]);
-		}
-	}
-
-	rmesa->lost_context = GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_nqssadce.c b/src/mesa/drivers/dri/r300/radeon_nqssadce.c
index 4a2e1cba402..840c9733b16 100644
--- a/src/mesa/drivers/dri/r300/radeon_nqssadce.c
+++ b/src/mesa/drivers/dri/r300/radeon_nqssadce.c
@@ -46,6 +46,7 @@ static struct register_state *get_reg_state(struct nqssadce_state* s, GLuint fil
 	switch(file) {
 	case PROGRAM_TEMPORARY: return &s->Temps[index];
 	case PROGRAM_OUTPUT: return &s->Outputs[index];
+	case PROGRAM_ADDRESS: return &s->Address;
 	default: return 0;
 	}
 }
@@ -56,7 +57,7 @@ static struct register_state *get_reg_state(struct nqssadce_state* s, GLuint fil
  *
  * @note Works correctly only for X, Y, Z, W swizzles, not for constant swizzles.
  */
-static struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg)
+struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg)
 {
 	struct prog_src_register tmp = srcreg;
 	int i;
@@ -114,47 +115,19 @@ static struct prog_instruction* track_used_srcreg(struct nqssadce_state* s,
 		deswz_source = sourced;
 	}
 
-	struct register_state *regstate = get_reg_state(s, inst->SrcReg[src].File, inst->SrcReg[src].Index);
+	struct register_state *regstate;
+
+	if (inst->SrcReg[src].RelAddr)
+		regstate = get_reg_state(s, PROGRAM_ADDRESS, 0);
+	else
+		regstate = get_reg_state(s, inst->SrcReg[src].File, inst->SrcReg[src].Index);
+
 	if (regstate)
 		regstate->Sourced |= deswz_source & 0xf;
 
 	return inst;
 }
 
-
-static void rewrite_depth_out(struct prog_instruction *inst)
-{
-	if (inst->DstReg.WriteMask & WRITEMASK_Z) {
-		inst->DstReg.WriteMask = WRITEMASK_W;
-	} else {
-		inst->DstReg.WriteMask = 0;
-		return;
-	}
-
-	switch (inst->Opcode) {
-	case OPCODE_FRC:
-	case OPCODE_MOV:
-		inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-		break;
-	case OPCODE_ADD:
-	case OPCODE_MAX:
-	case OPCODE_MIN:
-	case OPCODE_MUL:
-		inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-		inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
-		break;
-	case OPCODE_CMP:
-	case OPCODE_MAD:
-		inst->SrcReg[0] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[0]);
-		inst->SrcReg[1] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[1]);
-		inst->SrcReg[2] = lmul_swizzle(SWIZZLE_ZZZZ, inst->SrcReg[2]);
-		break;
-	default:
-		// Scalar instructions needn't be reswizzled
-		break;
-	}
-}
-
 static void unalias_srcregs(struct prog_instruction *inst, GLuint oldindex, GLuint newindex)
 {
 	int nsrc = _mesa_num_inst_src_regs(inst->Opcode);
@@ -189,11 +162,6 @@ static void process_instruction(struct nqssadce_state* s)
 		return;
 
 	if (inst->Opcode != OPCODE_KIL) {
-		if (s->Descr->RewriteDepthOut) {
-			if (inst->DstReg.File == PROGRAM_OUTPUT && inst->DstReg.Index == FRAG_RESULT_DEPTH)
-				rewrite_depth_out(inst);
-		}
-
 		struct register_state *regstate = get_reg_state(s, inst->DstReg.File, inst->DstReg.Index);
 		if (!regstate) {
 			_mesa_problem(s->Ctx, "NqssaDce: bad destination register (%i[%i])\n",
@@ -217,6 +185,7 @@ static void process_instruction(struct nqssadce_state* s)
 	 * might change the instruction stream under us, so we have
 	 * to be careful with the inst pointer. */
 	switch (inst->Opcode) {
+	case OPCODE_ARL:
 	case OPCODE_DDX:
 	case OPCODE_DDY:
 	case OPCODE_FRC:
@@ -227,6 +196,8 @@ static void process_instruction(struct nqssadce_state* s)
 	case OPCODE_MAX:
 	case OPCODE_MIN:
 	case OPCODE_MUL:
+	case OPCODE_SGE:
+	case OPCODE_SLT:
 		inst = track_used_srcreg(s, inst, 0, inst->DstReg.WriteMask);
 		inst = track_used_srcreg(s, inst, 1, inst->DstReg.WriteMask);
 		break;
@@ -258,12 +229,51 @@ static void process_instruction(struct nqssadce_state* s)
 	case OPCODE_TXP:
 		inst = track_used_srcreg(s, inst, 0, 0xf);
 		break;
+	case OPCODE_DST:
+		inst = track_used_srcreg(s, inst, 0, 0x6);
+		inst = track_used_srcreg(s, inst, 1, 0xa);
+		break;
+	case OPCODE_EXP:
+	case OPCODE_LOG:
+	case OPCODE_POW:
+		inst = track_used_srcreg(s, inst, 0, 0x3);
+		break;
+	case OPCODE_LIT:
+		inst = track_used_srcreg(s, inst, 0, 0xb);
+		break;
 	default:
 		_mesa_problem(s->Ctx, "NqssaDce: Unknown opcode %d\n", inst->Opcode);
 		return;
 	}
 }
 
+static void calculateInputsOutputs(struct gl_program *p)
+{
+	struct prog_instruction *inst;
+	GLuint InputsRead, OutputsWritten;
+
+	inst = p->Instructions;
+	InputsRead = 0;
+	OutputsWritten = 0;
+	while (inst->Opcode != OPCODE_END)
+	{
+		int i, num_src_regs;
+
+		num_src_regs = _mesa_num_inst_src_regs(inst->Opcode);
+		for (i = 0; i < num_src_regs; ++i) {
+			if (inst->SrcReg[i].File == PROGRAM_INPUT)
+				InputsRead |= 1 << inst->SrcReg[i].Index;
+		}
+
+		if (inst->DstReg.File == PROGRAM_OUTPUT)
+			OutputsWritten |= 1 << inst->DstReg.Index;
+
+		++inst;
+	}
+
+	p->InputsRead = InputsRead;
+	p->OutputsWritten = OutputsWritten;
+}
 
 void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr)
 {
@@ -280,4 +290,6 @@ void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce
 		s.IP--;
 		process_instruction(&s);
 	}
+
+	calculateInputsOutputs(p);
 }
diff --git a/src/mesa/drivers/dri/r300/radeon_nqssadce.h b/src/mesa/drivers/dri/r300/radeon_nqssadce.h
index a4f94abcb62..8626f21c25e 100644
--- a/src/mesa/drivers/dri/r300/radeon_nqssadce.h
+++ b/src/mesa/drivers/dri/r300/radeon_nqssadce.h
@@ -58,6 +58,7 @@ struct nqssadce_state {
 	 */
 	struct register_state Temps[MAX_PROGRAM_TEMPS];
 	struct register_state Outputs[VERT_RESULT_MAX];
+	struct register_state Address;
 };
 
 
@@ -83,14 +84,10 @@ struct radeon_nqssadce_descr {
 	 */
 	void (*BuildSwizzle)(struct nqssadce_state*, struct prog_dst_register dst, struct prog_src_register src);
 
-	/**
-	 * Rewrite instructions that write to DEPR.z to write to DEPR.w
-	 * instead (rewriting is done *before* the WriteMask test).
-	 */
-	GLboolean RewriteDepthOut;
 	void *Data;
 };
 
 void radeonNqssaDce(GLcontext *ctx, struct gl_program *p, struct radeon_nqssadce_descr* descr);
+struct prog_src_register lmul_swizzle(GLuint swizzle, struct prog_src_register srcreg);
 
 #endif /* __RADEON_PROGRAM_NQSSADCE_H_ */
diff --git a/src/mesa/drivers/dri/r300/radeon_program.h b/src/mesa/drivers/dri/r300/radeon_program.h
index b411f69bc88..88474d43a22 100644
--- a/src/mesa/drivers/dri/r300/radeon_program.h
+++ b/src/mesa/drivers/dri/r300/radeon_program.h
@@ -52,6 +52,38 @@ enum {
 #define SWIZZLE_0000 MAKE_SWIZZLE4(SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO, SWIZZLE_ZERO)
 #define SWIZZLE_1111 MAKE_SWIZZLE4(SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE, SWIZZLE_ONE)
 
+static inline GLuint get_swz(GLuint swz, GLuint idx)
+{
+	if (idx & 0x4)
+		return idx;
+	return GET_SWZ(swz, idx);
+}
+
+static inline GLuint combine_swizzles4(GLuint src, GLuint swz_x, GLuint swz_y, GLuint swz_z, GLuint swz_w)
+{
+	GLuint ret = 0;
+
+	ret |= get_swz(src, swz_x);
+	ret |= get_swz(src, swz_y) << 3;
+	ret |= get_swz(src, swz_z) << 6;
+	ret |= get_swz(src, swz_w) << 9;
+
+	return ret;
+}
+
+static inline GLuint combine_swizzles(GLuint src, GLuint swz)
+{
+	GLuint ret = 0;
+
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_X));
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_Y)) << 3;
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_Z)) << 6;
+	ret |= get_swz(src, GET_SWZ(swz, SWIZZLE_W)) << 9;
+
+	return ret;
+}
+
+
 /**
  * Transformation context that is passed to local transformations.
  *
diff --git a/src/mesa/drivers/dri/r300/radeon_program_pair.c b/src/mesa/drivers/dri/r300/radeon_program_pair.c
index 2e21f7bf666..d6fb474cf23 100644
--- a/src/mesa/drivers/dri/r300/radeon_program_pair.c
+++ b/src/mesa/drivers/dri/r300/radeon_program_pair.c
@@ -35,7 +35,7 @@
 
 #include "radeon_program_pair.h"
 
-#include "radeon_context.h"
+#include "radeon_common.h"
 
 #include "shader/prog_print.h"
 
@@ -47,7 +47,6 @@
 
 struct pair_state_instruction {
 	GLuint IsTex:1; /**< Is a texture instruction */
-	GLuint IsOutput:1; /**< Is output instruction */
 	GLuint NeedRGB:1; /**< Needs the RGB ALU */
 	GLuint NeedAlpha:1; /**< Needs the Alpha ALU */
 	GLuint IsTranscendent:1; /**< Is a special transcendent instruction */
@@ -124,7 +123,6 @@ struct pair_state {
 	GLboolean Debug;
 	GLboolean Verbose;
 	void *UserData;
-	GLubyte NumKillInsts;
 
 	/**
 	 * Translate Mesa registers to hardware registers
@@ -151,11 +149,6 @@ struct pair_state {
 	struct pair_state_instruction *ReadyTEX;
 
 	/**
-	 * Linked list of deferred instructions
-	 */
-	struct pair_state_instruction *DeferredInsts;
-
-	/**
 	 * Pool of @ref reg_value structures for fast allocation.
 	 */
 	struct reg_value *ValuePool;
@@ -238,9 +231,7 @@ static void instruction_ready(struct pair_state *s, int ip)
 	if (s->Verbose)
 		_mesa_printf("instruction_ready(%i)\n", ip);
 
-	if (s->NumKillInsts > 0 && pairinst->IsOutput)
-		add_pairinst_to_list(&s->DeferredInsts, pairinst);
-	else if (pairinst->IsTex)
+	if (pairinst->IsTex)
 		add_pairinst_to_list(&s->ReadyTEX, pairinst);
 	else if (!pairinst->NeedAlpha)
 		add_pairinst_to_list(&s->ReadyRGB, pairinst);
@@ -348,8 +339,6 @@ static void classify_instruction(struct pair_state *s,
 		error("Unknown opcode %d\n", inst->Opcode);
 		break;
 	}
-
-	pairinst->IsOutput = (inst->DstReg.File == PROGRAM_OUTPUT);
 }
 
 
@@ -613,16 +602,14 @@ static void emit_all_tex(struct pair_state *s)
 		struct prog_instruction *inst = s->Program->Instructions + ip;
 		commit_instruction(s, ip);
 
-		if (inst->Opcode == OPCODE_KIL)
-			--s->NumKillInsts;
-		else
+		if (inst->Opcode != OPCODE_KIL)
 			inst->DstReg.Index = get_hw_reg(s, inst->DstReg.File, inst->DstReg.Index);
-
 		inst->SrcReg[0].Index = get_hw_reg(s, inst->SrcReg[0].File, inst->SrcReg[0].Index);
 
 		if (s->Debug) {
 			_mesa_printf("   ");
 			_mesa_print_instruction(inst);
+			fflush(stdout);
 		}
 		s->Error = s->Error || !s->Handler->EmitTex(s->UserData, inst);
 	}
@@ -875,17 +862,6 @@ static void emit_alu(struct pair_state *s)
 	s->Error = s->Error || !s->Handler->EmitPaired(s->UserData, &pair);
 }
 
-static GLubyte countKillInsts(struct gl_program *prog)
-{
-	GLubyte i, count = 0;
-
-	for (i = 0; i < prog->NumInstructions; ++i) {
-		if (prog->Instructions[i].Opcode == OPCODE_KIL)
-			++count;
-	}
-
-	return count;
-}
 
 GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 	const struct radeon_pair_handler* handler, void *userdata)
@@ -894,12 +870,11 @@ GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 
 	_mesa_bzero(&s, sizeof(s));
 	s.Ctx = ctx;
-	s.Program = program;
+	s.Program = _mesa_clone_program(ctx, program);
 	s.Handler = handler;
 	s.UserData = userdata;
 	s.Debug = (RADEON_DEBUG & DEBUG_PIXEL) ? GL_TRUE : GL_FALSE;
 	s.Verbose = GL_FALSE && s.Debug;
-	s.NumKillInsts = countKillInsts(program);
 
 	s.Instructions = (struct pair_state_instruction*)_mesa_calloc(
 		sizeof(struct pair_state_instruction)*s.Program->NumInstructions);
@@ -918,21 +893,6 @@ GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 		if (s.ReadyTEX)
 			emit_all_tex(&s);
 
-		if (!s.NumKillInsts) {
-			struct pair_state_instruction *pairinst = s.DeferredInsts;
-			while (pairinst) {
-				if (!pairinst->NeedAlpha)
-					add_pairinst_to_list(&s.ReadyRGB, pairinst);
-				else if (!pairinst->NeedRGB)
-					add_pairinst_to_list(&s.ReadyAlpha, pairinst);
-				else
-					add_pairinst_to_list(&s.ReadyFullALU, pairinst);
-
-				pairinst = pairinst->NextReady;
-			}
-			s.DeferredInsts = NULL;
-		}
-
 		while(s.ReadyFullALU || s.ReadyRGB || s.ReadyAlpha)
 			emit_alu(&s);
 	}
@@ -944,6 +904,8 @@ GLboolean radeonPairProgram(GLcontext *ctx, struct gl_program *program,
 	_mesa_free(s.ValuePool);
 	_mesa_free(s.ReaderPool);
 
+	_mesa_reference_program(ctx, &s.Program, NULL);
+
 	return !s.Error;
 }
 
diff --git a/src/mesa/drivers/dri/r300/radeon_span.c b/src/mesa/drivers/dri/r300/radeon_span.c
deleted file mode 100644
index 16f9fb99e67..00000000000
--- a/src/mesa/drivers/dri/r300/radeon_span.c
+++ /dev/null
@@ -1,349 +0,0 @@
-/**************************************************************************
-
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <[email protected]>
- *   Gareth Hughes <[email protected]>
- *   Keith Whitwell <[email protected]>
- *
- */
-
-#include "main/glheader.h"
-#include "swrast/swrast.h"
-
-#include "r300_state.h"
-#include "radeon_ioctl.h"
-#include "r300_ioctl.h"
-#include "radeon_span.h"
-
-#include "drirenderbuffer.h"
-
-#define DBG 0
-
-/*
- * Note that all information needed to access pixels in a renderbuffer
- * should be obtained through the gl_renderbuffer parameter, not per-context
- * information.
- */
-#define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
-
-#define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
-
-#define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
-
-#define Y_FLIP(Y) (bottom - (Y))
-
-#define HW_LOCK()
-
-#define HW_UNLOCK()
-
-/* ================================================================
- * Color buffer
- */
-
-/* 16 bit, RGB565 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_RGB
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_5_6_5
-
-#define TAG(x)    radeon##x##_RGB565
-#define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
-#include "spantmp2.h"
-
-/* 32 bit, ARGB8888 color spanline and pixel functions
- */
-#define SPANTMP_PIXEL_FMT GL_BGRA
-#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
-
-#define TAG(x)    radeon##x##_ARGB8888
-#define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
-#include "spantmp2.h"
-
-/* ================================================================
- * Depth buffer
- */
-
-/* The Radeon family has depth tiling on all the time, so we have to convert
- * the x,y coordinates into the memory bus address (mba) in the same
- * manner as the engine.  In each case, the linear block address (ba)
- * is calculated, and then wired with x and y to produce the final
- * memory address.
- * The chip will do address translation on its own if the surface registers
- * are set up correctly. It is not quite enough to get it working with hyperz
- * too...
- */
-
-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 4 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0..1] = 0           */
-
-#ifdef COMPILE_R300
-		ba = (y / 8) * (pitch / 8) + (x / 8);
-#else
-		ba = (y / 16) * (pitch / 16) + (x / 16);
-#endif
-
-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-static INLINE GLuint
-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 2 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0]    = 0           */
-
-		ba = (y / 16) * (pitch / 32) + (x / 32);
-
-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-/* 16-bit depth buffer functions
- */
-#define VALUE_TYPE GLushort
-
-#define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
-
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
-
-#define TAG(x) radeon##x##_z16
-#include "depthtmp.h"
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- *
- * Careful: It looks like the R300 uses ZZZS byte order while the R200
- * uses SZZZ for 24 bit depth, 8 bit stencil mode.
- */
-#define VALUE_TYPE GLuint
-
-#ifdef COMPILE_R300
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x000000ff;							\
-   tmp |= ((d << 8) & 0xffffff00);					\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#else
-#define WRITE_DEPTH( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xff000000;							\
-   tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#endif
-
-#ifdef COMPILE_R300
-#define READ_DEPTH( d, _x, _y )						\
-  do { \
-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
-					 _y + yo )) & 0xffffff00) >> 8; \
-  }while(0)
-#else
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
-#endif
-
-#define TAG(x) radeon##x##_z24_s8
-#include "depthtmp.h"
-
-/* ================================================================
- * Stencil buffer
- */
-
-/* 24 bit depth, 8 bit stencil depthbuffer functions
- */
-#ifdef COMPILE_R300
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0xffffff00;							\
-   tmp |= (d) & 0xff;							\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#else
-#define WRITE_STENCIL( _x, _y, d )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   tmp &= 0x00ffffff;							\
-   tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
-} while (0)
-#endif
-
-#ifdef COMPILE_R300
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   d = tmp & 0x000000ff;						\
-} while (0)
-#else
-#define READ_STENCIL( d, _x, _y )					\
-do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
-   d = (tmp & 0xff000000) >> 24;					\
-} while (0)
-#endif
-
-#define TAG(x) radeon##x##_z24_s8
-#include "stenciltmp.h"
-
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
-
-static void radeonSpanRenderStart(GLcontext * ctx)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-#ifdef COMPILE_R300
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-	R300_FIREVERTICES(r300);
-#else
-	RADEON_FIREVERTICES(rmesa);
-#endif
-	LOCK_HARDWARE(rmesa);
-	radeonWaitForIdleLocked(rmesa);
-
-	/* Read the first pixel in the frame buffer.  This should
-	 * be a noop, right?  In fact without this conform fails as reading
-	 * from the framebuffer sometimes produces old results -- the
-	 * on-card read cache gets mixed up and doesn't notice that the
-	 * framebuffer has been updated.
-	 *
-	 * Note that we should probably be reading some otherwise unused
-	 * region of VRAM, otherwise we might get incorrect results when
-	 * reading pixels from the top left of the screen.
-	 *
-	 * I found this problem on an R420 with glean's texCube test.
-	 * Note that the R200 span code also *writes* the first pixel in the
-	 * framebuffer, but I've found this to be unnecessary.
-	 *  -- Nicolai Hähnle, June 2008
-	 */
-	{
-		int p;
-		driRenderbuffer *drb =
-			(driRenderbuffer *) ctx->WinSysDrawBuffer->_ColorDrawBuffers[0];
-		volatile int *buf =
-			(volatile int *)(rmesa->dri.screen->pFB + drb->offset);
-		p = *buf;
-	}
-}
-
-static void radeonSpanRenderFinish(GLcontext * ctx)
-{
-	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-	_swrast_flush(ctx);
-	UNLOCK_HARDWARE(rmesa);
-}
-
-void radeonInitSpanFuncs(GLcontext * ctx)
-{
-	struct swrast_device_driver *swdd =
-	    _swrast_GetDeviceDriverReference(ctx);
-	swdd->SpanRenderStart = radeonSpanRenderStart;
-	swdd->SpanRenderFinish = radeonSpanRenderFinish;
-}
-
-/**
- * Plug in the Get/Put routines for the given driRenderbuffer.
- */
-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
-{
-	if (drb->Base.InternalFormat == GL_RGBA) {
-		if (vis->redBits == 5 && vis->greenBits == 6
-		    && vis->blueBits == 5) {
-			radeonInitPointers_RGB565(&drb->Base);
-		} else {
-			radeonInitPointers_ARGB8888(&drb->Base);
-		}
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-		radeonInitDepthPointers_z16(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-		radeonInitDepthPointers_z24_s8(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&drb->Base);
-	}
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_state.c b/src/mesa/drivers/dri/r300/radeon_state.c
deleted file mode 100644
index c401da6c544..00000000000
--- a/src/mesa/drivers/dri/r300/radeon_state.c
+++ /dev/null
@@ -1,244 +0,0 @@
-/**************************************************************************
-
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
-
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <[email protected]>
- */
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/api_arrayelt.h"
-#include "main/enums.h"
-#include "main/framebuffer.h"
-#include "main/colormac.h"
-#include "main/light.h"
-
-#include "swrast/swrast.h"
-#include "vbo/vbo.h"
-#include "tnl/tnl.h"
-#include "tnl/t_pipeline.h"
-#include "swrast_setup/swrast_setup.h"
-
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
-#include "r300_ioctl.h"
-
-
-/* =============================================================
- * Scissoring
- */
-
-static GLboolean intersect_rect(drm_clip_rect_t * out,
-				drm_clip_rect_t * a, drm_clip_rect_t * b)
-{
-	*out = *a;
-	if (b->x1 > out->x1)
-		out->x1 = b->x1;
-	if (b->y1 > out->y1)
-		out->y1 = b->y1;
-	if (b->x2 < out->x2)
-		out->x2 = b->x2;
-	if (b->y2 < out->y2)
-		out->y2 = b->y2;
-	if (out->x1 >= out->x2)
-		return GL_FALSE;
-	if (out->y1 >= out->y2)
-		return GL_FALSE;
-	return GL_TRUE;
-}
-
-void radeonRecalcScissorRects(radeonContextPtr radeon)
-{
-	drm_clip_rect_t *out;
-	int i;
-
-	/* Grow cliprect store?
-	 */
-	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
-		while (radeon->state.scissor.numAllocedClipRects <
-		       radeon->numClipRects) {
-			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
-			radeon->state.scissor.numAllocedClipRects *= 2;
-		}
-
-		if (radeon->state.scissor.pClipRects)
-			FREE(radeon->state.scissor.pClipRects);
-
-		radeon->state.scissor.pClipRects =
-		    MALLOC(radeon->state.scissor.numAllocedClipRects *
-			   sizeof(drm_clip_rect_t));
-
-		if (radeon->state.scissor.pClipRects == NULL) {
-			radeon->state.scissor.numAllocedClipRects = 0;
-			return;
-		}
-	}
-
-	out = radeon->state.scissor.pClipRects;
-	radeon->state.scissor.numClipRects = 0;
-
-	for (i = 0; i < radeon->numClipRects; i++) {
-		if (intersect_rect(out,
-				   &radeon->pClipRects[i],
-				   &radeon->state.scissor.rect)) {
-			radeon->state.scissor.numClipRects++;
-			out++;
-		}
-	}
-}
-
-void radeonUpdateScissor(GLcontext* ctx)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	if (radeon->dri.drawable) {
-		__DRIdrawablePrivate *dPriv = radeon->dri.drawable;
-		int x1 = dPriv->x + ctx->Scissor.X;
-		int y1 = dPriv->y + dPriv->h - (ctx->Scissor.Y + ctx->Scissor.Height);
-
-		radeon->state.scissor.rect.x1 = x1;
-		radeon->state.scissor.rect.y1 = y1;
-		radeon->state.scissor.rect.x2 = x1 + ctx->Scissor.Width;
-		radeon->state.scissor.rect.y2 = y1 + ctx->Scissor.Height;
-
-		radeonRecalcScissorRects(radeon);
-	}
-}
-
-static void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
-{
-	if (ctx->Scissor.Enabled) {
-		/* We don't pipeline cliprect changes */
-		r300Flush(ctx);
-		radeonUpdateScissor(ctx);
-	}
-}
-
-
-/**
- * Update cliprects and scissors.
- */
-void radeonSetCliprects(radeonContextPtr radeon)
-{
-	__DRIdrawablePrivate *const drawable = radeon->dri.drawable;
-	__DRIdrawablePrivate *const readable = radeon->dri.readable;
-	GLframebuffer *const draw_fb = (GLframebuffer*)drawable->driverPrivate;
-	GLframebuffer *const read_fb = (GLframebuffer*)readable->driverPrivate;
-
-	if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-		/* Can't ignore 2d windows if we are page flipping. */
-		if (drawable->numBackClipRects == 0 || radeon->doPageFlip ||
-		    radeon->sarea->pfCurrentPage == 1) {
-			radeon->numClipRects = drawable->numClipRects;
-			radeon->pClipRects = drawable->pClipRects;
-		} else {
-			radeon->numClipRects = drawable->numBackClipRects;
-			radeon->pClipRects = drawable->pBackClipRects;
-		}
-	} else {
-		/* front buffer (or none, or multiple buffers */
-		radeon->numClipRects = drawable->numClipRects;
-		radeon->pClipRects = drawable->pClipRects;
-	}
-
-	if ((draw_fb->Width != drawable->w) ||
-	    (draw_fb->Height != drawable->h)) {
-		_mesa_resize_framebuffer(radeon->glCtx, draw_fb,
-					 drawable->w, drawable->h);
-		draw_fb->Initialized = GL_TRUE;
-	}
-
-	if (drawable != readable) {
-		if ((read_fb->Width != readable->w) ||
-		    (read_fb->Height != readable->h)) {
-			_mesa_resize_framebuffer(radeon->glCtx, read_fb,
-						 readable->w, readable->h);
-			read_fb->Initialized = GL_TRUE;
-		}
-	}
-
-	if (radeon->state.scissor.enabled)
-		radeonRecalcScissorRects(radeon);
-
-	radeon->lastStamp = drawable->lastStamp;
-}
-
-
-/**
- * Handle common enable bits.
- * Called as a fallback by r200Enable/r300Enable.
- */
-void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state)
-{
-	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
-
-	switch(cap) {
-	case GL_SCISSOR_TEST:
-		/* We don't pipeline cliprect & scissor changes */
-		r300Flush(ctx);
-
-		radeon->state.scissor.enabled = state;
-		radeonUpdateScissor(ctx);
-		break;
-
-	default:
-		return;
-	}
-}
-
-
-/**
- * Initialize default state.
- * This function is called once at context init time from
- * r200InitState/r300InitState
- */
-void radeonInitState(radeonContextPtr radeon)
-{
-	radeon->Fallback = 0;
-
-	if (radeon->glCtx->Visual.doubleBufferMode && radeon->sarea->pfCurrentPage == 0) {
-		radeon->state.color.drawOffset = radeon->radeonScreen->backOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->backPitch;
-	} else {
-		radeon->state.color.drawOffset = radeon->radeonScreen->frontOffset;
-		radeon->state.color.drawPitch = radeon->radeonScreen->frontPitch;
-	}
-}
-
-
-/**
- * Initialize common state functions.
- * Called by r200InitStateFuncs/r300InitStateFuncs
- */
-void radeonInitStateFuncs(struct dd_function_table *functions)
-{
-	functions->Scissor = radeonScissor;
-}
diff --git a/src/mesa/drivers/dri/r300/radeon_state.h b/src/mesa/drivers/dri/r300/radeon_state.h
deleted file mode 100644
index 821cb40c7eb..00000000000
--- a/src/mesa/drivers/dri/r300/radeon_state.h
+++ /dev/null
@@ -1,43 +0,0 @@
-/*
-Copyright (C) 2004 Nicolai Haehnle.  All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation the rights to use, copy, modify, merge, publish,
-distribute, sublicense, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
-IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
-LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
-OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
-WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Nicolai Haehnle <[email protected]>
- */
-
-#ifndef __RADEON_STATE_H__
-#define __RADEON_STATE_H__
-
-extern void radeonRecalcScissorRects(radeonContextPtr radeon);
-extern void radeonSetCliprects(radeonContextPtr radeon);
-extern void radeonUpdateScissor(GLcontext* ctx);
-
-extern void radeonEnable(GLcontext* ctx, GLenum cap, GLboolean state);
-
-extern void radeonInitState(radeonContextPtr radeon);
-extern void radeonInitStateFuncs(struct dd_function_table* functions);
-
-#endif
diff --git a/src/mesa/drivers/dri/r600/Lindent b/src/mesa/drivers/dri/r600/Lindent
new file mode 100755
index 00000000000..7d8d8896e30
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/Lindent
@@ -0,0 +1,2 @@
+#!/bin/sh
+indent -npro -kr -i8 -ts8 -sob -l80 -ss -ncs "$@"
diff --git a/src/mesa/drivers/dri/r600/Makefile b/src/mesa/drivers/dri/r600/Makefile
new file mode 100644
index 00000000000..2f225f7cb5e
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/Makefile
@@ -0,0 +1,121 @@
+# src/mesa/drivers/dri/r300/Makefile
+
+TOP = ../../../../..
+include $(TOP)/configs/current
+
+CFLAGS += $(RADEON_CFLAGS)
+
+LIBNAME = r600_dri.so
+
+MINIGLX_SOURCES = server/radeon_dri.c
+
+ifeq ($(USING_EGL), 1)
+EGL_SOURCES = server/radeon_egl.c
+endif
+
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
+COMMON_SOURCES = \
+	../../common/driverfuncs.c \
+	../common/mm.c \
+	../common/utils.c \
+	../common/texmem.c \
+	../common/vblank.c \
+	../common/xmlconfig.c \
+	../common/dri_util.c
+
+RADEON_COMMON_SOURCES = \
+	radeon_texture.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_dma.c \
+	radeon_lock.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_mipmap_tree.c \
+	radeon_span.c \
+	radeon_fbo.c
+
+DRIVER_SOURCES = \
+		 radeon_screen.c \
+		 r600_context.c \
+		 r600_cmdbuf.c \
+		 r600_emit.c       \
+		 r700_assembler.c  \
+		 r700_fragprog.c \
+		 r700_vertprog.c \
+		 r700_shader.c \
+		 r700_shaderinst.c \
+		 r700_ioctl.c \
+		 r700_oglprog.c \
+		 r700_chip.c     \
+		 r700_state.c    \
+		 r700_clear.c    \
+		 r700_render.c   \
+		 r600_tex.c      \
+		 r600_texstate.c      \
+		 r700_debug.c    \
+		 $(RADEON_COMMON_SOURCES) \
+		 $(EGL_SOURCES) \
+		 $(CS_SOURCES)
+
+C_SOURCES = $(COMMON_SOURCES) $(DRIVER_SOURCES)
+
+DRIVER_DEFINES = -DCOMPILE_R600 -DR200_MERGED=0 \
+	-DRADEON_COMMON=1 -DRADEON_COMMON_FOR_R600 \
+#	-DRADEON_BO_TRACK \
+	-Wall
+
+SYMLINKS = \
+	server/radeon_dri.c \
+	server/radeon_dri.h \
+	server/radeon.h \
+	server/radeon_macros.h \
+	server/radeon_reg.h \
+	server/radeon_egl.c
+
+COMMON_SYMLINKS = \
+	radeon_chipset.h \
+	radeon_screen.c \
+	radeon_screen.h \
+	radeon_span.h \
+	radeon_span.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_bo_legacy.h \
+	radeon_cs_legacy.h \
+	radeon_bocs_wrapper.h \
+	radeon_lock.c \
+	radeon_lock.h \
+	radeon_common.c \
+	radeon_common.h \
+	radeon_common_context.c \
+	radeon_common_context.h \
+	radeon_cmdbuf.h \
+	radeon_dma.c \
+	radeon_dma.h \
+	radeon_mipmap_tree.c \
+	radeon_mipmap_tree.h \
+	radeon_texture.c \
+	radeon_texture.h \
+	radeon_fbo.c \
+	$(CS_SOURCES)
+
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
+
+##### TARGETS #####
+
+include ../Makefile.template
+
+server:
+	mkdir -p server
+
+$(SYMLINKS): server
+	@[ -e $@ ] || ln -sf ../../radeon/$@ server/
+
+$(COMMON_SYMLINKS):
+	@[ -e $@ ] || ln -sf ../radeon/$@ ./
+
+symlinks: $(SYMLINKS) $(COMMON_SYMLINKS)
diff --git a/src/mesa/drivers/dri/r600/defaultendian.h b/src/mesa/drivers/dri/r600/defaultendian.h
new file mode 100644
index 00000000000..32caf32cd2e
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/defaultendian.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+
+#ifndef _DEFINEENDIAN_H_
+#define _DEFINEENDIAN_H_
+
+//We have to choose a reg bits orientation if there is no compile flag for it.
+#if defined(LITTLEENDIAN_CPU)
+#elif defined(BIGENDIAN_CPU)
+#else
+#define LITTLEENDIAN_CPU
+#endif
+
+#endif //_DEFINEENDIAN_H_
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.c b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
new file mode 100644
index 00000000000..74fec02584b
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.c
@@ -0,0 +1,555 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * Mostly coppied from \radeon\radeon_cs_legacy.c
+ */
+
+#include <errno.h>
+
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/simple_list.h"
+#include "swrast/swrast.h"
+
+#include "drm.h"
+#include "radeon_drm.h"
+
+#include "r600_context.h"
+#include "radeon_reg.h"
+#include "r600_cmdbuf.h"
+#include "r600_emit.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_mipmap_tree.h"
+#include "radeon_reg.h"
+
+
+
+static struct radeon_cs * r600_cs_create(struct radeon_cs_manager *csm,
+                                   uint32_t ndw)
+{
+    struct radeon_cs *cs;
+
+    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
+    if (cs == NULL) {
+        return NULL;
+    }
+    cs->csm = csm;
+    cs->ndw = (ndw + 0x3FF) & (~0x3FF);
+    cs->packets = (uint32_t*)malloc(4*cs->ndw);
+    if (cs->packets == NULL) {
+        free(cs);
+        return NULL;
+    }
+    cs->relocs_total_size = 0;
+    return cs;
+}
+
+int r600_cs_write_reloc(struct radeon_cs *cs,
+                        struct radeon_bo *bo,
+                        uint32_t read_domain,
+                        uint32_t write_domain,
+                        uint32_t flags,
+                        offset_modifiers* poffset_mod)
+{
+    struct r600_cs_reloc_legacy *relocs;
+    int i;
+
+    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+    /* check domains */
+    if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
+        /* in one CS a bo can only be in read or write domain but not
+         * in read & write domain at the same sime
+         */
+        return -EINVAL;
+    }
+    if (read_domain == RADEON_GEM_DOMAIN_CPU) {
+        return -EINVAL;
+    }
+    if (write_domain == RADEON_GEM_DOMAIN_CPU) {
+        return -EINVAL;
+    }
+    /* check if bo is already referenced */
+    for(i = 0; i < cs->crelocs; i++) {
+        uint32_t *indices;
+        uint32_t *reloc_indices;
+
+        if (relocs[i].base.bo->handle == bo->handle) {
+            /* Check domains must be in read or write. As we check already
+             * checked that in argument one of the read or write domain was
+             * set we only need to check that if previous reloc as the read
+             * domain set then the read_domain should also be set for this
+             * new relocation.
+             */
+            if (relocs[i].base.read_domain && !read_domain) {
+                return -EINVAL;
+            }
+            if (relocs[i].base.write_domain && !write_domain) {
+                return -EINVAL;
+            }
+            relocs[i].base.read_domain |= read_domain;
+            relocs[i].base.write_domain |= write_domain;
+            /* save indice */
+            relocs[i].cindices++;
+            indices = (uint32_t*)realloc(relocs[i].indices,
+                                         relocs[i].cindices * 4);
+            reloc_indices = (uint32_t*)realloc(relocs[i].reloc_indices,
+                                               relocs[i].cindices * 4);
+            if ( (indices == NULL) || (reloc_indices == NULL) ) {
+                relocs[i].cindices -= 1;
+                return -ENOMEM;
+            }
+            relocs[i].indices = indices;
+            relocs[i].reloc_indices = reloc_indices;
+            relocs[i].indices[relocs[i].cindices - 1] = cs->cdw - 1;
+            relocs[i].reloc_indices[relocs[i].cindices - 1] = cs->section_cdw;
+            cs->section_ndw += 2;
+            cs->section_cdw += 2;
+
+            relocs[i].offset_mod.shift     = poffset_mod->shift;
+            relocs[i].offset_mod.shiftbits = poffset_mod->shiftbits;
+            relocs[i].offset_mod.mask      = poffset_mod->mask;
+
+            return 0;
+        }
+    }
+    /* add bo to reloc */
+    relocs = (struct r600_cs_reloc_legacy*)
+             realloc(cs->relocs,
+                     sizeof(struct r600_cs_reloc_legacy) * (cs->crelocs + 1));
+    if (relocs == NULL) {
+        return -ENOMEM;
+    }
+    cs->relocs = relocs;
+    relocs[cs->crelocs].base.bo = bo;
+    relocs[cs->crelocs].base.read_domain = read_domain;
+    relocs[cs->crelocs].base.write_domain = write_domain;
+    relocs[cs->crelocs].base.flags = flags;
+    relocs[cs->crelocs].indices = (uint32_t*)malloc(4);
+    relocs[cs->crelocs].reloc_indices = (uint32_t*)malloc(4);
+    if ( (relocs[cs->crelocs].indices == NULL) || (relocs[cs->crelocs].reloc_indices == NULL) )
+    {
+        return -ENOMEM;
+    }
+    relocs[cs->crelocs].offset_mod.shift     = poffset_mod->shift;
+    relocs[cs->crelocs].offset_mod.shiftbits = poffset_mod->shiftbits;
+    relocs[cs->crelocs].offset_mod.mask      = poffset_mod->mask;
+
+    relocs[cs->crelocs].indices[0] = cs->cdw - 1;
+    relocs[cs->crelocs].reloc_indices[0] = cs->section_cdw;
+    cs->section_ndw += 2;
+    cs->section_cdw += 2;
+    relocs[cs->crelocs].cindices = 1;
+    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
+    cs->crelocs++;
+
+    radeon_bo_ref(bo);
+
+    return 0;
+}
+
+static int r600_cs_begin(struct radeon_cs *cs,
+                    uint32_t ndw,
+                    const char *file,
+                    const char *func,
+                    int line)
+{
+    if (cs->section) {
+        fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
+                cs->section_file, cs->section_func, cs->section_line);
+        fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+
+    if (cs->cdw + ndw + 32 > cs->ndw) { /* Left 32 DWORD (8 offset+pitch) spare room for reloc indices */
+        uint32_t tmp, *ptr;
+	int num = (ndw > 0x3FF) ? ndw : 0x3FF;
+
+        tmp = (cs->cdw + 1 + num) & (~num);
+        ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
+        if (ptr == NULL) {
+            return -ENOMEM;
+        }
+        cs->packets = ptr;
+        cs->ndw = tmp;
+    }
+
+    cs->section = 1;
+    cs->section_ndw = 0; 
+    cs->section_cdw = cs->cdw + ndw; /* start of reloc indices. */
+    cs->section_file = file;
+    cs->section_func = func;
+    cs->section_line = line;
+
+    return 0;
+}
+
+static int r600_cs_end(struct radeon_cs *cs,
+                  const char *file,
+                  const char *func,
+                  int line)
+
+{
+    if (!cs->section) {
+        fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    cs->section = 0;
+
+    if ( (cs->section_ndw + cs->cdw) != cs->section_cdw ) 
+    {
+        fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
+                cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
+        fprintf(stderr, "cs->section_ndw = %d, cs->cdw = %d, cs->section_cdw = %d \n",
+                cs->section_ndw, cs->cdw, cs->section_cdw);
+        fprintf(stderr, "CS section end at (%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+
+    cs->cdw = cs->section_cdw;
+    return 0;
+}
+
+static int r600_cs_process_relocs(struct radeon_cs *cs, 
+                                  uint32_t * reloc_chunk,
+                                  uint32_t * length_dw_reloc_chunk) 
+{
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_reloc_legacy *relocs;
+    int i, j, r;
+
+    uint32_t offset_dw = 0;
+
+    csm = (struct r600_cs_manager_legacy*)cs->csm;
+    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+restart:
+    for (i = 0; i < cs->crelocs; i++) 
+    {
+        for (j = 0; j < relocs[i].cindices; j++) 
+        {
+            uint32_t soffset, eoffset, asicoffset;
+
+            r = radeon_bo_legacy_validate(relocs[i].base.bo,
+                                           &soffset, &eoffset);
+	        if (r == -EAGAIN)
+            {
+	             goto restart;
+            }
+            if (r) 
+            {
+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                        relocs[i].base.bo, soffset, eoffset);
+                return r;
+            }
+            asicoffset = soffset;
+            if (asicoffset >= eoffset) 
+            {
+	      /*                radeon_bo_debug(relocs[i].base.bo, 12); */
+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                        relocs[i].base.bo, soffset, eoffset);
+                fprintf(stderr, "above end: %p 0x%08X 0x%08X\n",
+                        relocs[i].base.bo,
+                        cs->packets[relocs[i].indices[j]],
+                        eoffset);
+                exit(0);
+                return -EINVAL;
+            }
+            /* apply offset operator */
+            switch (relocs[i].offset_mod.shift)
+            {
+            case NO_SHIFT:
+                asicoffset = asicoffset & relocs[i].offset_mod.mask;
+                break;
+            case LEFT_SHIFT:
+                asicoffset = (asicoffset << relocs[i].offset_mod.shiftbits) & relocs[i].offset_mod.mask;
+                break;
+            case RIGHT_SHIFT:
+                asicoffset = (asicoffset >> relocs[i].offset_mod.shiftbits) & relocs[i].offset_mod.mask;
+                break;
+            default:
+                break;
+            };              
+
+            /* pkt3 nop header in ib chunk */
+            cs->packets[relocs[i].reloc_indices[j]] = 0xC0001000;
+
+            /* reloc index in ib chunk */
+            cs->packets[relocs[i].reloc_indices[j] + 1] = offset_dw;
+            
+            /* asic offset in reloc chunk */ /* see alex drm r600_nomm_relocate */
+            reloc_chunk[offset_dw] = asicoffset;
+            reloc_chunk[offset_dw + 3] = 0;
+
+            offset_dw += 4;
+        }
+    }
+
+    *length_dw_reloc_chunk = offset_dw;
+
+    return 0;
+}
+
+static int r600_cs_set_age(struct radeon_cs *cs) /* -------------- */
+{
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct r600_cs_reloc_legacy *relocs;
+    int i;
+
+    relocs = (struct r600_cs_reloc_legacy *)cs->relocs;
+    for (i = 0; i < cs->crelocs; i++) {
+        radeon_bo_legacy_pending(relocs[i].base.bo, csm->pending_age);
+        radeon_bo_unref(relocs[i].base.bo);
+    }
+    return 0;
+}
+
+static void dump_cmdbuf(struct radeon_cs *cs)
+{
+	int i;
+	fprintf(stderr,"--start--\n");
+	for (i = 0; i < cs->cdw; i++){
+		fprintf(stderr,"0x%08x\n", cs->packets[i]);
+	}
+	fprintf(stderr,"--end--\n");
+
+}
+
+static int r600_cs_emit(struct radeon_cs *cs)
+{
+    struct r600_cs_manager_legacy *csm = (struct r600_cs_manager_legacy*)cs->csm;
+    struct drm_radeon_cs       cs_cmd;
+    struct drm_radeon_cs_chunk cs_chunk[2];
+    drm_radeon_cmd_buffer_t cmd; 
+    /* drm_r300_cmd_header_t age; */
+    uint32_t length_dw_reloc_chunk;
+    uint64_t ull;
+    uint64_t chunk_ptrs[2];
+    uint32_t reloc_chunk[128]; 
+    int r;
+    int retry = 0;
+
+    /* TODO : put chip level things here if need. */
+    /* csm->ctx->vtbl.emit_cs_header(cs, csm->ctx); */
+
+    BATCH_LOCALS(csm->ctx);
+    drm_radeon_getparam_t gp;
+    uint32_t              current_scratchx_age;
+
+    gp.param = RADEON_PARAM_LAST_CLEAR;
+    gp.value = (int *)&current_scratchx_age;
+    r = drmCommandWriteRead(cs->csm->fd, 
+                            DRM_RADEON_GETPARAM,
+                            &gp, 
+                            sizeof(gp));
+    if (r) 
+    {
+        fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, r);
+        exit(1);
+    }
+
+    csm->pending_age = 0;
+    csm->pending_count = 1;
+
+    current_scratchx_age++;
+    csm->pending_age = current_scratchx_age;
+
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
+    R600_OUT_BATCH((SCRATCH_REG2 - R600_SET_CONFIG_REG_OFFSET) >> 2);
+    R600_OUT_BATCH(current_scratchx_age);
+    END_BATCH();
+    COMMIT_BATCH();
+
+    //TODO ioctl to get back cs id assigned in drm
+    //csm->pending_age = cs_id_back;
+    
+    r = r600_cs_process_relocs(cs, &(reloc_chunk[0]), &length_dw_reloc_chunk);
+    if (r) {
+        return 0;
+    }
+      
+    /* raw ib chunk */
+    cs_chunk[0].chunk_id   = RADEON_CHUNK_ID_IB;
+    cs_chunk[0].length_dw  = cs->cdw;
+    cs_chunk[0].chunk_data = (unsigned long)(cs->packets);
+
+    /* reloc chaunk */
+    cs_chunk[1].chunk_id   = RADEON_CHUNK_ID_RELOCS;
+    cs_chunk[1].length_dw  = length_dw_reloc_chunk;
+    cs_chunk[1].chunk_data = (unsigned long)&(reloc_chunk[0]);
+
+    chunk_ptrs[0] = (uint64_t)(unsigned long)&(cs_chunk[0]);
+    chunk_ptrs[1] = (uint64_t)(unsigned long)&(cs_chunk[1]);
+
+    cs_cmd.num_chunks = 2;
+    /* cs_cmd.cs_id      = 0; */
+    cs_cmd.chunks     = (uint64_t)(unsigned long)chunk_ptrs;
+
+    //dump_cmdbuf(cs);
+
+    do 
+    {
+        r = drmCommandWriteRead(cs->csm->fd, DRM_RADEON_CS, &cs_cmd, sizeof(cs_cmd));
+        retry++;
+    } while (r == -EAGAIN && retry < 1000);
+
+    if (r) {
+        return r;
+    }
+
+    r600_cs_set_age(cs);
+
+    cs->csm->read_used = 0;
+    cs->csm->vram_write_used = 0;
+    cs->csm->gart_write_used = 0;
+
+    return 0;
+}
+
+static void inline r600_cs_free_reloc(void *relocs_p, int crelocs)
+{
+    struct r600_cs_reloc_legacy *relocs = relocs_p;
+    int i;
+    if (!relocs_p)
+      return;
+    for (i = 0; i < crelocs; i++)
+    {
+        free(relocs[i].indices);
+        free(relocs[i].reloc_indices);
+    }
+}
+
+static int r600_cs_destroy(struct radeon_cs *cs)
+{
+    r600_cs_free_reloc(cs->relocs, cs->crelocs);
+    free(cs->relocs);
+    free(cs->packets);
+    free(cs);
+    return 0;
+}
+
+static int r600_cs_erase(struct radeon_cs *cs)
+{
+    r600_cs_free_reloc(cs->relocs, cs->crelocs);
+    free(cs->relocs);
+    cs->relocs_total_size = 0;
+    cs->relocs = NULL;
+    cs->crelocs = 0;
+    cs->cdw = 0;
+    cs->section = 0;
+    return 0;
+}
+
+static int r600_cs_need_flush(struct radeon_cs *cs)
+{
+    /* this function used to flush when the BO usage got to
+     * a certain size, now the higher levels handle this better */
+    return 0;
+}
+
+static void r600_cs_print(struct radeon_cs *cs, FILE *file)
+{
+}
+
+static struct radeon_cs_funcs  r600_cs_funcs = {
+    r600_cs_create,
+    r600_cs_write_reloc,
+    r600_cs_begin,
+    r600_cs_end,
+    r600_cs_emit,
+    r600_cs_destroy,
+    r600_cs_erase,
+    r600_cs_need_flush,
+    r600_cs_print
+};
+
+struct radeon_cs_manager * r600_radeon_cs_manager_legacy_ctor(struct radeon_context *ctx)
+{
+    struct r600_cs_manager_legacy *csm;
+
+    csm = (struct r600_cs_manager_legacy*)
+          calloc(1, sizeof(struct r600_cs_manager_legacy));
+    if (csm == NULL) {
+        return NULL;
+    }
+    csm->base.funcs = &r600_cs_funcs;
+    csm->base.fd = ctx->dri.fd;
+    csm->ctx = ctx;
+    csm->pending_age = 1;
+    return (struct radeon_cs_manager*)csm;
+}
+
+void r600InitCmdBuf(context_t *r600) /* from rcommonInitCmdBuf */
+{
+    radeonContextPtr rmesa = &r600->radeon;
+	
+    GLuint size;
+	/* Initialize command buffer */
+	size = 256 * driQueryOptioni(&rmesa->optionCache,
+				     "command_buffer_size");
+	if (size < 2 * rmesa->hw.max_state_size) {
+		size = 2 * rmesa->hw.max_state_size + 65535;
+	}
+	if (size > 64 * 256)
+		size = 64 * 256;
+
+	if (rmesa->radeonScreen->kernel_mm) {
+		int fd = rmesa->radeonScreen->driScreen->fd;
+		rmesa->cmdbuf.csm = radeon_cs_manager_gem_ctor(fd);
+	} else {
+		rmesa->cmdbuf.csm = r600_radeon_cs_manager_legacy_ctor(rmesa);
+	}
+	if (rmesa->cmdbuf.csm == NULL) {
+		/* FIXME: fatal error */
+		return;
+	}
+	rmesa->cmdbuf.cs = radeon_cs_create(rmesa->cmdbuf.csm, size);
+	assert(rmesa->cmdbuf.cs != NULL);
+	rmesa->cmdbuf.size = size;
+	
+	if (!rmesa->radeonScreen->kernel_mm) {
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, rmesa->radeonScreen->texSize[0]);
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, rmesa->radeonScreen->gartTextures.size);
+	} else {
+		struct drm_radeon_gem_info mminfo;
+
+		if (!drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO, &mminfo, sizeof(mminfo)))
+		{
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, mminfo.vram_visible);
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, mminfo.gart_size);
+		}
+	}
+}
+
diff --git a/src/mesa/drivers/dri/r600/r600_cmdbuf.h b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
new file mode 100644
index 00000000000..bd1ed7fdff0
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_cmdbuf.h
@@ -0,0 +1,229 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Nicolai Haehnle <[email protected]>
+ */
+
+#ifndef __R600_CMDBUF_H__
+#define __R600_CMDBUF_H__
+
+#include "r600_context.h"
+#include "r600_emit.h"
+
+#define RADEON_CP_PACKET3_NOP                       0xC0001000
+#define RADEON_CP_PACKET3_NEXT_CHAR                 0xC0001900
+#define RADEON_CP_PACKET3_PLY_NEXTSCAN              0xC0001D00
+#define RADEON_CP_PACKET3_SET_SCISSORS              0xC0001E00
+#define RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM     0xC0002300
+#define RADEON_CP_PACKET3_LOAD_MICROCODE            0xC0002400
+#define RADEON_CP_PACKET3_WAIT_FOR_IDLE             0xC0002600
+#define RADEON_CP_PACKET3_3D_DRAW_VBUF              0xC0002800
+#define RADEON_CP_PACKET3_3D_DRAW_IMMD              0xC0002900
+#define RADEON_CP_PACKET3_3D_DRAW_INDX              0xC0002A00
+#define RADEON_CP_PACKET3_LOAD_PALETTE              0xC0002C00
+#define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
+#define RADEON_CP_PACKET3_CNTL_PAINT                0xC0009100
+#define RADEON_CP_PACKET3_CNTL_BITBLT               0xC0009200
+#define RADEON_CP_PACKET3_CNTL_SMALLTEXT            0xC0009300
+#define RADEON_CP_PACKET3_CNTL_HOSTDATA_BLT         0xC0009400
+#define RADEON_CP_PACKET3_CNTL_POLYLINE             0xC0009500
+#define RADEON_CP_PACKET3_CNTL_POLYSCANLINES        0xC0009800
+#define RADEON_CP_PACKET3_CNTL_PAINT_MULTI          0xC0009A00
+#define RADEON_CP_PACKET3_CNTL_BITBLT_MULTI         0xC0009B00
+#define RADEON_CP_PACKET3_CNTL_TRANS_BITBLT         0xC0009C00
+
+/* r6xx/r7xx packet 3 type offsets */
+#define R600_SET_CONFIG_REG_OFFSET                  0x00008000
+#define R600_SET_CONFIG_REG_END                     0x0000ac00
+#define R600_SET_CONTEXT_REG_OFFSET                 0x00028000
+#define R600_SET_CONTEXT_REG_END                    0x00029000
+#define R600_SET_ALU_CONST_OFFSET                   0x00030000
+#define R600_SET_ALU_CONST_END                      0x00032000
+#define R600_SET_RESOURCE_OFFSET                    0x00038000
+#define R600_SET_RESOURCE_END                       0x0003c000
+#define R600_SET_SAMPLER_OFFSET                     0x0003c000
+#define R600_SET_SAMPLER_END                        0x0003cff0
+#define R600_SET_CTL_CONST_OFFSET                   0x0003cff0
+#define R600_SET_CTL_CONST_END                      0x0003e200
+#define R600_SET_LOOP_CONST_OFFSET                  0x0003e200
+#define R600_SET_LOOP_CONST_END                     0x0003e380
+#define R600_SET_BOOL_CONST_OFFSET                  0x0003e380
+#define R600_SET_BOOL_CONST_END                     0x00040000
+
+/* r6xx/r7xx packet 3 types */
+#define R600_IT_INDIRECT_BUFFER_END               0x00001700
+#define R600_IT_SET_PREDICATION                   0x00002000
+#define R600_IT_REG_RMW                           0x00002100
+#define R600_IT_COND_EXEC                         0x00002200
+#define R600_IT_PRED_EXEC                         0x00002300
+#define R600_IT_START_3D_CMDBUF                   0x00002400
+#define R600_IT_DRAW_INDEX_2                      0x00002700
+#define R600_IT_CONTEXT_CONTROL                   0x00002800
+#define R600_IT_DRAW_INDEX_IMMD_BE                0x00002900
+#define R600_IT_INDEX_TYPE                        0x00002A00
+#define R600_IT_DRAW_INDEX                        0x00002B00
+#define R600_IT_DRAW_INDEX_AUTO                   0x00002D00
+#define R600_IT_DRAW_INDEX_IMMD                   0x00002E00
+#define R600_IT_NUM_INSTANCES                     0x00002F00
+#define R600_IT_STRMOUT_BUFFER_UPDATE             0x00003400
+#define R600_IT_INDIRECT_BUFFER_MP                0x00003800
+#define R600_IT_MEM_SEMAPHORE                     0x00003900
+#define R600_IT_MPEG_INDEX                        0x00003A00
+#define R600_IT_WAIT_REG_MEM                      0x00003C00
+#define R600_IT_MEM_WRITE                         0x00003D00
+#define R600_IT_INDIRECT_BUFFER                   0x00003200
+#define R600_IT_CP_INTERRUPT                      0x00004000
+#define R600_IT_SURFACE_SYNC                      0x00004300
+#define R600_IT_ME_INITIALIZE                     0x00004400
+#define R600_IT_COND_WRITE                        0x00004500
+#define R600_IT_EVENT_WRITE                       0x00004600
+#define R600_IT_EVENT_WRITE_EOP                   0x00004700
+#define R600_IT_ONE_REG_WRITE                     0x00005700
+#define R600_IT_SET_CONFIG_REG                    0x00006800
+#define R600_IT_SET_CONTEXT_REG                   0x00006900
+#define R600_IT_SET_ALU_CONST                     0x00006A00
+#define R600_IT_SET_BOOL_CONST                    0x00006B00
+#define R600_IT_SET_LOOP_CONST                    0x00006C00
+#define R600_IT_SET_RESOURCE                      0x00006D00
+#define R600_IT_SET_SAMPLER                       0x00006E00
+#define R600_IT_SET_CTL_CONST                     0x00006F00
+#define R600_IT_SURFACE_BASE_UPDATE               0x00007300
+
+struct r600_cs_manager_legacy
+{
+    struct radeon_cs_manager    base;
+    struct radeon_context       *ctx;
+    /* hack for scratch stuff */
+    uint32_t                    pending_age;
+    uint32_t                    pending_count;
+};
+
+struct r600_cs_reloc_legacy {
+    struct radeon_cs_reloc  base;
+    uint32_t                cindices;
+    uint32_t                *indices;
+    uint32_t                *reloc_indices;
+    struct offset_modifiers offset_mod;
+};
+
+extern int r600_cs_write_reloc(struct radeon_cs *cs,
+                        struct radeon_bo *bo,
+                        uint32_t read_domain,
+                        uint32_t write_domain,
+                        uint32_t flags,
+                        offset_modifiers* poffset_mod);
+
+static inline void r600_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
+{
+    cs->packets[cs->cdw++] = dword;
+}
+
+struct radeon_cs_manager * r600_radeon_cs_manager_legacy_ctor(struct radeon_context *ctx);
+
+/**
+ * Write one dword to the command buffer.
+ */
+#define R600_OUT_BATCH(data) \
+	do { \
+        r600_cs_write_dword(b_l_rmesa->cmdbuf.cs, data);\
+	} while(0)
+
+/**
+ * Write n dwords from ptr to the command buffer.
+ */
+#define R600_OUT_BATCH_TABLE(ptr,n) \
+	do { \
+		int _i; \
+        for (_i=0; _i < n; _i++) {\
+            r600_cs_write_dword(b_l_rmesa->cmdbuf.cs, ptr[_i]);\
+        }\
+	} while(0)
+
+/**
+ * Write a relocated dword to the command buffer.
+ */
+#define R600_OUT_BATCH_RELOC(data, bo, offset, rd, wd, flags, offset_mod) 	\
+	do { 							\
+        if (0 && offset) {					\
+            fprintf(stderr, "(%s:%s:%d) offset : %d\n",		\
+            __FILE__, __FUNCTION__, __LINE__, offset);		\
+        }							\
+        r600_cs_write_dword(b_l_rmesa->cmdbuf.cs, offset);	\
+        r600_cs_write_reloc(b_l_rmesa->cmdbuf.cs, 		\
+                              bo, rd, wd, flags, offset_mod);		\
+	} while(0)
+
+/* R600/R700 */
+#define R600_OUT_BATCH_REGS(reg, num)					\
+do {								\
+	if ((reg) >= R600_SET_CONFIG_REG_OFFSET && (reg) < R600_SET_CONFIG_REG_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_CONFIG_REG_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_CONTEXT_REG_OFFSET && (reg) < R600_SET_CONTEXT_REG_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONTEXT_REG, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_CONTEXT_REG_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_ALU_CONST_OFFSET && (reg) < R600_SET_ALU_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_ALU_CONST_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_RESOURCE_OFFSET && (reg) < R600_SET_RESOURCE_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_RESOURCE_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_SAMPLER_OFFSET && (reg) < R600_SET_SAMPLER_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_SAMPLER, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_SAMPLER_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_CTL_CONST_OFFSET && (reg) < R600_SET_CTL_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_CTL_CONST_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_LOOP_CONST_OFFSET && (reg) < R600_SET_LOOP_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_LOOP_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_LOOP_CONST_OFFSET) >> 2);	\
+	} else if ((reg) >= R600_SET_BOOL_CONST_OFFSET && (reg) < R600_SET_BOOL_CONST_END) { \
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_BOOL_CONST, (num)));	\
+		R600_OUT_BATCH(((reg) - R600_SET_BOOL_CONST_OFFSET) >> 2);	\
+	} else {							\
+		R600_OUT_BATCH(CP_PACKET0((reg), (num))); \
+	}								\
+} while (0)
+
+/** Single register write to command buffer; requires 3 dwords for most things. */
+#define R600_OUT_BATCH_REGVAL(reg, val)		\
+	R600_OUT_BATCH_REGS((reg), 1);		\
+	R600_OUT_BATCH((val))
+
+/** Continuous register range write to command buffer; requires 1 dword,
+ * expects count dwords afterwards for register contents. */
+#define R600_OUT_BATCH_REGSEQ(reg, count)	\
+	R600_OUT_BATCH_REGS((reg), (count))
+
+extern void r600InitCmdBuf(context_t *r600);
+
+#endif				/* __R600_CMDBUF_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_context.c b/src/mesa/drivers/dri/r600/r600_context.c
new file mode 100644
index 00000000000..78bad8726b2
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_context.c
@@ -0,0 +1,413 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <[email protected]>
+ *
+ * \author Nicolai Haehnle <[email protected]>
+ */
+
+#include "main/glheader.h"
+#include "main/api_arrayelt.h"
+#include "main/context.h"
+#include "main/simple_list.h"
+#include "main/imports.h"
+#include "main/matrix.h"
+#include "main/extensions.h"
+#include "main/state.h"
+#include "main/bufferobj.h"
+#include "main/texobj.h"
+
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "tnl/t_vp_build.h"
+
+#include "drivers/common/driverfuncs.h"
+
+#include "r600_context.h"
+#include "radeon_context.h"
+#include "radeon_span.h"
+#include "r600_cmdbuf.h"
+#include "r600_emit.h"
+#include "radeon_bocs_wrapper.h"
+
+#include "r700_state.h"
+#include "r700_ioctl.h"
+
+
+#include "vblank.h"
+#include "utils.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+/* hw_tcl_on derives from future_hw_tcl_on when its safe to change it. */
+int future_hw_tcl_on = 1;
+int hw_tcl_on = 1;
+
+#define need_GL_VERSION_2_0
+#define need_GL_ARB_point_parameters
+#define need_GL_ARB_vertex_program
+#define need_GL_EXT_blend_equation_separate
+#define need_GL_EXT_blend_func_separate
+#define need_GL_EXT_blend_minmax
+#define need_GL_EXT_framebuffer_object
+#define need_GL_EXT_fog_coord
+#define need_GL_EXT_gpu_program_parameters
+#define need_GL_EXT_secondary_color
+#define need_GL_EXT_stencil_two_side
+#define need_GL_ATI_separate_stencil
+#define need_GL_NV_vertex_program
+
+#include "extension_helper.h"
+
+extern const struct tnl_pipeline_stage *r700_pipeline[];
+
+const struct dri_extension card_extensions[] = {
+  /* *INDENT-OFF* */
+  {"GL_ARB_depth_texture",		NULL},
+  {"GL_ARB_fragment_program",		NULL},
+  {"GL_ARB_multitexture",		NULL},
+  {"GL_ARB_point_parameters",		GL_ARB_point_parameters_functions},
+  {"GL_ARB_shadow",			NULL},
+  {"GL_ARB_shadow_ambient",		NULL},
+  {"GL_ARB_texture_border_clamp",	NULL},
+  {"GL_ARB_texture_cube_map",		NULL},
+  {"GL_ARB_texture_env_add",		NULL},
+  {"GL_ARB_texture_env_combine",	NULL},
+  {"GL_ARB_texture_env_crossbar",	NULL},
+  {"GL_ARB_texture_env_dot3",		NULL},
+  {"GL_ARB_texture_mirrored_repeat",	NULL},
+  {"GL_ARB_vertex_program",		GL_ARB_vertex_program_functions},
+  {"GL_EXT_blend_equation_separate",	GL_EXT_blend_equation_separate_functions},
+  {"GL_EXT_blend_func_separate",	GL_EXT_blend_func_separate_functions},
+  {"GL_EXT_blend_minmax",		GL_EXT_blend_minmax_functions},
+  {"GL_EXT_blend_subtract",		NULL},
+  {"GL_EXT_packed_depth_stencil",	NULL},
+  {"GL_EXT_fog_coord",			GL_EXT_fog_coord_functions },
+  {"GL_EXT_gpu_program_parameters",     GL_EXT_gpu_program_parameters_functions},
+  {"GL_EXT_secondary_color", 		GL_EXT_secondary_color_functions},
+  {"GL_EXT_shadow_funcs",		NULL},
+  {"GL_EXT_stencil_two_side",		GL_EXT_stencil_two_side_functions},
+  {"GL_EXT_stencil_wrap",		NULL},
+  {"GL_EXT_texture_edge_clamp",		NULL},
+  {"GL_EXT_texture_env_combine", 	NULL},
+  {"GL_EXT_texture_env_dot3", 		NULL},
+  {"GL_EXT_texture_filter_anisotropic",	NULL},
+  {"GL_EXT_texture_lod_bias",		NULL},
+  {"GL_EXT_texture_mirror_clamp",	NULL},
+  {"GL_EXT_texture_rectangle",		NULL},
+  {"GL_ATI_separate_stencil",		GL_ATI_separate_stencil_functions},
+  {"GL_ATI_texture_env_combine3",	NULL},
+  {"GL_ATI_texture_mirror_once",	NULL},
+  {"GL_MESA_pack_invert",		NULL},
+  {"GL_MESA_ycbcr_texture",		NULL},
+  {"GL_MESAX_texture_float",		NULL},
+  {"GL_NV_blend_square",		NULL},
+  {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
+  {"GL_SGIS_generate_mipmap",		NULL},
+  {NULL,				NULL}
+  /* *INDENT-ON* */
+};
+
+
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
+/**
+ * The GL 2.0 functions are needed to make display lists work with
+ * functions added by GL_ATI_separate_stencil.
+ */
+const struct dri_extension gl_20_extension[] = {
+  {"GL_VERSION_2_0",			GL_VERSION_2_0_functions },
+};
+
+
+static void r600RunPipeline(GLcontext * ctx)
+{
+    _mesa_lock_context_textures(ctx);
+
+    if (ctx->NewState)
+        _mesa_update_state_locked(ctx);
+    
+    _tnl_run_pipeline(ctx);
+    _mesa_unlock_context_textures(ctx);
+}
+
+static void r600_get_lock(radeonContextPtr rmesa)
+{
+	drm_radeon_sarea_t *sarea = rmesa->sarea;
+
+	if (sarea->ctx_owner != rmesa->dri.hwContext) {
+		sarea->ctx_owner = rmesa->dri.hwContext;
+		if (!rmesa->radeonScreen->kernel_mm)
+			radeon_bo_legacy_texture_age(rmesa->radeonScreen->bom);
+	}
+}		  
+
+static void r600_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+    /* please flush pipe do all pending work */
+    /* to be enabled */
+}
+
+static void r600_vtbl_pre_emit_atoms(radeonContextPtr radeon)
+{
+	/* to be enabled */
+}
+
+static void r600_fallback(GLcontext *ctx, GLuint bit, GLboolean mode)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	if (mode)
+		context->radeon.Fallback |= bit;
+	else
+		context->radeon.Fallback &= ~bit;
+}
+
+static void r600_init_vtbl(radeonContextPtr radeon)
+{
+	radeon->vtbl.get_lock = r600_get_lock;
+	radeon->vtbl.update_viewport_offset = r700UpdateViewportOffset;
+	radeon->vtbl.emit_cs_header = r600_vtbl_emit_cs_header;
+	radeon->vtbl.swtcl_flush = NULL;
+	radeon->vtbl.pre_emit_atoms = r600_vtbl_pre_emit_atoms;
+	radeon->vtbl.fallback = r600_fallback;
+}
+
+/* Create the device specific rendering context.
+ */
+GLboolean r600CreateContext(const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	struct dd_function_table functions;
+	context_t *r600;
+	GLcontext *ctx;
+	int tcl_mode;
+
+	assert(glVisual);
+	assert(driContextPriv);
+	assert(screen);
+
+	/* Allocate the R600 context */
+	r600 = (context_t*) CALLOC(sizeof(*r600));
+	if (!r600)
+		return GL_FALSE;
+
+	if (!(screen->chip_flags & RADEON_CHIPSET_TCL))
+		hw_tcl_on = future_hw_tcl_on = 0;
+
+	r600_init_vtbl(&r600->radeon);
+	/* Parse configuration files.
+	 * Do this here so that initialMaxAnisotropy is set before we create
+	 * the default textures.
+	 */
+	driParseConfigFiles(&r600->radeon.optionCache, &screen->optionCache,
+			    screen->driScreen->myNum, "r600");
+
+	r600->radeon.initialMaxAnisotropy = driQueryOptionf(&r600->radeon.optionCache,
+						     "def_max_anisotropy");
+
+	/* Init default driver functions then plug in our R600-specific functions
+	 * (the texture functions are especially important)
+	 */
+	_mesa_init_driver_functions(&functions);
+
+	r700InitChipObject(r600);  /* let the eag... */
+
+	r700InitStateFuncs(&functions);
+	r600InitTextureFuncs(&functions);
+	r700InitShaderFuncs(&functions);
+	r700InitIoctlFuncs(&functions);
+
+	if (!radeonInitContext(&r600->radeon, &functions,
+			       glVisual, driContextPriv,
+			       sharedContextPrivate)) {
+		FREE(r600);
+		return GL_FALSE;
+	}
+
+	/* Init r600 context data */
+	/* Set the maximum texture size small enough that we can guarentee that
+	 * all texture units can bind a maximal texture and have them both in
+	 * texturable memory at once.
+	 */
+
+	ctx = r600->radeon.glCtx;
+
+	ctx->Const.MaxTextureImageUnits =
+	    driQueryOptioni(&r600->radeon.optionCache, "texture_image_units");
+	ctx->Const.MaxTextureCoordUnits =
+	    driQueryOptioni(&r600->radeon.optionCache, "texture_coord_units");
+	ctx->Const.MaxTextureUnits =
+	    MIN2(ctx->Const.MaxTextureImageUnits,
+		 ctx->Const.MaxTextureCoordUnits);
+	ctx->Const.MaxTextureMaxAnisotropy = 16.0;
+	ctx->Const.MaxTextureLodBias = 16.0;
+
+	ctx->Const.MaxTextureLevels = 13;
+	ctx->Const.MaxTextureRectSize = 4096;
+
+	ctx->Const.MinPointSize   = 0x0001 / 8.0;
+	ctx->Const.MinPointSizeAA = 0x0001 / 8.0;
+	ctx->Const.MaxPointSize   = 0xffff / 8.0;
+	ctx->Const.MaxPointSizeAA = 0xffff / 8.0;
+
+	ctx->Const.MinLineWidth   = 0x0001 / 8.0;
+	ctx->Const.MinLineWidthAA = 0x0001 / 8.0;
+	ctx->Const.MaxLineWidth   = 0xffff / 8.0;
+	ctx->Const.MaxLineWidthAA = 0xffff / 8.0;
+
+	/* Needs further modifications */
+#if 0
+	ctx->Const.MaxArrayLockSize =
+	    ( /*512 */ RADEON_BUFFER_SIZE * 16 * 1024) / (4 * 4);
+#endif
+
+	ctx->Const.MaxDrawBuffers = 1;
+
+	/* Initialize the software rasterizer and helper modules.
+	 */
+	_swrast_CreateContext(ctx);
+	_vbo_CreateContext(ctx);
+	_tnl_CreateContext(ctx);
+	_swsetup_CreateContext(ctx);
+	_swsetup_Wakeup(ctx);
+	_ae_create_context(ctx);
+
+	/* Install the customized pipeline:
+	 */
+	_tnl_destroy_pipeline(ctx);
+	_tnl_install_pipeline(ctx, r700_pipeline);
+
+	/* Try and keep materials and vertices separate:
+	 */
+/* 	_tnl_isolate_materials(ctx, GL_TRUE); */
+
+	/* Configure swrast and TNL to match hardware characteristics:
+	 */
+	_swrast_allow_pixel_fog(ctx, GL_FALSE);
+	_swrast_allow_vertex_fog(ctx, GL_TRUE);
+	_tnl_allow_pixel_fog(ctx, GL_FALSE);
+	_tnl_allow_vertex_fog(ctx, GL_TRUE);
+
+	/* currently bogus data */
+	ctx->Const.VertexProgram.MaxInstructions = VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeInstructions =
+		VSF_MAX_FRAGMENT_LENGTH / 4;
+	ctx->Const.VertexProgram.MaxNativeAttribs = 16;	/* r420 */
+	ctx->Const.VertexProgram.MaxTemps = 32;
+	ctx->Const.VertexProgram.MaxNativeTemps =
+		/*VSF_MAX_FRAGMENT_TEMPS */ 32;
+	ctx->Const.VertexProgram.MaxNativeParameters = 256;	/* r420 */
+	ctx->Const.VertexProgram.MaxNativeAddressRegs = 1;
+
+	ctx->Const.FragmentProgram.MaxNativeTemps = PFS_NUM_TEMP_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAttribs = 11;	/* copy i915... */
+	ctx->Const.FragmentProgram.MaxNativeParameters = PFS_NUM_CONST_REGS;
+	ctx->Const.FragmentProgram.MaxNativeAluInstructions = PFS_MAX_ALU_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexInstructions = PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeInstructions =
+	    PFS_MAX_ALU_INST + PFS_MAX_TEX_INST;
+	ctx->Const.FragmentProgram.MaxNativeTexIndirections =
+	    PFS_MAX_TEX_INDIRECT;
+	ctx->Const.FragmentProgram.MaxNativeAddressRegs = 0;	/* and these are?? */
+	ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
+	ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
+
+	driInitExtensions(ctx, card_extensions, GL_TRUE);
+	if (r600->radeon.radeonScreen->kernel_mm)
+	  driInitExtensions(ctx, mm_extensions, GL_FALSE);
+
+	if (driQueryOptionb
+	    (&r600->radeon.optionCache, "disable_stencil_two_side"))
+		_mesa_disable_extension(ctx, "GL_EXT_stencil_two_side");
+
+	if (r600->radeon.glCtx->Mesa_DXTn
+	    && !driQueryOptionb(&r600->radeon.optionCache, "disable_s3tc")) {
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+		_mesa_enable_extension(ctx, "GL_S3_s3tc");
+	} else
+	    if (driQueryOptionb(&r600->radeon.optionCache, "force_s3tc_enable"))
+	{
+		_mesa_enable_extension(ctx, "GL_EXT_texture_compression_s3tc");
+	}
+
+	radeon_fbo_init(&r600->radeon);
+   	radeonInitSpanFuncs( ctx );
+
+	r600InitCmdBuf(r600);
+
+	r700InitState(r600->radeon.glCtx);
+
+	TNL_CONTEXT(ctx)->Driver.RunPipeline = r600RunPipeline;
+
+	tcl_mode = driQueryOptioni(&r600->radeon.optionCache, "tcl_mode");
+	if (driQueryOptionb(&r600->radeon.optionCache, "no_rast")) {
+		fprintf(stderr, "disabling 3D acceleration\n");
+#if R200_MERGED
+		FALLBACK(&r600->radeon, RADEON_FALLBACK_DISABLE, 1);
+#endif
+	}
+	if (tcl_mode == DRI_CONF_TCL_SW ||
+	    !(r600->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+		if (r600->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+			r600->radeon.radeonScreen->chip_flags &=
+			    ~RADEON_CHIPSET_TCL;
+			fprintf(stderr, "Disabling HW TCL support\n");
+		}
+		TCL_FALLBACK(r600->radeon.glCtx,
+			     RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+	}
+
+	return GL_TRUE;
+}
+
+/* Clean our own things only, radeonDestroyContext will do every thing else. */
+void
+r600DestroyContext (__DRIcontextPrivate * driContextPriv)
+{
+    GET_CURRENT_CONTEXT (ctx);
+    context_t *context = ctx ? R700_CONTEXT(ctx) : NULL;
+
+    if (context)
+	    FREE(context->hw.pStateList);
+}
+
+
+
diff --git a/src/mesa/drivers/dri/r600/r600_context.h b/src/mesa/drivers/dri/r600/r600_context.h
new file mode 100644
index 00000000000..bcb33e1386f
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_context.h
@@ -0,0 +1,185 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <[email protected]>
+ * \author Nicolai Haehnle <[email protected]>
+ */
+
+#ifndef __R600_CONTEXT_H__
+#define __R600_CONTEXT_H__
+
+#include "tnl/t_vertex.h"
+#include "drm.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "texmem.h"
+#include "radeon_common.h"
+
+#include "main/macros.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+
+#include "r700_chip.h"
+#include "r600_tex.h"
+#include "r700_oglprog.h"
+
+struct r600_context;
+typedef struct r600_context context_t;
+
+GLboolean r700SendPSState(context_t *context);
+GLboolean r700SendVSState(context_t *context);
+GLboolean r700SendSQConfig(context_t *context);
+
+#include "main/mm.h"
+
+/* From http://gcc. gnu.org/onlinedocs/gcc-3.2.3/gcc/Variadic-Macros.html .
+   I suppose we could inline this and use macro to fetch out __LINE__ and stuff in case we run into trouble
+   with other compilers ... GLUE!
+*/
+#define WARN_ONCE(a, ...)	{ \
+	static int warn##__LINE__=1; \
+	if(warn##__LINE__){ \
+		fprintf(stderr, "*********************************WARN_ONCE*********************************\n"); \
+		fprintf(stderr, "File %s function %s line %d\n", \
+			__FILE__, __FUNCTION__, __LINE__); \
+		fprintf(stderr,  a, ## __VA_ARGS__);\
+		fprintf(stderr, "***************************************************************************\n"); \
+		warn##__LINE__=0;\
+		} \
+	}
+
+/************ DMA BUFFERS **************/
+
+/* The blit width for texture uploads
+ */
+#define R600_BLIT_WIDTH_BYTES 1024
+#define R600_MAX_TEXTURE_UNITS 8
+
+struct r600_texture_state {
+	int tc_count;		/* number of incoming texture coordinates from VAP */
+};
+
+/* Perhaps more if we store programs in vmem? */
+/* drm_r600_cmd_header_t->vpu->count is unsigned char */
+#define VSF_MAX_FRAGMENT_LENGTH (255*4)
+
+/* Can be tested with colormat currently. */
+#define VSF_MAX_FRAGMENT_TEMPS (14)
+
+#define STATE_R600_WINDOW_DIMENSION (STATE_INTERNAL_DRIVER+0)
+#define STATE_R600_TEXRECT_FACTOR (STATE_INTERNAL_DRIVER+1)
+
+extern int hw_tcl_on;
+
+#define COLOR_IS_RGBA
+#define TAG(x) r600##x
+#include "tnl_dd/t_dd_vertex.h"
+#undef TAG
+
+#define PFS_MAX_ALU_INST	64
+#define PFS_MAX_TEX_INST	64
+#define PFS_MAX_TEX_INDIRECT 4
+#define PFS_NUM_TEMP_REGS	32
+#define PFS_NUM_CONST_REGS	16
+
+#define R600_MAX_AOS_ARRAYS		16
+
+#define REG_COORDS	0
+#define REG_COLOR0	1
+#define REG_TEX0	2
+
+#define R600_FALLBACK_NONE 0
+#define R600_FALLBACK_TCL 1
+#define R600_FALLBACK_RAST 2
+
+enum 
+{
+    NO_SHIFT    = 0,
+    LEFT_SHIFT  = 1,
+    RIGHT_SHIFT = 2,
+};
+
+typedef struct offset_modifiers
+{
+    GLuint shift;
+    GLuint shiftbits;
+    GLuint mask;
+} offset_modifiers;
+
+/**
+ * \brief R600 context structure.
+ */
+struct r600_context {
+	struct radeon_context radeon;	/* parent class, must be first */
+
+	/* ------ */
+	R700_CHIP_CONTEXT hw;
+
+	/* Vertex buffers
+	 */
+	GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+	GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+
+};
+
+#define R700_CONTEXT(ctx)		((context_t *)(ctx->DriverCtx))
+#define GL_CONTEXT(context)     ((GLcontext *)(context->radeon.glCtx))
+
+extern void r600DestroyContext(__DRIcontextPrivate * driContextPriv);
+extern GLboolean r600CreateContext(const __GLcontextModes * glVisual,
+				   __DRIcontextPrivate * driContextPriv,
+				   void *sharedContextPrivate);
+
+#define R700_CONTEXT_STATES(context) ((R700_CHIP_CONTEXT *)(&context->hw))
+
+extern GLboolean r700InitChipObject(context_t *context);
+extern GLboolean r700SendContextStates(context_t *context);
+extern GLboolean r700SendViewportState(context_t *context, int id);
+extern GLboolean r700SendRenderTargetState(context_t *context, int id);
+
+extern int       r700SetupStreams(GLcontext * ctx);
+extern void      r700SetupVTXConstants(GLcontext  * ctx, 
+				       unsigned int nStreamID,
+				       void *       pAos,
+				       unsigned int size,      /* number of elements in vector */
+				       unsigned int stride,
+				       unsigned int Count);    /* number of vectors in stream */
+
+#define RADEON_D_CAPTURE 0
+#define RADEON_D_PLAYBACK 1
+#define RADEON_D_PLAYBACK_RAW 2
+#define RADEON_D_T 3
+
+#define r600PackFloat32 radeonPackFloat32
+#define r600PackFloat24 radeonPackFloat24
+
+#endif				/* __R600_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_emit.c b/src/mesa/drivers/dri/r600/r600_emit.c
new file mode 100644
index 00000000000..685f7fe4736
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_emit.c
@@ -0,0 +1,122 @@
+/**************************************************************************
+
+Copyright 2008, 2009 Advanced Micro Devices Inc. (AMD)
+
+Copyright (C) Advanced Micro Devices Inc. (AMD)  2009.  All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/image.h"
+
+#include "swrast_setup/swrast_setup.h"
+#include "math/m_translate.h"
+#include "tnl/tnl.h"
+#include "tnl/t_context.h"
+
+#include "r600_context.h"
+#include "r600_emit.h"
+
+void r600EmitCacheFlush(context_t *rmesa)
+{
+	BATCH_LOCALS(&rmesa->radeon);
+}
+
+GLboolean r600EmitShader(GLcontext * ctx, 
+                         void ** shaderbo,
+			             GLvoid * data, 
+                         int sizeinDWORD,
+                         char * szShaderUsage) 
+{
+    radeonContextPtr radeonctx = RADEON_CONTEXT(ctx);
+
+    struct radeon_bo * pbo;
+    uint32_t *out;
+
+shader_again_alloc:	
+#ifdef RADEON_DEBUG_BO
+	pbo = radeon_bo_open(radeonctx->radeonScreen->bom,
+					     0, 
+                         sizeinDWORD * 4, 
+                         256, 
+                         RADEON_GEM_DOMAIN_GTT,
+					     0,
+                         szShaderUsage);
+#else
+    pbo = radeon_bo_open(radeonctx->radeonScreen->bom,
+					     0, 
+                         sizeinDWORD * 4, 
+                         256, 
+                         RADEON_GEM_DOMAIN_GTT,
+					     0);
+#endif /* RADEON_DEBUG_BO */
+
+	if (!pbo) 
+    {
+		rcommonFlushCmdBuf(radeonctx, __FUNCTION__);
+		goto shader_again_alloc;
+	}
+
+        if (radeon_cs_space_check_with_bo(radeonctx->cmdbuf.cs,
+                                          pbo,
+                                          RADEON_GEM_DOMAIN_GTT, 0))
+                fprintf(stderr,"failure to revalidate BOs - badness\n");
+
+
+	radeon_bo_map(pbo, 1);
+
+    radeon_bo_ref(pbo);
+
+    out = (uint32_t*)(pbo->ptr);
+
+    memcpy(out, data, sizeinDWORD * 4);
+
+    radeon_bo_unmap(pbo); 
+
+    *shaderbo = (void*)pbo;
+
+    return GL_TRUE;
+}
+
+GLboolean r600DeleteShader(GLcontext * ctx, 
+                           void * shaderbo) 
+{
+    struct radeon_bo * pbo = (struct radeon_bo *)shaderbo;
+
+    if (pbo) {
+	    radeon_bo_unmap(pbo);
+	    radeon_bo_unref(pbo); /* when bo->cref <= 0, bo will be bo_free */
+    }
+
+    return GL_TRUE;
+}
diff --git a/src/mesa/drivers/dri/r200/r200_span.h b/src/mesa/drivers/dri/r600/r600_emit.h
index bae56443092..661774d11ea 100644
--- a/src/mesa/drivers/dri/r200/r200_span.h
+++ b/src/mesa/drivers/dri/r600/r600_emit.h
@@ -1,9 +1,8 @@
-/*
-Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+/**************************************************************************
+
+Copyright 2008, 2009 Advanced Micro Devices Inc. (AMD)
 
-The Weather Channel (TM) funded Tungsten Graphics to develop the
-initial release of the Radeon 8500 driver under the XFree86 license.
-This notice must be preserved.
+Copyright (C) Advanced Micro Devices Inc. (AMD)  2009.  All Rights Reserved.
 
 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
@@ -29,17 +28,28 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 /*
  * Authors:
- *   Keith Whitwell <[email protected]>
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
  */
 
-#ifndef __R200_SPAN_H__
-#define __R200_SPAN_H__
 
-#include "drirenderbuffer.h"
+#ifndef __R600_EMIT_H__
+#define __R600_EMIT_H__
+
+#include "main/glheader.h"
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+#include "radeon_reg.h"
+
+void r600EmitCacheFlush(context_t *rmesa);
 
-extern void r200InitSpanFuncs( GLcontext *ctx );
+extern GLboolean r600EmitShader(GLcontext * ctx, 
+                                void ** shaderbo,
+			                    GLvoid * data, 
+                                int sizeinDWORD,
+                                char * szShaderUsage); 
 
-extern void
-radeonSetSpanFunctions(driRenderbuffer *rb, const GLvisual *vis);
+extern GLboolean r600DeleteShader(GLcontext * ctx, 
+                                 void * shaderbo);
 
 #endif
diff --git a/src/mesa/drivers/dri/r600/r600_reg.h b/src/mesa/drivers/dri/r600/r600_reg.h
new file mode 100644
index 00000000000..ffe5ee4f74c
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg.h
@@ -0,0 +1,121 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_H_
+#define _R600_REG_H_
+
+/*
+ * Register definitions
+ */
+
+#include "r600_reg_auto_r6xx.h"
+#include "r600_reg_r6xx.h"
+#include "r600_reg_r7xx.h"
+
+
+/* SET_*_REG offsets + ends */
+enum 
+{
+    SET_CONFIG_REG_offset                = 0x00008000,
+    SET_CONFIG_REG_end                   = 0x0000ac00,
+    SET_CONTEXT_REG_offset               = 0x00028000,
+    SET_CONTEXT_REG_end                  = 0x00029000,
+    SET_ALU_CONST_offset                 = 0x00030000,
+    SET_ALU_CONST_end                    = 0x00032000,
+    SET_RESOURCE_offset                  = 0x00038000,
+    SET_RESOURCE_end                     = 0x0003c000,
+    SET_SAMPLER_offset                   = 0x0003c000,
+    SET_SAMPLER_end                      = 0x0003cff0,
+    SET_CTL_CONST_offset                 = 0x0003cff0,
+    SET_CTL_CONST_end                    = 0x0003e200,
+    SET_LOOP_CONST_offset                = 0x0003e200,
+    SET_LOOP_CONST_end                   = 0x0003e380,
+    SET_BOOL_CONST_offset                = 0x0003e380,
+    SET_BOOL_CONST_end                   = 0x00040000,
+};
+
+/* packet3 IT_SURFACE_BASE_UPDATE bits */
+enum 
+{
+    DEPTH_BASE                           = (1 << 0),
+    COLOR0_BASE                          = (1 << 1),
+    COLOR1_BASE                          = (1 << 2),
+    COLOR2_BASE                          = (1 << 3),
+    COLOR3_BASE                          = (1 << 4),
+    COLOR4_BASE                          = (1 << 5),
+    COLOR5_BASE                          = (1 << 6),
+    COLOR6_BASE                          = (1 << 7),
+    COLOR7_BASE                          = (1 << 8),
+    STRMOUT_BASE0                        = (1 << 9),
+    STRMOUT_BASE1                        = (1 << 10),
+    STRMOUT_BASE2                        = (1 << 11),
+    STRMOUT_BASE3                        = (1 << 12),
+    COHER_BASE0                          = (1 << 13),
+    COHER_BASE1                          = (1 << 14),
+};
+
+/* Packet3 commands */
+enum 
+{
+    IT_NOP                               = 0x10,
+    IT_INDIRECT_BUFFER_END               = 0x17,
+    IT_SET_PREDICATION                   = 0x20,
+    IT_REG_RMW                           = 0x21,
+    IT_COND_EXEC                         = 0x22,
+    IT_PRED_EXEC                         = 0x23,
+    IT_START_3D_CMDBUF                   = 0x24,
+    IT_DRAW_INDEX_2                      = 0x27,
+    IT_CONTEXT_CONTROL                   = 0x28,
+    IT_DRAW_INDEX_IMMD_BE                = 0x29,
+    IT_INDEX_TYPE                        = 0x2A,
+    IT_DRAW_INDEX                        = 0x2B,
+    IT_DRAW_INDEX_AUTO                   = 0x2D,
+    IT_DRAW_INDEX_IMMD                   = 0x2E,
+    IT_NUM_INSTANCES                     = 0x2F,
+    IT_STRMOUT_BUFFER_UPDATE             = 0x34,
+    IT_INDIRECT_BUFFER_MP                = 0x38,
+    IT_MEM_SEMAPHORE                     = 0x39,
+    IT_MPEG_INDEX                        = 0x3A,
+    IT_WAIT_REG_MEM                      = 0x3C,
+    IT_MEM_WRITE                         = 0x3D,
+    IT_INDIRECT_BUFFER                   = 0x32,
+    IT_CP_INTERRUPT                      = 0x40,
+    IT_SURFACE_SYNC                      = 0x43,
+    IT_ME_INITIALIZE                     = 0x44,
+    IT_COND_WRITE                        = 0x45,
+    IT_EVENT_WRITE                       = 0x46,
+    IT_EVENT_WRITE_EOP                   = 0x47,
+    IT_ONE_REG_WRITE                     = 0x57,
+    IT_SET_CONFIG_REG                    = 0x68,
+    IT_SET_CONTEXT_REG                   = 0x69,
+    IT_SET_ALU_CONST                     = 0x6A,
+    IT_SET_BOOL_CONST                    = 0x6B,
+    IT_SET_LOOP_CONST                    = 0x6C,
+    IT_SET_RESOURCE                      = 0x6D,
+    IT_SET_SAMPLER                       = 0x6E,
+    IT_SET_CTL_CONST                     = 0x6F,
+    IT_SURFACE_BASE_UPDATE               = 0x73,
+};
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r600_reg_auto_r6xx.h b/src/mesa/drivers/dri/r600/r600_reg_auto_r6xx.h
new file mode 100644
index 00000000000..9d5aa3c7e49
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg_auto_r6xx.h
@@ -0,0 +1,3087 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _AUTOREGS
+#define _AUTOREGS
+
+enum {
+
+    VGT_VTX_VECT_EJECT_REG                                = 0x000088b0,
+	PRIM_COUNT_mask                                   = 0x3ff << 0,
+	PRIM_COUNT_shift                                  = 0,
+    VGT_LAST_COPY_STATE                                   = 0x000088c0,
+	SRC_STATE_ID_mask                                 = 0x07 << 0,
+	SRC_STATE_ID_shift                                = 0,
+	DST_STATE_ID_mask                                 = 0x07 << 16,
+	DST_STATE_ID_shift                                = 16,
+    VGT_CACHE_INVALIDATION                                = 0x000088c4,
+	CACHE_INVALIDATION_mask                           = 0x03 << 0,
+	CACHE_INVALIDATION_shift                          = 0,
+	    VC_ONLY                                       = 0x00,
+	    TC_ONLY                                       = 0x01,
+	    VC_AND_TC                                     = 0x02,
+	VS_NO_EXTRA_BUFFER_bit                            = 1 << 5,
+    VGT_GS_PER_ES                                         = 0x000088c8,
+    VGT_ES_PER_GS                                         = 0x000088cc,
+    VGT_GS_VERTEX_REUSE                                   = 0x000088d4,
+	VERT_REUSE_mask                                   = 0x1f << 0,
+	VERT_REUSE_shift                                  = 0,
+    VGT_MC_LAT_CNTL                                       = 0x000088d8,
+	MC_TIME_STAMP_RES_mask                            = 0x03 << 0,
+	MC_TIME_STAMP_RES_shift                           = 0,
+	    X_0_992_MAX_LATENCY                           = 0x00,
+	    X_0_496_MAX_LATENCY                           = 0x01,
+	    X_0_248_MAX_LATENCY                           = 0x02,
+	    X_0_124_MAX_LATENCY                           = 0x03,
+    VGT_GS_PER_VS                                         = 0x000088e8,
+	GS_PER_VS_mask                                    = 0x0f << 0,
+	GS_PER_VS_shift                                   = 0,
+    VGT_CNTL_STATUS                                       = 0x000088f0,
+	VGT_OUT_INDX_BUSY_bit                             = 1 << 0,
+	VGT_OUT_BUSY_bit                                  = 1 << 1,
+	VGT_PT_BUSY_bit                                   = 1 << 2,
+	VGT_TE_BUSY_bit                                   = 1 << 3,
+	VGT_VR_BUSY_bit                                   = 1 << 4,
+	VGT_GRP_BUSY_bit                                  = 1 << 5,
+	VGT_DMA_REQ_BUSY_bit                              = 1 << 6,
+	VGT_DMA_BUSY_bit                                  = 1 << 7,
+	VGT_GS_BUSY_bit                                   = 1 << 8,
+	VGT_BUSY_bit                                      = 1 << 9,
+    VGT_PRIMITIVE_TYPE                                    = 0x00008958,
+	VGT_PRIMITIVE_TYPE__PRIM_TYPE_mask                = 0x3f << 0,
+	VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift               = 0,
+	    DI_PT_NONE                                    = 0x00,
+	    DI_PT_POINTLIST                               = 0x01,
+	    DI_PT_LINELIST                                = 0x02,
+	    DI_PT_LINESTRIP                               = 0x03,
+	    DI_PT_TRILIST                                 = 0x04,
+	    DI_PT_TRIFAN                                  = 0x05,
+	    DI_PT_TRISTRIP                                = 0x06,
+	    DI_PT_UNUSED_0                                = 0x07,
+	    DI_PT_UNUSED_1                                = 0x08,
+	    DI_PT_UNUSED_2                                = 0x09,
+	    DI_PT_LINELIST_ADJ                            = 0x0a,
+	    DI_PT_LINESTRIP_ADJ                           = 0x0b,
+	    DI_PT_TRILIST_ADJ                             = 0x0c,
+	    DI_PT_TRISTRIP_ADJ                            = 0x0d,
+	    DI_PT_UNUSED_3                                = 0x0e,
+	    DI_PT_UNUSED_4                                = 0x0f,
+	    DI_PT_TRI_WITH_WFLAGS                         = 0x10,
+	    DI_PT_RECTLIST                                = 0x11,
+	    DI_PT_LINELOOP                                = 0x12,
+	    DI_PT_QUADLIST                                = 0x13,
+	    DI_PT_QUADSTRIP                               = 0x14,
+	    DI_PT_POLYGON                                 = 0x15,
+	    DI_PT_2D_COPY_RECT_LIST_V0                    = 0x16,
+	    DI_PT_2D_COPY_RECT_LIST_V1                    = 0x17,
+	    DI_PT_2D_COPY_RECT_LIST_V2                    = 0x18,
+	    DI_PT_2D_COPY_RECT_LIST_V3                    = 0x19,
+	    DI_PT_2D_FILL_RECT_LIST                       = 0x1a,
+	    DI_PT_2D_LINE_STRIP                           = 0x1b,
+	    DI_PT_2D_TRI_STRIP                            = 0x1c,
+    VGT_INDEX_TYPE                                        = 0x0000895c,
+	INDEX_TYPE_mask                                   = 0x03 << 0,
+	INDEX_TYPE_shift                                  = 0,
+	    DI_INDEX_SIZE_16_BIT                          = 0x00,
+	    DI_INDEX_SIZE_32_BIT                          = 0x01,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_0                      = 0x00008960,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_1                      = 0x00008964,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_2                      = 0x00008968,
+    VGT_STRMOUT_BUFFER_FILLED_SIZE_3                      = 0x0000896c,
+    VGT_NUM_INDICES                                       = 0x00008970,
+    VGT_NUM_INSTANCES                                     = 0x00008974,
+    PA_CL_CNTL_STATUS                                     = 0x00008a10,
+	CL_BUSY_bit                                       = 1 << 31,
+    PA_CL_ENHANCE                                         = 0x00008a14,
+	CLIP_VTX_REORDER_ENA_bit                          = 1 << 0,
+	NUM_CLIP_SEQ_mask                                 = 0x03 << 1,
+	NUM_CLIP_SEQ_shift                                = 1,
+	CLIPPED_PRIM_SEQ_STALL_bit                        = 1 << 3,
+	VE_NAN_PROC_DISABLE_bit                           = 1 << 4,
+    PA_SU_CNTL_STATUS                                     = 0x00008a50,
+	SU_BUSY_bit                                       = 1 << 31,
+    PA_SC_LINE_STIPPLE_STATE                              = 0x00008b10,
+	CURRENT_PTR_mask                                  = 0x0f << 0,
+	CURRENT_PTR_shift                                 = 0,
+	CURRENT_COUNT_mask                                = 0xff << 8,
+	CURRENT_COUNT_shift                               = 8,
+    PA_SC_MULTI_CHIP_CNTL                                 = 0x00008b20,
+	LOG2_NUM_CHIPS_mask                               = 0x07 << 0,
+	LOG2_NUM_CHIPS_shift                              = 0,
+	MULTI_CHIP_TILE_SIZE_mask                         = 0x03 << 3,
+	MULTI_CHIP_TILE_SIZE_shift                        = 3,
+	    X_16_X_16_PIXEL_TILE_PER_CHIP                 = 0x00,
+	    X_32_X_32_PIXEL_TILE_PER_CHIP                 = 0x01,
+	    X_64_X_64_PIXEL_TILE_PER_CHIP                 = 0x02,
+	    X_128X128_PIXEL_TILE_PER_CHIP                 = 0x03,
+	CHIP_TILE_X_LOC_mask                              = 0x07 << 5,
+	CHIP_TILE_X_LOC_shift                             = 5,
+	CHIP_TILE_Y_LOC_mask                              = 0x07 << 8,
+	CHIP_TILE_Y_LOC_shift                             = 8,
+	CHIP_SUPER_TILE_B_bit                             = 1 << 11,
+    PA_SC_AA_SAMPLE_LOCS_2S                               = 0x00008b40,
+	S0_X_mask                                         = 0x0f << 0,
+	S0_X_shift                                        = 0,
+	S0_Y_mask                                         = 0x0f << 4,
+	S0_Y_shift                                        = 4,
+	S1_X_mask                                         = 0x0f << 8,
+	S1_X_shift                                        = 8,
+	S1_Y_mask                                         = 0x0f << 12,
+	S1_Y_shift                                        = 12,
+    PA_SC_AA_SAMPLE_LOCS_4S                               = 0x00008b44,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+	S2_X_mask                                         = 0x0f << 16,
+	S2_X_shift                                        = 16,
+	S2_Y_mask                                         = 0x0f << 20,
+	S2_Y_shift                                        = 20,
+	S3_X_mask                                         = 0x0f << 24,
+	S3_X_shift                                        = 24,
+	S3_Y_mask                                         = 0x0f << 28,
+	S3_Y_shift                                        = 28,
+    PA_SC_AA_SAMPLE_LOCS_8S_WD0                           = 0x00008b48,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+/* 	S2_X_mask                                         = 0x0f << 16, */
+/* 	S2_X_shift                                        = 16, */
+/* 	S2_Y_mask                                         = 0x0f << 20, */
+/* 	S2_Y_shift                                        = 20, */
+/* 	S3_X_mask                                         = 0x0f << 24, */
+/* 	S3_X_shift                                        = 24, */
+/* 	S3_Y_mask                                         = 0x0f << 28, */
+/* 	S3_Y_shift                                        = 28, */
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1                           = 0x00008b4c,
+	S4_X_mask                                         = 0x0f << 0,
+	S4_X_shift                                        = 0,
+	S4_Y_mask                                         = 0x0f << 4,
+	S4_Y_shift                                        = 4,
+	S5_X_mask                                         = 0x0f << 8,
+	S5_X_shift                                        = 8,
+	S5_Y_mask                                         = 0x0f << 12,
+	S5_Y_shift                                        = 12,
+	S6_X_mask                                         = 0x0f << 16,
+	S6_X_shift                                        = 16,
+	S6_Y_mask                                         = 0x0f << 20,
+	S6_Y_shift                                        = 20,
+	S7_X_mask                                         = 0x0f << 24,
+	S7_X_shift                                        = 24,
+	S7_Y_mask                                         = 0x0f << 28,
+	S7_Y_shift                                        = 28,
+    PA_SC_CNTL_STATUS                                     = 0x00008be0,
+	MPASS_OVERFLOW_bit                                = 1 << 30,
+    PA_SC_ENHANCE                                         = 0x00008bf0,
+	FORCE_EOV_MAX_CLK_CNT_mask                        = 0xfff << 0,
+	FORCE_EOV_MAX_CLK_CNT_shift                       = 0,
+	FORCE_EOV_MAX_TILE_CNT_mask                       = 0xfff << 12,
+	FORCE_EOV_MAX_TILE_CNT_shift                      = 12,
+    SQ_CONFIG                                             = 0x00008c00,
+	VC_ENABLE_bit                                     = 1 << 0,
+	EXPORT_SRC_C_bit                                  = 1 << 1,
+	DX9_CONSTS_bit                                    = 1 << 2,
+	ALU_INST_PREFER_VECTOR_bit                        = 1 << 3,
+	SQ_CONFIG__DX10_CLAMP_bit                         = 1 << 4,
+	ALU_PREFER_ONE_WATERFALL_bit                      = 1 << 5,
+	ALU_MAX_ONE_WATERFALL_bit                         = 1 << 6,
+	CLAUSE_SEQ_PRIO_mask                              = 0x03 << 8,
+	CLAUSE_SEQ_PRIO_shift                             = 8,
+	    SQ_CL_PRIO_RND_ROBIN                          = 0x00,
+	    SQ_CL_PRIO_MACRO_SEQ                          = 0x01,
+	    SQ_CL_PRIO_NONE                               = 0x02,
+	PS_PRIO_mask                                      = 0x03 << 24,
+	PS_PRIO_shift                                     = 24,
+	VS_PRIO_mask                                      = 0x03 << 26,
+	VS_PRIO_shift                                     = 26,
+	GS_PRIO_mask                                      = 0x03 << 28,
+	GS_PRIO_shift                                     = 28,
+	ES_PRIO_mask                                      = 0x03 << 30,
+	ES_PRIO_shift                                     = 30,
+    SQ_GPR_RESOURCE_MGMT_1                                = 0x00008c04,
+	NUM_PS_GPRS_mask                                  = 0xff << 0,
+	NUM_PS_GPRS_shift                                 = 0,
+	NUM_VS_GPRS_mask                                  = 0xff << 16,
+	NUM_VS_GPRS_shift                                 = 16,
+	NUM_CLAUSE_TEMP_GPRS_mask                         = 0x0f << 28,
+	NUM_CLAUSE_TEMP_GPRS_shift                        = 28,
+    SQ_GPR_RESOURCE_MGMT_2                                = 0x00008c08,
+	NUM_GS_GPRS_mask                                  = 0xff << 0,
+	NUM_GS_GPRS_shift                                 = 0,
+	NUM_ES_GPRS_mask                                  = 0xff << 16,
+	NUM_ES_GPRS_shift                                 = 16,
+    SQ_THREAD_RESOURCE_MGMT                               = 0x00008c0c,
+	NUM_PS_THREADS_mask                               = 0xff << 0,
+	NUM_PS_THREADS_shift                              = 0,
+	NUM_VS_THREADS_mask                               = 0xff << 8,
+	NUM_VS_THREADS_shift                              = 8,
+	NUM_GS_THREADS_mask                               = 0xff << 16,
+	NUM_GS_THREADS_shift                              = 16,
+	NUM_ES_THREADS_mask                               = 0xff << 24,
+	NUM_ES_THREADS_shift                              = 24,
+    SQ_STACK_RESOURCE_MGMT_1                              = 0x00008c10,
+	NUM_PS_STACK_ENTRIES_mask                         = 0xfff << 0,
+	NUM_PS_STACK_ENTRIES_shift                        = 0,
+	NUM_VS_STACK_ENTRIES_mask                         = 0xfff << 16,
+	NUM_VS_STACK_ENTRIES_shift                        = 16,
+    SQ_STACK_RESOURCE_MGMT_2                              = 0x00008c14,
+	NUM_GS_STACK_ENTRIES_mask                         = 0xfff << 0,
+	NUM_GS_STACK_ENTRIES_shift                        = 0,
+	NUM_ES_STACK_ENTRIES_mask                         = 0xfff << 16,
+	NUM_ES_STACK_ENTRIES_shift                        = 16,
+    SQ_ESGS_RING_BASE                                     = 0x00008c40,
+    SQ_ESGS_RING_SIZE                                     = 0x00008c44,
+    SQ_GSVS_RING_BASE                                     = 0x00008c48,
+    SQ_GSVS_RING_SIZE                                     = 0x00008c4c,
+    SQ_ESTMP_RING_BASE                                    = 0x00008c50,
+    SQ_ESTMP_RING_SIZE                                    = 0x00008c54,
+    SQ_GSTMP_RING_BASE                                    = 0x00008c58,
+    SQ_GSTMP_RING_SIZE                                    = 0x00008c5c,
+    SQ_VSTMP_RING_BASE                                    = 0x00008c60,
+    SQ_VSTMP_RING_SIZE                                    = 0x00008c64,
+    SQ_PSTMP_RING_BASE                                    = 0x00008c68,
+    SQ_PSTMP_RING_SIZE                                    = 0x00008c6c,
+    SQ_FBUF_RING_BASE                                     = 0x00008c70,
+    SQ_FBUF_RING_SIZE                                     = 0x00008c74,
+    SQ_REDUC_RING_BASE                                    = 0x00008c78,
+    SQ_REDUC_RING_SIZE                                    = 0x00008c7c,
+    SQ_ALU_WORD1_OP3                                      = 0x00008dfc,
+	SRC2_SEL_mask                                     = 0x1ff << 0,
+	SRC2_SEL_shift                                    = 0,
+	    SQ_ALU_SRC_0                                  = 0xf8,
+	    SQ_ALU_SRC_1                                  = 0xf9,
+	    SQ_ALU_SRC_1_INT                              = 0xfa,
+	    SQ_ALU_SRC_M_1_INT                            = 0xfb,
+	    SQ_ALU_SRC_0_5                                = 0xfc,
+	    SQ_ALU_SRC_LITERAL                            = 0xfd,
+	    SQ_ALU_SRC_PV                                 = 0xfe,
+	    SQ_ALU_SRC_PS                                 = 0xff,
+	SRC2_REL_bit                                      = 1 << 9,
+	SRC2_CHAN_mask                                    = 0x03 << 10,
+	SRC2_CHAN_shift                                   = 10,
+	    SQ_CHAN_X                                     = 0x00,
+	    SQ_CHAN_Y                                     = 0x01,
+	    SQ_CHAN_Z                                     = 0x02,
+	    SQ_CHAN_W                                     = 0x03,
+	SRC2_NEG_bit                                      = 1 << 12,
+	SQ_ALU_WORD1_OP3__ALU_INST_mask                   = 0x1f << 13,
+	SQ_ALU_WORD1_OP3__ALU_INST_shift                  = 13,
+	    SQ_OP3_INST_MUL_LIT                           = 0x0c,
+	    SQ_OP3_INST_MUL_LIT_M2                        = 0x0d,
+	    SQ_OP3_INST_MUL_LIT_M4                        = 0x0e,
+	    SQ_OP3_INST_MUL_LIT_D2                        = 0x0f,
+	    SQ_OP3_INST_MULADD                            = 0x10,
+	    SQ_OP3_INST_MULADD_M2                         = 0x11,
+	    SQ_OP3_INST_MULADD_M4                         = 0x12,
+	    SQ_OP3_INST_MULADD_D2                         = 0x13,
+	    SQ_OP3_INST_MULADD_IEEE                       = 0x14,
+	    SQ_OP3_INST_MULADD_IEEE_M2                    = 0x15,
+	    SQ_OP3_INST_MULADD_IEEE_M4                    = 0x16,
+	    SQ_OP3_INST_MULADD_IEEE_D2                    = 0x17,
+	    SQ_OP3_INST_CNDE                              = 0x18,
+	    SQ_OP3_INST_CNDGT                             = 0x19,
+	    SQ_OP3_INST_CNDGE                             = 0x1a,
+	    SQ_OP3_INST_CNDE_INT                          = 0x1c,
+	    SQ_OP3_INST_CNDGT_INT                         = 0x1d,
+	    SQ_OP3_INST_CNDGE_INT                         = 0x1e,
+    SQ_TEX_WORD2                                          = 0x00008dfc,
+	OFFSET_X_mask                                     = 0x1f << 0,
+	OFFSET_X_shift                                    = 0,
+	OFFSET_Y_mask                                     = 0x1f << 5,
+	OFFSET_Y_shift                                    = 5,
+	OFFSET_Z_mask                                     = 0x1f << 10,
+	OFFSET_Z_shift                                    = 10,
+	SAMPLER_ID_mask                                   = 0x1f << 15,
+	SAMPLER_ID_shift                                  = 15,
+	SQ_TEX_WORD2__SRC_SEL_X_mask                      = 0x07 << 20,
+	SQ_TEX_WORD2__SRC_SEL_X_shift                     = 20,
+	    SQ_SEL_X                                      = 0x00,
+	    SQ_SEL_Y                                      = 0x01,
+	    SQ_SEL_Z                                      = 0x02,
+	    SQ_SEL_W                                      = 0x03,
+	    SQ_SEL_0                                      = 0x04,
+	    SQ_SEL_1                                      = 0x05,
+	SRC_SEL_Y_mask                                    = 0x07 << 23,
+	SRC_SEL_Y_shift                                   = 23,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SRC_SEL_Z_mask                                    = 0x07 << 26,
+	SRC_SEL_Z_shift                                   = 26,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SRC_SEL_W_mask                                    = 0x07 << 29,
+	SRC_SEL_W_shift                                   = 29,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+    SQ_CF_ALLOC_EXPORT_WORD1                              = 0x00008dfc,
+	BURST_COUNT_mask                                  = 0x0f << 17,
+	BURST_COUNT_shift                                 = 17,
+	END_OF_PROGRAM_bit                                = 1 << 21,
+	VALID_PIXEL_MODE_bit                              = 1 << 22,
+	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_mask            = 0x7f << 23,
+	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_shift           = 23,
+	    SQ_CF_INST_MEM_STREAM0                        = 0x20,
+	    SQ_CF_INST_MEM_STREAM1                        = 0x21,
+	    SQ_CF_INST_MEM_STREAM2                        = 0x22,
+	    SQ_CF_INST_MEM_STREAM3                        = 0x23,
+	    SQ_CF_INST_MEM_SCRATCH                        = 0x24,
+	    SQ_CF_INST_MEM_REDUCTION                      = 0x25,
+	    SQ_CF_INST_MEM_RING                           = 0x26,
+	    SQ_CF_INST_EXPORT                             = 0x27,
+	    SQ_CF_INST_EXPORT_DONE                        = 0x28,
+	WHOLE_QUAD_MODE_bit                               = 1 << 30,
+	BARRIER_bit                                       = 1 << 31,
+    SQ_CF_ALU_WORD1                                       = 0x00008dfc,
+	KCACHE_MODE1_mask                                 = 0x03 << 0,
+	KCACHE_MODE1_shift                                = 0,
+	    SQ_CF_KCACHE_NOP                              = 0x00,
+	    SQ_CF_KCACHE_LOCK_1                           = 0x01,
+	    SQ_CF_KCACHE_LOCK_2                           = 0x02,
+	    SQ_CF_KCACHE_LOCK_LOOP_INDEX                  = 0x03,
+	KCACHE_ADDR0_mask                                 = 0xff << 2,
+	KCACHE_ADDR0_shift                                = 2,
+	KCACHE_ADDR1_mask                                 = 0xff << 10,
+	KCACHE_ADDR1_shift                                = 10,
+	SQ_CF_ALU_WORD1__COUNT_mask                       = 0x7f << 18,
+	SQ_CF_ALU_WORD1__COUNT_shift                      = 18,
+	SQ_CF_ALU_WORD1__ALT_CONST_bit                    = 1 << 25,
+	SQ_CF_ALU_WORD1__CF_INST_mask                     = 0x0f << 26,
+	SQ_CF_ALU_WORD1__CF_INST_shift                    = 26,
+	    SQ_CF_INST_ALU                                = 0x08,
+	    SQ_CF_INST_ALU_PUSH_BEFORE                    = 0x09,
+	    SQ_CF_INST_ALU_POP_AFTER                      = 0x0a,
+	    SQ_CF_INST_ALU_POP2_AFTER                     = 0x0b,
+	    SQ_CF_INST_ALU_CONTINUE                       = 0x0d,
+	    SQ_CF_INST_ALU_BREAK                          = 0x0e,
+	    SQ_CF_INST_ALU_ELSE_AFTER                     = 0x0f,
+/* 	WHOLE_QUAD_MODE_bit                               = 1 << 30, */
+/* 	BARRIER_bit                                       = 1 << 31, */
+    SQ_TEX_WORD1                                          = 0x00008dfc,
+	SQ_TEX_WORD1__DST_GPR_mask                        = 0x7f << 0,
+	SQ_TEX_WORD1__DST_GPR_shift                       = 0,
+	SQ_TEX_WORD1__DST_REL_bit                         = 1 << 7,
+	SQ_TEX_WORD1__DST_SEL_X_mask                      = 0x07 << 9,
+	SQ_TEX_WORD1__DST_SEL_X_shift                     = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	    SQ_SEL_MASK                                   = 0x07,
+	SQ_TEX_WORD1__DST_SEL_Y_mask                      = 0x07 << 12,
+	SQ_TEX_WORD1__DST_SEL_Y_shift                     = 12,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__DST_SEL_Z_mask                      = 0x07 << 15,
+	SQ_TEX_WORD1__DST_SEL_Z_shift                     = 15,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__DST_SEL_W_mask                      = 0x07 << 18,
+	SQ_TEX_WORD1__DST_SEL_W_shift                     = 18,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_TEX_WORD1__LOD_BIAS_mask                       = 0x7f << 21,
+	SQ_TEX_WORD1__LOD_BIAS_shift                      = 21,
+	COORD_TYPE_X_bit                                  = 1 << 28,
+	COORD_TYPE_Y_bit                                  = 1 << 29,
+	COORD_TYPE_Z_bit                                  = 1 << 30,
+	COORD_TYPE_W_bit                                  = 1 << 31,
+    SQ_VTX_WORD0                                          = 0x00008dfc,
+	VTX_INST_mask                                     = 0x1f << 0,
+	VTX_INST_shift                                    = 0,
+	    SQ_VTX_INST_FETCH                             = 0x00,
+	    SQ_VTX_INST_SEMANTIC                          = 0x01,
+	FETCH_TYPE_mask                                   = 0x03 << 5,
+	FETCH_TYPE_shift                                  = 5,
+	    SQ_VTX_FETCH_VERTEX_DATA                      = 0x00,
+	    SQ_VTX_FETCH_INSTANCE_DATA                    = 0x01,
+	    SQ_VTX_FETCH_NO_INDEX_OFFSET                  = 0x02,
+	FETCH_WHOLE_QUAD_bit                              = 1 << 7,
+	BUFFER_ID_mask                                    = 0xff << 8,
+	BUFFER_ID_shift                                   = 8,
+	SRC_GPR_mask                                      = 0x7f << 16,
+	SRC_GPR_shift                                     = 16,
+	SRC_REL_bit                                       = 1 << 23,
+	SQ_VTX_WORD0__SRC_SEL_X_mask                      = 0x03 << 24,
+	SQ_VTX_WORD0__SRC_SEL_X_shift                     = 24,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+	MEGA_FETCH_COUNT_mask                             = 0x3f << 26,
+	MEGA_FETCH_COUNT_shift                            = 26,
+    SQ_CF_ALLOC_EXPORT_WORD1_SWIZ                         = 0x00008dfc,
+	SEL_X_mask                                        = 0x07 << 0,
+	SEL_X_shift                                       = 0,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_Y_mask                                        = 0x07 << 3,
+	SEL_Y_shift                                       = 3,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_Z_mask                                        = 0x07 << 6,
+	SEL_Z_shift                                       = 6,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SEL_W_mask                                        = 0x07 << 9,
+	SEL_W_shift                                       = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+    SQ_ALU_WORD1                                          = 0x00008dfc,
+	ENCODING_mask                                     = 0x07 << 15,
+	ENCODING_shift                                    = 15,
+	BANK_SWIZZLE_mask                                 = 0x07 << 18,
+	BANK_SWIZZLE_shift                                = 18,
+	    SQ_ALU_VEC_012                                = 0x00,
+	    SQ_ALU_VEC_021                                = 0x01,
+	    SQ_ALU_VEC_120                                = 0x02,
+	    SQ_ALU_VEC_102                                = 0x03,
+	    SQ_ALU_VEC_201                                = 0x04,
+	    SQ_ALU_VEC_210                                = 0x05,
+	SQ_ALU_WORD1__DST_GPR_mask                        = 0x7f << 21,
+	SQ_ALU_WORD1__DST_GPR_shift                       = 21,
+	SQ_ALU_WORD1__DST_REL_bit                         = 1 << 28,
+	DST_CHAN_mask                                     = 0x03 << 29,
+	DST_CHAN_shift                                    = 29,
+	    CHAN_X                                        = 0x00,
+	    CHAN_Y                                        = 0x01,
+	    CHAN_Z                                        = 0x02,
+	    CHAN_W                                        = 0x03,
+	SQ_ALU_WORD1__CLAMP_bit                           = 1 << 31,
+    SQ_CF_ALU_WORD0                                       = 0x00008dfc,
+	SQ_CF_ALU_WORD0__ADDR_mask                        = 0x3fffff << 0,
+	SQ_CF_ALU_WORD0__ADDR_shift                       = 0,
+	KCACHE_BANK0_mask                                 = 0x0f << 22,
+	KCACHE_BANK0_shift                                = 22,
+	KCACHE_BANK1_mask                                 = 0x0f << 26,
+	KCACHE_BANK1_shift                                = 26,
+	KCACHE_MODE0_mask                                 = 0x03 << 30,
+	KCACHE_MODE0_shift                                = 30,
+/* 	    SQ_CF_KCACHE_NOP                              = 0x00, */
+/* 	    SQ_CF_KCACHE_LOCK_1                           = 0x01, */
+/* 	    SQ_CF_KCACHE_LOCK_2                           = 0x02, */
+/* 	    SQ_CF_KCACHE_LOCK_LOOP_INDEX                  = 0x03, */
+    SQ_VTX_WORD2                                          = 0x00008dfc,
+	SQ_VTX_WORD2__OFFSET_mask                         = 0xffff << 0,
+	SQ_VTX_WORD2__OFFSET_shift                        = 0,
+	SQ_VTX_WORD2__ENDIAN_SWAP_mask                    = 0x03 << 16,
+	SQ_VTX_WORD2__ENDIAN_SWAP_shift                   = 16,
+	    SQ_ENDIAN_NONE                                = 0x00,
+	    SQ_ENDIAN_8IN16                               = 0x01,
+	    SQ_ENDIAN_8IN32                               = 0x02,
+	CONST_BUF_NO_STRIDE_bit                           = 1 << 18,
+	MEGA_FETCH_bit                                    = 1 << 19,
+	SQ_VTX_WORD2__ALT_CONST_bit                       = 1 << 20,
+    SQ_ALU_WORD1_OP2_V2                                   = 0x00008dfc,
+	SRC0_ABS_bit                                      = 1 << 0,
+	SRC1_ABS_bit                                      = 1 << 1,
+	UPDATE_EXECUTE_MASK_bit                           = 1 << 2,
+	UPDATE_PRED_bit                                   = 1 << 3,
+	WRITE_MASK_bit                                    = 1 << 4,
+	SQ_ALU_WORD1_OP2_V2__OMOD_mask                    = 0x03 << 5,
+	SQ_ALU_WORD1_OP2_V2__OMOD_shift                   = 5,
+	    SQ_ALU_OMOD_OFF                               = 0x00,
+	    SQ_ALU_OMOD_M2                                = 0x01,
+	    SQ_ALU_OMOD_M4                                = 0x02,
+	    SQ_ALU_OMOD_D2                                = 0x03,
+	SQ_ALU_WORD1_OP2_V2__ALU_INST_mask                = 0x7ff << 7,
+	SQ_ALU_WORD1_OP2_V2__ALU_INST_shift               = 7,
+	    SQ_OP2_INST_ADD                               = 0x00,
+	    SQ_OP2_INST_MUL                               = 0x01,
+	    SQ_OP2_INST_MUL_IEEE                          = 0x02,
+	    SQ_OP2_INST_MAX                               = 0x03,
+	    SQ_OP2_INST_MIN                               = 0x04,
+	    SQ_OP2_INST_MAX_DX10                          = 0x05,
+	    SQ_OP2_INST_MIN_DX10                          = 0x06,
+	    SQ_OP2_INST_SETE                              = 0x08,
+	    SQ_OP2_INST_SETGT                             = 0x09,
+	    SQ_OP2_INST_SETGE                             = 0x0a,
+	    SQ_OP2_INST_SETNE                             = 0x0b,
+	    SQ_OP2_INST_SETE_DX10                         = 0x0c,
+	    SQ_OP2_INST_SETGT_DX10                        = 0x0d,
+	    SQ_OP2_INST_SETGE_DX10                        = 0x0e,
+	    SQ_OP2_INST_SETNE_DX10                        = 0x0f,
+	    SQ_OP2_INST_FRACT                             = 0x10,
+	    SQ_OP2_INST_TRUNC                             = 0x11,
+	    SQ_OP2_INST_CEIL                              = 0x12,
+	    SQ_OP2_INST_RNDNE                             = 0x13,
+	    SQ_OP2_INST_FLOOR                             = 0x14,
+	    SQ_OP2_INST_MOVA                              = 0x15,
+	    SQ_OP2_INST_MOVA_FLOOR                        = 0x16,
+	    SQ_OP2_INST_MOVA_INT                          = 0x18,
+	    SQ_OP2_INST_MOV                               = 0x19,
+	    SQ_OP2_INST_NOP                               = 0x1a,
+	    SQ_OP2_INST_PRED_SETGT_UINT                   = 0x1e,
+	    SQ_OP2_INST_PRED_SETGE_UINT                   = 0x1f,
+	    SQ_OP2_INST_PRED_SETE                         = 0x20,
+	    SQ_OP2_INST_PRED_SETGT                        = 0x21,
+	    SQ_OP2_INST_PRED_SETGE                        = 0x22,
+	    SQ_OP2_INST_PRED_SETNE                        = 0x23,
+	    SQ_OP2_INST_PRED_SET_INV                      = 0x24,
+	    SQ_OP2_INST_PRED_SET_POP                      = 0x25,
+	    SQ_OP2_INST_PRED_SET_CLR                      = 0x26,
+	    SQ_OP2_INST_PRED_SET_RESTORE                  = 0x27,
+	    SQ_OP2_INST_PRED_SETE_PUSH                    = 0x28,
+	    SQ_OP2_INST_PRED_SETGT_PUSH                   = 0x29,
+	    SQ_OP2_INST_PRED_SETGE_PUSH                   = 0x2a,
+	    SQ_OP2_INST_PRED_SETNE_PUSH                   = 0x2b,
+	    SQ_OP2_INST_KILLE                             = 0x2c,
+	    SQ_OP2_INST_KILLGT                            = 0x2d,
+	    SQ_OP2_INST_KILLGE                            = 0x2e,
+	    SQ_OP2_INST_KILLNE                            = 0x2f,
+	    SQ_OP2_INST_AND_INT                           = 0x30,
+	    SQ_OP2_INST_OR_INT                            = 0x31,
+	    SQ_OP2_INST_XOR_INT                           = 0x32,
+	    SQ_OP2_INST_NOT_INT                           = 0x33,
+	    SQ_OP2_INST_ADD_INT                           = 0x34,
+	    SQ_OP2_INST_SUB_INT                           = 0x35,
+	    SQ_OP2_INST_MAX_INT                           = 0x36,
+	    SQ_OP2_INST_MIN_INT                           = 0x37,
+	    SQ_OP2_INST_MAX_UINT                          = 0x38,
+	    SQ_OP2_INST_MIN_UINT                          = 0x39,
+	    SQ_OP2_INST_SETE_INT                          = 0x3a,
+	    SQ_OP2_INST_SETGT_INT                         = 0x3b,
+	    SQ_OP2_INST_SETGE_INT                         = 0x3c,
+	    SQ_OP2_INST_SETNE_INT                         = 0x3d,
+	    SQ_OP2_INST_SETGT_UINT                        = 0x3e,
+	    SQ_OP2_INST_SETGE_UINT                        = 0x3f,
+	    SQ_OP2_INST_KILLGT_UINT                       = 0x40,
+	    SQ_OP2_INST_KILLGE_UINT                       = 0x41,
+	    SQ_OP2_INST_PRED_SETE_INT                     = 0x42,
+	    SQ_OP2_INST_PRED_SETGT_INT                    = 0x43,
+	    SQ_OP2_INST_PRED_SETGE_INT                    = 0x44,
+	    SQ_OP2_INST_PRED_SETNE_INT                    = 0x45,
+	    SQ_OP2_INST_KILLE_INT                         = 0x46,
+	    SQ_OP2_INST_KILLGT_INT                        = 0x47,
+	    SQ_OP2_INST_KILLGE_INT                        = 0x48,
+	    SQ_OP2_INST_KILLNE_INT                        = 0x49,
+	    SQ_OP2_INST_PRED_SETE_PUSH_INT                = 0x4a,
+	    SQ_OP2_INST_PRED_SETGT_PUSH_INT               = 0x4b,
+	    SQ_OP2_INST_PRED_SETGE_PUSH_INT               = 0x4c,
+	    SQ_OP2_INST_PRED_SETNE_PUSH_INT               = 0x4d,
+	    SQ_OP2_INST_PRED_SETLT_PUSH_INT               = 0x4e,
+	    SQ_OP2_INST_PRED_SETLE_PUSH_INT               = 0x4f,
+	    SQ_OP2_INST_DOT4                              = 0x50,
+	    SQ_OP2_INST_DOT4_IEEE                         = 0x51,
+	    SQ_OP2_INST_CUBE                              = 0x52,
+	    SQ_OP2_INST_MAX4                              = 0x53,
+	    SQ_OP2_INST_MOVA_GPR_INT                      = 0x60,
+	    SQ_OP2_INST_EXP_IEEE                          = 0x61,
+	    SQ_OP2_INST_LOG_CLAMPED                       = 0x62,
+	    SQ_OP2_INST_LOG_IEEE                          = 0x63,
+	    SQ_OP2_INST_RECIP_CLAMPED                     = 0x64,
+	    SQ_OP2_INST_RECIP_FF                          = 0x65,
+	    SQ_OP2_INST_RECIP_IEEE                        = 0x66,
+	    SQ_OP2_INST_RECIPSQRT_CLAMPED                 = 0x67,
+	    SQ_OP2_INST_RECIPSQRT_FF                      = 0x68,
+	    SQ_OP2_INST_RECIPSQRT_IEEE                    = 0x69,
+	    SQ_OP2_INST_SQRT_IEEE                         = 0x6a,
+	    SQ_OP2_INST_FLT_TO_INT                        = 0x6b,
+	    SQ_OP2_INST_INT_TO_FLT                        = 0x6c,
+	    SQ_OP2_INST_UINT_TO_FLT                       = 0x6d,
+	    SQ_OP2_INST_SIN                               = 0x6e,
+	    SQ_OP2_INST_COS                               = 0x6f,
+	    SQ_OP2_INST_ASHR_INT                          = 0x70,
+	    SQ_OP2_INST_LSHR_INT                          = 0x71,
+	    SQ_OP2_INST_LSHL_INT                          = 0x72,
+	    SQ_OP2_INST_MULLO_INT                         = 0x73,
+	    SQ_OP2_INST_MULHI_INT                         = 0x74,
+	    SQ_OP2_INST_MULLO_UINT                        = 0x75,
+	    SQ_OP2_INST_MULHI_UINT                        = 0x76,
+	    SQ_OP2_INST_RECIP_INT                         = 0x77,
+	    SQ_OP2_INST_RECIP_UINT                        = 0x78,
+	    SQ_OP2_INST_FLT_TO_UINT                       = 0x79,
+    SQ_CF_ALLOC_EXPORT_WORD1_BUF                          = 0x00008dfc,
+	ARRAY_SIZE_mask                                   = 0xfff << 0,
+	ARRAY_SIZE_shift                                  = 0,
+	COMP_MASK_mask                                    = 0x0f << 12,
+	COMP_MASK_shift                                   = 12,
+    SQ_CF_WORD0                                           = 0x00008dfc,
+    SQ_CF_ALLOC_EXPORT_WORD0                              = 0x00008dfc,
+	ARRAY_BASE_mask                                   = 0x1fff << 0,
+	ARRAY_BASE_shift                                  = 0,
+	SQ_CF_ALLOC_EXPORT_WORD0__TYPE_mask               = 0x03 << 13,
+	SQ_CF_ALLOC_EXPORT_WORD0__TYPE_shift              = 13,
+	    SQ_EXPORT_PIXEL                               = 0x00,
+	    SQ_EXPORT_POS                                 = 0x01,
+	    SQ_EXPORT_PARAM                               = 0x02,
+	    X_UNUSED_FOR_SX_EXPORTS                       = 0x03,
+	RW_GPR_mask                                       = 0x7f << 15,
+	RW_GPR_shift                                      = 15,
+	RW_REL_bit                                        = 1 << 22,
+	INDEX_GPR_mask                                    = 0x7f << 23,
+	INDEX_GPR_shift                                   = 23,
+	ELEM_SIZE_mask                                    = 0x03 << 30,
+	ELEM_SIZE_shift                                   = 30,
+    SQ_VTX_WORD1                                          = 0x00008dfc,
+	SQ_VTX_WORD1__DST_SEL_X_mask                      = 0x07 << 9,
+	SQ_VTX_WORD1__DST_SEL_X_shift                     = 9,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_Y_mask                      = 0x07 << 12,
+	SQ_VTX_WORD1__DST_SEL_Y_shift                     = 12,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_Z_mask                      = 0x07 << 15,
+	SQ_VTX_WORD1__DST_SEL_Z_shift                     = 15,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	SQ_VTX_WORD1__DST_SEL_W_mask                      = 0x07 << 18,
+	SQ_VTX_WORD1__DST_SEL_W_shift                     = 18,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+/* 	    SQ_SEL_MASK                                   = 0x07, */
+	USE_CONST_FIELDS_bit                              = 1 << 21,
+	SQ_VTX_WORD1__DATA_FORMAT_mask                    = 0x3f << 22,
+	SQ_VTX_WORD1__DATA_FORMAT_shift                   = 22,
+	SQ_VTX_WORD1__NUM_FORMAT_ALL_mask                 = 0x03 << 28,
+	SQ_VTX_WORD1__NUM_FORMAT_ALL_shift                = 28,
+	    SQ_NUM_FORMAT_NORM                            = 0x00,
+	    SQ_NUM_FORMAT_INT                             = 0x01,
+	    SQ_NUM_FORMAT_SCALED                          = 0x02,
+	SQ_VTX_WORD1__FORMAT_COMP_ALL_bit                 = 1 << 30,
+	SQ_VTX_WORD1__SRF_MODE_ALL_bit                    = 1 << 31,
+    SQ_ALU_WORD1_OP2                                      = 0x00008dfc,
+/* 	SRC0_ABS_bit                                      = 1 << 0, */
+/* 	SRC1_ABS_bit                                      = 1 << 1, */
+/* 	UPDATE_EXECUTE_MASK_bit                           = 1 << 2, */
+/* 	UPDATE_PRED_bit                                   = 1 << 3, */
+/* 	WRITE_MASK_bit                                    = 1 << 4, */
+	FOG_MERGE_bit                                     = 1 << 5,
+	SQ_ALU_WORD1_OP2__OMOD_mask                       = 0x03 << 6,
+	SQ_ALU_WORD1_OP2__OMOD_shift                      = 6,
+/* 	    SQ_ALU_OMOD_OFF                               = 0x00, */
+/* 	    SQ_ALU_OMOD_M2                                = 0x01, */
+/* 	    SQ_ALU_OMOD_M4                                = 0x02, */
+/* 	    SQ_ALU_OMOD_D2                                = 0x03, */
+	SQ_ALU_WORD1_OP2__ALU_INST_mask                   = 0x3ff << 8,
+	SQ_ALU_WORD1_OP2__ALU_INST_shift                  = 8,
+/* 	    SQ_OP2_INST_ADD                               = 0x00, */
+/* 	    SQ_OP2_INST_MUL                               = 0x01, */
+/* 	    SQ_OP2_INST_MUL_IEEE                          = 0x02, */
+/* 	    SQ_OP2_INST_MAX                               = 0x03, */
+/* 	    SQ_OP2_INST_MIN                               = 0x04, */
+/* 	    SQ_OP2_INST_MAX_DX10                          = 0x05, */
+/* 	    SQ_OP2_INST_MIN_DX10                          = 0x06, */
+/* 	    SQ_OP2_INST_SETE                              = 0x08, */
+/* 	    SQ_OP2_INST_SETGT                             = 0x09, */
+/* 	    SQ_OP2_INST_SETGE                             = 0x0a, */
+/* 	    SQ_OP2_INST_SETNE                             = 0x0b, */
+/* 	    SQ_OP2_INST_SETE_DX10                         = 0x0c, */
+/* 	    SQ_OP2_INST_SETGT_DX10                        = 0x0d, */
+/* 	    SQ_OP2_INST_SETGE_DX10                        = 0x0e, */
+/* 	    SQ_OP2_INST_SETNE_DX10                        = 0x0f, */
+/* 	    SQ_OP2_INST_FRACT                             = 0x10, */
+/* 	    SQ_OP2_INST_TRUNC                             = 0x11, */
+/* 	    SQ_OP2_INST_CEIL                              = 0x12, */
+/* 	    SQ_OP2_INST_RNDNE                             = 0x13, */
+/* 	    SQ_OP2_INST_FLOOR                             = 0x14, */
+/* 	    SQ_OP2_INST_MOVA                              = 0x15, */
+/* 	    SQ_OP2_INST_MOVA_FLOOR                        = 0x16, */
+/* 	    SQ_OP2_INST_MOVA_INT                          = 0x18, */
+/* 	    SQ_OP2_INST_MOV                               = 0x19, */
+/* 	    SQ_OP2_INST_NOP                               = 0x1a, */
+/* 	    SQ_OP2_INST_PRED_SETGT_UINT                   = 0x1e, */
+/* 	    SQ_OP2_INST_PRED_SETGE_UINT                   = 0x1f, */
+/* 	    SQ_OP2_INST_PRED_SETE                         = 0x20, */
+/* 	    SQ_OP2_INST_PRED_SETGT                        = 0x21, */
+/* 	    SQ_OP2_INST_PRED_SETGE                        = 0x22, */
+/* 	    SQ_OP2_INST_PRED_SETNE                        = 0x23, */
+/* 	    SQ_OP2_INST_PRED_SET_INV                      = 0x24, */
+/* 	    SQ_OP2_INST_PRED_SET_POP                      = 0x25, */
+/* 	    SQ_OP2_INST_PRED_SET_CLR                      = 0x26, */
+/* 	    SQ_OP2_INST_PRED_SET_RESTORE                  = 0x27, */
+/* 	    SQ_OP2_INST_PRED_SETE_PUSH                    = 0x28, */
+/* 	    SQ_OP2_INST_PRED_SETGT_PUSH                   = 0x29, */
+/* 	    SQ_OP2_INST_PRED_SETGE_PUSH                   = 0x2a, */
+/* 	    SQ_OP2_INST_PRED_SETNE_PUSH                   = 0x2b, */
+/* 	    SQ_OP2_INST_KILLE                             = 0x2c, */
+/* 	    SQ_OP2_INST_KILLGT                            = 0x2d, */
+/* 	    SQ_OP2_INST_KILLGE                            = 0x2e, */
+/* 	    SQ_OP2_INST_KILLNE                            = 0x2f, */
+/* 	    SQ_OP2_INST_AND_INT                           = 0x30, */
+/* 	    SQ_OP2_INST_OR_INT                            = 0x31, */
+/* 	    SQ_OP2_INST_XOR_INT                           = 0x32, */
+/* 	    SQ_OP2_INST_NOT_INT                           = 0x33, */
+/* 	    SQ_OP2_INST_ADD_INT                           = 0x34, */
+/* 	    SQ_OP2_INST_SUB_INT                           = 0x35, */
+/* 	    SQ_OP2_INST_MAX_INT                           = 0x36, */
+/* 	    SQ_OP2_INST_MIN_INT                           = 0x37, */
+/* 	    SQ_OP2_INST_MAX_UINT                          = 0x38, */
+/* 	    SQ_OP2_INST_MIN_UINT                          = 0x39, */
+/* 	    SQ_OP2_INST_SETE_INT                          = 0x3a, */
+/* 	    SQ_OP2_INST_SETGT_INT                         = 0x3b, */
+/* 	    SQ_OP2_INST_SETGE_INT                         = 0x3c, */
+/* 	    SQ_OP2_INST_SETNE_INT                         = 0x3d, */
+/* 	    SQ_OP2_INST_SETGT_UINT                        = 0x3e, */
+/* 	    SQ_OP2_INST_SETGE_UINT                        = 0x3f, */
+/* 	    SQ_OP2_INST_KILLGT_UINT                       = 0x40, */
+/* 	    SQ_OP2_INST_KILLGE_UINT                       = 0x41, */
+/* 	    SQ_OP2_INST_PRED_SETE_INT                     = 0x42, */
+/* 	    SQ_OP2_INST_PRED_SETGT_INT                    = 0x43, */
+/* 	    SQ_OP2_INST_PRED_SETGE_INT                    = 0x44, */
+/* 	    SQ_OP2_INST_PRED_SETNE_INT                    = 0x45, */
+/* 	    SQ_OP2_INST_KILLE_INT                         = 0x46, */
+/* 	    SQ_OP2_INST_KILLGT_INT                        = 0x47, */
+/* 	    SQ_OP2_INST_KILLGE_INT                        = 0x48, */
+/* 	    SQ_OP2_INST_KILLNE_INT                        = 0x49, */
+/* 	    SQ_OP2_INST_PRED_SETE_PUSH_INT                = 0x4a, */
+/* 	    SQ_OP2_INST_PRED_SETGT_PUSH_INT               = 0x4b, */
+/* 	    SQ_OP2_INST_PRED_SETGE_PUSH_INT               = 0x4c, */
+/* 	    SQ_OP2_INST_PRED_SETNE_PUSH_INT               = 0x4d, */
+/* 	    SQ_OP2_INST_PRED_SETLT_PUSH_INT               = 0x4e, */
+/* 	    SQ_OP2_INST_PRED_SETLE_PUSH_INT               = 0x4f, */
+/* 	    SQ_OP2_INST_DOT4                              = 0x50, */
+/* 	    SQ_OP2_INST_DOT4_IEEE                         = 0x51, */
+/* 	    SQ_OP2_INST_CUBE                              = 0x52, */
+/* 	    SQ_OP2_INST_MAX4                              = 0x53, */
+/* 	    SQ_OP2_INST_MOVA_GPR_INT                      = 0x60, */
+/* 	    SQ_OP2_INST_EXP_IEEE                          = 0x61, */
+/* 	    SQ_OP2_INST_LOG_CLAMPED                       = 0x62, */
+/* 	    SQ_OP2_INST_LOG_IEEE                          = 0x63, */
+/* 	    SQ_OP2_INST_RECIP_CLAMPED                     = 0x64, */
+/* 	    SQ_OP2_INST_RECIP_FF                          = 0x65, */
+/* 	    SQ_OP2_INST_RECIP_IEEE                        = 0x66, */
+/* 	    SQ_OP2_INST_RECIPSQRT_CLAMPED                 = 0x67, */
+/* 	    SQ_OP2_INST_RECIPSQRT_FF                      = 0x68, */
+/* 	    SQ_OP2_INST_RECIPSQRT_IEEE                    = 0x69, */
+/* 	    SQ_OP2_INST_SQRT_IEEE                         = 0x6a, */
+/* 	    SQ_OP2_INST_FLT_TO_INT                        = 0x6b, */
+/* 	    SQ_OP2_INST_INT_TO_FLT                        = 0x6c, */
+/* 	    SQ_OP2_INST_UINT_TO_FLT                       = 0x6d, */
+/* 	    SQ_OP2_INST_SIN                               = 0x6e, */
+/* 	    SQ_OP2_INST_COS                               = 0x6f, */
+/* 	    SQ_OP2_INST_ASHR_INT                          = 0x70, */
+/* 	    SQ_OP2_INST_LSHR_INT                          = 0x71, */
+/* 	    SQ_OP2_INST_LSHL_INT                          = 0x72, */
+/* 	    SQ_OP2_INST_MULLO_INT                         = 0x73, */
+/* 	    SQ_OP2_INST_MULHI_INT                         = 0x74, */
+/* 	    SQ_OP2_INST_MULLO_UINT                        = 0x75, */
+/* 	    SQ_OP2_INST_MULHI_UINT                        = 0x76, */
+/* 	    SQ_OP2_INST_RECIP_INT                         = 0x77, */
+/* 	    SQ_OP2_INST_RECIP_UINT                        = 0x78, */
+/* 	    SQ_OP2_INST_FLT_TO_UINT                       = 0x79, */
+    SQ_CF_WORD1                                           = 0x00008dfc,
+	POP_COUNT_mask                                    = 0x07 << 0,
+	POP_COUNT_shift                                   = 0,
+	CF_CONST_mask                                     = 0x1f << 3,
+	CF_CONST_shift                                    = 3,
+	COND_mask                                         = 0x03 << 8,
+	COND_shift                                        = 8,
+	    SQ_CF_COND_ACTIVE                             = 0x00,
+	    SQ_CF_COND_FALSE                              = 0x01,
+	    SQ_CF_COND_BOOL                               = 0x02,
+	    SQ_CF_COND_NOT_BOOL                           = 0x03,
+	SQ_CF_WORD1__COUNT_mask                           = 0x07 << 10,
+	SQ_CF_WORD1__COUNT_shift                          = 10,
+	CALL_COUNT_mask                                   = 0x3f << 13,
+	CALL_COUNT_shift                                  = 13,
+	COUNT_3_bit                                       = 1 << 19,
+/* 	END_OF_PROGRAM_bit                                = 1 << 21, */
+/* 	VALID_PIXEL_MODE_bit                              = 1 << 22, */
+	SQ_CF_WORD1__CF_INST_mask                         = 0x7f << 23,
+	SQ_CF_WORD1__CF_INST_shift                        = 23,
+	    SQ_CF_INST_NOP                                = 0x00,
+	    SQ_CF_INST_TEX                                = 0x01,
+	    SQ_CF_INST_VTX                                = 0x02,
+	    SQ_CF_INST_VTX_TC                             = 0x03,
+	    SQ_CF_INST_LOOP_START                         = 0x04,
+	    SQ_CF_INST_LOOP_END                           = 0x05,
+	    SQ_CF_INST_LOOP_START_DX10                    = 0x06,
+	    SQ_CF_INST_LOOP_START_NO_AL                   = 0x07,
+	    SQ_CF_INST_LOOP_CONTINUE                      = 0x08,
+	    SQ_CF_INST_LOOP_BREAK                         = 0x09,
+	    SQ_CF_INST_JUMP                               = 0x0a,
+	    SQ_CF_INST_PUSH                               = 0x0b,
+	    SQ_CF_INST_PUSH_ELSE                          = 0x0c,
+	    SQ_CF_INST_ELSE                               = 0x0d,
+	    SQ_CF_INST_POP                                = 0x0e,
+	    SQ_CF_INST_POP_JUMP                           = 0x0f,
+	    SQ_CF_INST_POP_PUSH                           = 0x10,
+	    SQ_CF_INST_POP_PUSH_ELSE                      = 0x11,
+	    SQ_CF_INST_CALL                               = 0x12,
+	    SQ_CF_INST_CALL_FS                            = 0x13,
+	    SQ_CF_INST_RETURN                             = 0x14,
+	    SQ_CF_INST_EMIT_VERTEX                        = 0x15,
+	    SQ_CF_INST_EMIT_CUT_VERTEX                    = 0x16,
+	    SQ_CF_INST_CUT_VERTEX                         = 0x17,
+	    SQ_CF_INST_KILL                               = 0x18,
+/* 	WHOLE_QUAD_MODE_bit                               = 1 << 30, */
+/* 	BARRIER_bit                                       = 1 << 31, */
+    SQ_VTX_WORD1_SEM                                      = 0x00008dfc,
+	SEMANTIC_ID_mask                                  = 0xff << 0,
+	SEMANTIC_ID_shift                                 = 0,
+    SQ_TEX_WORD0                                          = 0x00008dfc,
+	TEX_INST_mask                                     = 0x1f << 0,
+	TEX_INST_shift                                    = 0,
+	    SQ_TEX_INST_VTX_FETCH                         = 0x00,
+	    SQ_TEX_INST_VTX_SEMANTIC                      = 0x01,
+	    SQ_TEX_INST_LD                                = 0x03,
+	    SQ_TEX_INST_GET_TEXTURE_RESINFO               = 0x04,
+	    SQ_TEX_INST_GET_NUMBER_OF_SAMPLES             = 0x05,
+	    SQ_TEX_INST_GET_LOD                           = 0x06,
+	    SQ_TEX_INST_GET_GRADIENTS_H                   = 0x07,
+	    SQ_TEX_INST_GET_GRADIENTS_V                   = 0x08,
+	    SQ_TEX_INST_GET_LERP                          = 0x09,
+	    SQ_TEX_INST_RESERVED_10                       = 0x0a,
+	    SQ_TEX_INST_SET_GRADIENTS_H                   = 0x0b,
+	    SQ_TEX_INST_SET_GRADIENTS_V                   = 0x0c,
+	    SQ_TEX_INST_PASS                              = 0x0d,
+	    X_Z_SET_INDEX_FOR_ARRAY_OF_CUBEMAPS           = 0x0e,
+	    SQ_TEX_INST_SAMPLE                            = 0x10,
+	    SQ_TEX_INST_SAMPLE_L                          = 0x11,
+	    SQ_TEX_INST_SAMPLE_LB                         = 0x12,
+	    SQ_TEX_INST_SAMPLE_LZ                         = 0x13,
+	    SQ_TEX_INST_SAMPLE_G                          = 0x14,
+	    SQ_TEX_INST_SAMPLE_G_L                        = 0x15,
+	    SQ_TEX_INST_SAMPLE_G_LB                       = 0x16,
+	    SQ_TEX_INST_SAMPLE_G_LZ                       = 0x17,
+	    SQ_TEX_INST_SAMPLE_C                          = 0x18,
+	    SQ_TEX_INST_SAMPLE_C_L                        = 0x19,
+	    SQ_TEX_INST_SAMPLE_C_LB                       = 0x1a,
+	    SQ_TEX_INST_SAMPLE_C_LZ                       = 0x1b,
+	    SQ_TEX_INST_SAMPLE_C_G                        = 0x1c,
+	    SQ_TEX_INST_SAMPLE_C_G_L                      = 0x1d,
+	    SQ_TEX_INST_SAMPLE_C_G_LB                     = 0x1e,
+	    SQ_TEX_INST_SAMPLE_C_G_LZ                     = 0x1f,
+	BC_FRAC_MODE_bit                                  = 1 << 5,
+/* 	FETCH_WHOLE_QUAD_bit                              = 1 << 7, */
+	RESOURCE_ID_mask                                  = 0xff << 8,
+	RESOURCE_ID_shift                                 = 8,
+/* 	SRC_GPR_mask                                      = 0x7f << 16, */
+/* 	SRC_GPR_shift                                     = 16, */
+/* 	SRC_REL_bit                                       = 1 << 23, */
+	SQ_TEX_WORD0__ALT_CONST_bit                       = 1 << 24,
+    SQ_VTX_WORD1_GPR                                      = 0x00008dfc,
+	SQ_VTX_WORD1_GPR__DST_GPR_mask                    = 0x7f << 0,
+	SQ_VTX_WORD1_GPR__DST_GPR_shift                   = 0,
+	SQ_VTX_WORD1_GPR__DST_REL_bit                     = 1 << 7,
+    SQ_ALU_WORD0                                          = 0x00008dfc,
+	SRC0_SEL_mask                                     = 0x1ff << 0,
+	SRC0_SEL_shift                                    = 0,
+/* 	    SQ_ALU_SRC_0                                  = 0xf8, */
+/* 	    SQ_ALU_SRC_1                                  = 0xf9, */
+/* 	    SQ_ALU_SRC_1_INT                              = 0xfa, */
+/* 	    SQ_ALU_SRC_M_1_INT                            = 0xfb, */
+/* 	    SQ_ALU_SRC_0_5                                = 0xfc, */
+/* 	    SQ_ALU_SRC_LITERAL                            = 0xfd, */
+/* 	    SQ_ALU_SRC_PV                                 = 0xfe, */
+/* 	    SQ_ALU_SRC_PS                                 = 0xff, */
+	SRC0_REL_bit                                      = 1 << 9,
+	SRC0_CHAN_mask                                    = 0x03 << 10,
+	SRC0_CHAN_shift                                   = 10,
+/* 	    SQ_CHAN_X                                     = 0x00, */
+/* 	    SQ_CHAN_Y                                     = 0x01, */
+/* 	    SQ_CHAN_Z                                     = 0x02, */
+/* 	    SQ_CHAN_W                                     = 0x03, */
+	SRC0_NEG_bit                                      = 1 << 12,
+	SRC1_SEL_mask                                     = 0x1ff << 13,
+	SRC1_SEL_shift                                    = 13,
+/* 	    SQ_ALU_SRC_0                                  = 0xf8, */
+/* 	    SQ_ALU_SRC_1                                  = 0xf9, */
+/* 	    SQ_ALU_SRC_1_INT                              = 0xfa, */
+/* 	    SQ_ALU_SRC_M_1_INT                            = 0xfb, */
+/* 	    SQ_ALU_SRC_0_5                                = 0xfc, */
+/* 	    SQ_ALU_SRC_LITERAL                            = 0xfd, */
+/* 	    SQ_ALU_SRC_PV                                 = 0xfe, */
+/* 	    SQ_ALU_SRC_PS                                 = 0xff, */
+	SRC1_REL_bit                                      = 1 << 22,
+	SRC1_CHAN_mask                                    = 0x03 << 23,
+	SRC1_CHAN_shift                                   = 23,
+/* 	    SQ_CHAN_X                                     = 0x00, */
+/* 	    SQ_CHAN_Y                                     = 0x01, */
+/* 	    SQ_CHAN_Z                                     = 0x02, */
+/* 	    SQ_CHAN_W                                     = 0x03, */
+	SRC1_NEG_bit                                      = 1 << 25,
+	INDEX_MODE_mask                                   = 0x07 << 26,
+	INDEX_MODE_shift                                  = 26,
+	    SQ_INDEX_AR_X                                 = 0x00,
+	    SQ_INDEX_AR_Y                                 = 0x01,
+	    SQ_INDEX_AR_Z                                 = 0x02,
+	    SQ_INDEX_AR_W                                 = 0x03,
+	    SQ_INDEX_LOOP                                 = 0x04,
+	PRED_SEL_mask                                     = 0x03 << 29,
+	PRED_SEL_shift                                    = 29,
+	    SQ_PRED_SEL_OFF                               = 0x00,
+	    SQ_PRED_SEL_ZERO                              = 0x02,
+	    SQ_PRED_SEL_ONE                               = 0x03,
+	LAST_bit                                          = 1 << 31,
+    SX_EXPORT_BUFFER_SIZES                                = 0x0000900c,
+	COLOR_BUFFER_SIZE_mask                            = 0xff << 0,
+	COLOR_BUFFER_SIZE_shift                           = 0,
+	POSITION_BUFFER_SIZE_mask                         = 0xff << 8,
+	POSITION_BUFFER_SIZE_shift                        = 8,
+	SMX_BUFFER_SIZE_mask                              = 0xff << 16,
+	SMX_BUFFER_SIZE_shift                             = 16,
+    SX_MEMORY_EXPORT_BASE                                 = 0x00009010,
+    SX_MEMORY_EXPORT_SIZE                                 = 0x00009014,
+    SPI_CONFIG_CNTL                                       = 0x00009100,
+	GPR_WRITE_PRIORITY_mask                           = 0x1f << 0,
+	GPR_WRITE_PRIORITY_shift                          = 0,
+	    X_PRIORITY_ORDER                              = 0x00,
+	    X_PRIORITY_ORDER_VS                           = 0x01,
+	DISABLE_INTERP_1_bit                              = 1 << 5,
+	DEBUG_THREAD_TYPE_SEL_mask                        = 0x03 << 6,
+	DEBUG_THREAD_TYPE_SEL_shift                       = 6,
+	DEBUG_GROUP_SEL_mask                              = 0x1f << 8,
+	DEBUG_GROUP_SEL_shift                             = 8,
+	DEBUG_GRBM_OVERRIDE_bit                           = 1 << 13,
+    SPI_CONFIG_CNTL_1                                     = 0x0000913c,
+	VTX_DONE_DELAY_mask                               = 0x0f << 0,
+	VTX_DONE_DELAY_shift                              = 0,
+	    X_DELAY_10_CLKS                               = 0x00,
+	    X_DELAY_11_CLKS                               = 0x01,
+	    X_DELAY_12_CLKS                               = 0x02,
+	    X_DELAY_13_CLKS                               = 0x03,
+	    X_DELAY_14_CLKS                               = 0x04,
+	    X_DELAY_15_CLKS                               = 0x05,
+	    X_DELAY_16_CLKS                               = 0x06,
+	    X_DELAY_17_CLKS                               = 0x07,
+	    X_DELAY_2_CLKS                                = 0x08,
+	    X_DELAY_3_CLKS                                = 0x09,
+	    X_DELAY_4_CLKS                                = 0x0a,
+	    X_DELAY_5_CLKS                                = 0x0b,
+	    X_DELAY_6_CLKS                                = 0x0c,
+	    X_DELAY_7_CLKS                                = 0x0d,
+	    X_DELAY_8_CLKS                                = 0x0e,
+	    X_DELAY_9_CLKS                                = 0x0f,
+	INTERP_ONE_PRIM_PER_ROW_bit                       = 1 << 4,
+    TD_FILTER4                                            = 0x00009400,
+	WEIGHT_1_mask                                     = 0x7ff << 0,
+	WEIGHT_1_shift                                    = 0,
+	WEIGHT_0_mask                                     = 0x7ff << 11,
+	WEIGHT_0_shift                                    = 11,
+	WEIGHT_PAIR_bit                                   = 1 << 22,
+	PHASE_mask                                        = 0x0f << 23,
+	PHASE_shift                                       = 23,
+	DIRECTION_bit                                     = 1 << 27,
+    TD_FILTER4_1                                          = 0x00009404,
+	TD_FILTER4_1_num                                  = 35,
+/* 	WEIGHT_1_mask                                     = 0x7ff << 0, */
+/* 	WEIGHT_1_shift                                    = 0, */
+/* 	WEIGHT_0_mask                                     = 0x7ff << 11, */
+/* 	WEIGHT_0_shift                                    = 11, */
+    TD_CNTL                                               = 0x00009490,
+	SYNC_PHASE_SH_mask                                = 0x03 << 0,
+	SYNC_PHASE_SH_shift                               = 0,
+	SYNC_PHASE_VC_SMX_mask                            = 0x03 << 4,
+	SYNC_PHASE_VC_SMX_shift                           = 4,
+    TD0_CNTL                                              = 0x00009494,
+	TD0_CNTL_num                                      = 4,
+	ID_OVERRIDE_mask                                  = 0x03 << 28,
+	ID_OVERRIDE_shift                                 = 28,
+    TD0_STATUS                                            = 0x000094a4,
+	TD0_STATUS_num                                    = 4,
+	BUSY_bit                                          = 1 << 31,
+    TA_CNTL                                               = 0x00009504,
+	GRADIENT_CREDIT_mask                              = 0x1f << 0,
+	GRADIENT_CREDIT_shift                             = 0,
+	WALKER_CREDIT_mask                                = 0x1f << 8,
+	WALKER_CREDIT_shift                               = 8,
+	ALIGNER_CREDIT_mask                               = 0x1f << 16,
+	ALIGNER_CREDIT_shift                              = 16,
+	TD_FIFO_CREDIT_mask                               = 0x3ff << 22,
+	TD_FIFO_CREDIT_shift                              = 22,
+    TA_CNTL_AUX                                           = 0x00009508,
+	DISABLE_CUBE_WRAP_bit                             = 1 << 0,
+	SYNC_GRADIENT_bit                                 = 1 << 24,
+	SYNC_WALKER_bit                                   = 1 << 25,
+	SYNC_ALIGNER_bit                                  = 1 << 26,
+	BILINEAR_PRECISION_bit                            = 1 << 31,
+    TA0_CNTL                                              = 0x00009510,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA1_CNTL                                              = 0x00009514,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA2_CNTL                                              = 0x00009518,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA3_CNTL                                              = 0x0000951c,
+/* 	ID_OVERRIDE_mask                                  = 0x03 << 28, */
+/* 	ID_OVERRIDE_shift                                 = 28, */
+    TA0_STATUS                                            = 0x00009520,
+	FG_PFIFO_EMPTYB_bit                               = 1 << 12,
+	FG_LFIFO_EMPTYB_bit                               = 1 << 13,
+	FG_SFIFO_EMPTYB_bit                               = 1 << 14,
+	FL_PFIFO_EMPTYB_bit                               = 1 << 16,
+	FL_LFIFO_EMPTYB_bit                               = 1 << 17,
+	FL_SFIFO_EMPTYB_bit                               = 1 << 18,
+	FA_PFIFO_EMPTYB_bit                               = 1 << 20,
+	FA_LFIFO_EMPTYB_bit                               = 1 << 21,
+	FA_SFIFO_EMPTYB_bit                               = 1 << 22,
+	IN_BUSY_bit                                       = 1 << 24,
+	FG_BUSY_bit                                       = 1 << 25,
+	FL_BUSY_bit                                       = 1 << 27,
+	TA_BUSY_bit                                       = 1 << 28,
+	FA_BUSY_bit                                       = 1 << 29,
+	AL_BUSY_bit                                       = 1 << 30,
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA1_STATUS                                            = 0x00009524,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA2_STATUS                                            = 0x00009528,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TA3_STATUS                                            = 0x0000952c,
+/* 	FG_PFIFO_EMPTYB_bit                               = 1 << 12, */
+/* 	FG_LFIFO_EMPTYB_bit                               = 1 << 13, */
+/* 	FG_SFIFO_EMPTYB_bit                               = 1 << 14, */
+/* 	FL_PFIFO_EMPTYB_bit                               = 1 << 16, */
+/* 	FL_LFIFO_EMPTYB_bit                               = 1 << 17, */
+/* 	FL_SFIFO_EMPTYB_bit                               = 1 << 18, */
+/* 	FA_PFIFO_EMPTYB_bit                               = 1 << 20, */
+/* 	FA_LFIFO_EMPTYB_bit                               = 1 << 21, */
+/* 	FA_SFIFO_EMPTYB_bit                               = 1 << 22, */
+/* 	IN_BUSY_bit                                       = 1 << 24, */
+/* 	FG_BUSY_bit                                       = 1 << 25, */
+/* 	FL_BUSY_bit                                       = 1 << 27, */
+/* 	TA_BUSY_bit                                       = 1 << 28, */
+/* 	FA_BUSY_bit                                       = 1 << 29, */
+/* 	AL_BUSY_bit                                       = 1 << 30, */
+/* 	BUSY_bit                                          = 1 << 31, */
+    TC_STATUS                                             = 0x00009600,
+	TC_BUSY_bit                                       = 1 << 0,
+    TC_INVALIDATE                                         = 0x00009604,
+	START_bit                                         = 1 << 0,
+    TC_CNTL                                               = 0x00009608,
+	FORCE_HIT_bit                                     = 1 << 0,
+	FORCE_MISS_bit                                    = 1 << 1,
+	L2_SIZE_mask                                      = 0x0f << 5,
+	L2_SIZE_shift                                     = 5,
+	    _256K                                         = 0x00,
+	    _224K                                         = 0x01,
+	    _192K                                         = 0x02,
+	    _160K                                         = 0x03,
+	    _128K                                         = 0x04,
+	    _96K                                          = 0x05,
+	    _64K                                          = 0x06,
+	    _32K                                          = 0x07,
+	L2_DISABLE_LATE_HIT_bit                           = 1 << 9,
+	DISABLE_VERT_PERF_bit                             = 1 << 10,
+	DISABLE_INVAL_BUSY_bit                            = 1 << 11,
+	DISABLE_INVAL_SAME_SURFACE_bit                    = 1 << 12,
+	PARTITION_MODE_mask                               = 0x03 << 13,
+	PARTITION_MODE_shift                              = 13,
+	    X_VERTEX                                      = 0x00,
+	MISS_ARB_MODE_bit                                 = 1 << 15,
+	HIT_ARB_MODE_bit                                  = 1 << 16,
+	DISABLE_WRITE_DELAY_bit                           = 1 << 17,
+	HIT_FIFO_DEPTH_bit                                = 1 << 18,
+    VC_CNTL                                               = 0x00009700,
+	L2_INVALIDATE_bit                                 = 1 << 0,
+	RESERVED_bit                                      = 1 << 1,
+	CC_FORCE_MISS_bit                                 = 1 << 2,
+	MI_CHAN_SEL_mask                                  = 0x03 << 3,
+	MI_CHAN_SEL_shift                                 = 3,
+	    X_MC0_USES_CH_0_1                             = 0x00,
+	    X_MC0_USES_CH_0_3                             = 0x01,
+	    X_VC_MC0_IS_ACTIVE                            = 0x02,
+	    X_VC_MC1_IS_DISABLED                          = 0x03,
+	MI_STEER_DISABLE_bit                              = 1 << 5,
+	MI_CREDIT_CTR_mask                                = 0x0f << 6,
+	MI_CREDIT_CTR_shift                               = 6,
+	MI_CREDIT_WE_bit                                  = 1 << 10,
+	MI_REQ_STALL_THLD_mask                            = 0x07 << 11,
+	MI_REQ_STALL_THLD_shift                           = 11,
+	    X_LATENCY_EXCEEDS_399_CLOCKS                  = 0x00,
+	    X_LATENCY_EXCEEDS_415_CLOCKS                  = 0x01,
+	    X_LATENCY_EXCEEDS_431_CLOCKS                  = 0x02,
+	    X_LATENCY_EXCEEDS_447_CLOCKS                  = 0x03,
+	    X_LATENCY_EXCEEDS_463_CLOCKS                  = 0x04,
+	    X_LATENCY_EXCEEDS_479_CLOCKS                  = 0x05,
+	    X_LATENCY_EXCEEDS_495_CLOCKS                  = 0x06,
+	    X_LATENCY_EXCEEDS_511_CLOCKS                  = 0x07,
+	VC_CNTL__MI_TIMESTAMP_RES_mask                    = 0x1f << 14,
+	VC_CNTL__MI_TIMESTAMP_RES_shift                   = 14,
+	    X_1X_SYSTEM_CLOCK                             = 0x00,
+	    X_2X_SYSTEM_CLOCK                             = 0x01,
+	    X_4X_SYSTEM_CLOCK                             = 0x02,
+	    X_8X_SYSTEM_CLOCK                             = 0x03,
+	    X_16X_SYSTEM_CLOCK                            = 0x04,
+	    X_32X_SYSTEM_CLOCK                            = 0x05,
+	    X_64X_SYSTEM_CLOCK                            = 0x06,
+	    X_128X_SYSTEM_CLOCK                           = 0x07,
+	    X_256X_SYSTEM_CLOCK                           = 0x08,
+	    X_512X_SYSTEM_CLOCK                           = 0x09,
+	    X_1024X_SYSTEM_CLOCK                          = 0x0a,
+	    X_2048X_SYSTEM_CLOCK                          = 0x0b,
+	    X_4092X_SYSTEM_CLOCK                          = 0x0c,
+	    X_8192X_SYSTEM_CLOCK                          = 0x0d,
+	    X_16384X_SYSTEM_CLOCK                         = 0x0e,
+	    X_32768X_SYSTEM_CLOCK                         = 0x0f,
+    VC_CNTL_STATUS                                        = 0x00009704,
+	RP_BUSY_bit                                       = 1 << 0,
+	RG_BUSY_bit                                       = 1 << 1,
+	VC_BUSY_bit                                       = 1 << 2,
+	CLAMP_DETECT_bit                                  = 1 << 3,
+    VC_CONFIG                                             = 0x00009718,
+	WRITE_DIS_bit                                     = 1 << 0,
+	GPR_DATA_PHASE_ADJ_mask                           = 0x07 << 1,
+	GPR_DATA_PHASE_ADJ_shift                          = 1,
+	    X_LATENCY_BASE_0_CYCLES                       = 0x00,
+	    X_LATENCY_BASE_1_CYCLES                       = 0x01,
+	    X_LATENCY_BASE_2_CYCLES                       = 0x02,
+	    X_LATENCY_BASE_3_CYCLES                       = 0x03,
+	TD_SIMD_SYNC_ADJ_mask                             = 0x07 << 4,
+	TD_SIMD_SYNC_ADJ_shift                            = 4,
+	    X_0_CYCLES_DELAY                              = 0x00,
+	    X_1_CYCLES_DELAY                              = 0x01,
+	    X_2_CYCLES_DELAY                              = 0x02,
+	    X_3_CYCLES_DELAY                              = 0x03,
+	    X_4_CYCLES_DELAY                              = 0x04,
+	    X_5_CYCLES_DELAY                              = 0x05,
+	    X_6_CYCLES_DELAY                              = 0x06,
+	    X_7_CYCLES_DELAY                              = 0x07,
+    SMX_DC_CTL0                                           = 0x0000a020,
+	WR_GATHER_STREAM0_bit                             = 1 << 0,
+	WR_GATHER_STREAM1_bit                             = 1 << 1,
+	WR_GATHER_STREAM2_bit                             = 1 << 2,
+	WR_GATHER_STREAM3_bit                             = 1 << 3,
+	WR_GATHER_SCRATCH_bit                             = 1 << 4,
+	WR_GATHER_REDUC_BUF_bit                           = 1 << 5,
+	WR_GATHER_RING_BUF_bit                            = 1 << 6,
+	WR_GATHER_F_BUF_bit                               = 1 << 7,
+	DISABLE_CACHES_bit                                = 1 << 8,
+	AUTO_FLUSH_INVAL_EN_bit                           = 1 << 10,
+	AUTO_FLUSH_EN_bit                                 = 1 << 11,
+	AUTO_FLUSH_CNT_mask                               = 0xffff << 12,
+	AUTO_FLUSH_CNT_shift                              = 12,
+	MC_RD_STALL_FACTOR_mask                           = 0x03 << 28,
+	MC_RD_STALL_FACTOR_shift                          = 28,
+	MC_WR_STALL_FACTOR_mask                           = 0x03 << 30,
+	MC_WR_STALL_FACTOR_shift                          = 30,
+    SMX_DC_CTL1                                           = 0x0000a024,
+	OP_FIFO_SKID_mask                                 = 0x7f << 0,
+	OP_FIFO_SKID_shift                                = 0,
+	CACHE_LINE_SIZE_bit                               = 1 << 8,
+	MULTI_FLUSH_MODE_bit                              = 1 << 9,
+	MULTI_FLUSH_REQ_ABORT_IDX_FIFO_SKID_mask          = 0x0f << 10,
+	MULTI_FLUSH_REQ_ABORT_IDX_FIFO_SKID_shift         = 10,
+	DISABLE_WR_GATHER_RD_HIT_FORCE_EVICT_bit          = 1 << 16,
+	DISABLE_WR_GATHER_RD_HIT_COMP_VLDS_CHECK_bit      = 1 << 17,
+	DISABLE_FLUSH_ES_ALSO_INVALS_bit                  = 1 << 18,
+	DISABLE_FLUSH_GS_ALSO_INVALS_bit                  = 1 << 19,
+    SMX_DC_CTL2                                           = 0x0000a028,
+	INVALIDATE_CACHES_bit                             = 1 << 0,
+	CACHES_INVALID_bit                                = 1 << 1,
+	CACHES_DIRTY_bit                                  = 1 << 2,
+	FLUSH_ALL_bit                                     = 1 << 4,
+	FLUSH_GS_THREADS_bit                              = 1 << 8,
+	FLUSH_ES_THREADS_bit                              = 1 << 9,
+    SMX_DC_MC_INTF_CTL                                    = 0x0000a02c,
+	MC_RD_REQ_CRED_mask                               = 0xff << 0,
+	MC_RD_REQ_CRED_shift                              = 0,
+	MC_WR_REQ_CRED_mask                               = 0xff << 16,
+	MC_WR_REQ_CRED_shift                              = 16,
+    TD_PS_SAMPLER0_BORDER_RED                             = 0x0000a400,
+	TD_PS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_PS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_PS_SAMPLER0_BORDER_GREEN                           = 0x0000a404,
+	TD_PS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_PS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_PS_SAMPLER0_BORDER_BLUE                            = 0x0000a408,
+	TD_PS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_PS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_PS_SAMPLER0_BORDER_ALPHA                           = 0x0000a40c,
+	TD_PS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_PS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_VS_SAMPLER0_BORDER_RED                             = 0x0000a600,
+	TD_VS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_VS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_VS_SAMPLER0_BORDER_GREEN                           = 0x0000a604,
+	TD_VS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_VS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_VS_SAMPLER0_BORDER_BLUE                            = 0x0000a608,
+	TD_VS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_VS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_VS_SAMPLER0_BORDER_ALPHA                           = 0x0000a60c,
+	TD_VS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_VS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_GS_SAMPLER0_BORDER_RED                             = 0x0000a800,
+	TD_GS_SAMPLER0_BORDER_RED_num                     = 18,
+	TD_GS_SAMPLER0_BORDER_RED_offset                  = 16,
+    TD_GS_SAMPLER0_BORDER_GREEN                           = 0x0000a804,
+	TD_GS_SAMPLER0_BORDER_GREEN_num                   = 18,
+	TD_GS_SAMPLER0_BORDER_GREEN_offset                = 16,
+    TD_GS_SAMPLER0_BORDER_BLUE                            = 0x0000a808,
+	TD_GS_SAMPLER0_BORDER_BLUE_num                    = 18,
+	TD_GS_SAMPLER0_BORDER_BLUE_offset                 = 16,
+    TD_GS_SAMPLER0_BORDER_ALPHA                           = 0x0000a80c,
+	TD_GS_SAMPLER0_BORDER_ALPHA_num                   = 18,
+	TD_GS_SAMPLER0_BORDER_ALPHA_offset                = 16,
+    TD_PS_SAMPLER0_CLEARTYPE_KERNEL                       = 0x0000aa00,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL_num               = 18,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__WIDTH_mask       = 0x07 << 0,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__WIDTH_shift      = 0,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__HEIGHT_mask      = 0x07 << 3,
+	TD_PS_SAMPLER0_CLEARTYPE_KERNEL__HEIGHT_shift     = 3,
+    DB_DEPTH_SIZE                                         = 0x00028000,
+	PITCH_TILE_MAX_mask                               = 0x3ff << 0,
+	PITCH_TILE_MAX_shift                              = 0,
+	SLICE_TILE_MAX_mask                               = 0xfffff << 10,
+	SLICE_TILE_MAX_shift                              = 10,
+    DB_DEPTH_VIEW                                         = 0x00028004,
+	SLICE_START_mask                                  = 0x7ff << 0,
+	SLICE_START_shift                                 = 0,
+	SLICE_MAX_mask                                    = 0x7ff << 13,
+	SLICE_MAX_shift                                   = 13,
+    DB_DEPTH_BASE                                         = 0x0002800c,
+    DB_DEPTH_INFO                                         = 0x00028010,
+	DB_DEPTH_INFO__FORMAT_mask                        = 0x07 << 0,
+	DB_DEPTH_INFO__FORMAT_shift                       = 0,
+	    DEPTH_INVALID                                 = 0x00,
+	    DEPTH_16                                      = 0x01,
+	    DEPTH_X8_24                                   = 0x02,
+	    DEPTH_8_24                                    = 0x03,
+	    DEPTH_X8_24_FLOAT                             = 0x04,
+	    DEPTH_8_24_FLOAT                              = 0x05,
+	    DEPTH_32_FLOAT                                = 0x06,
+	    DEPTH_X24_8_32_FLOAT                          = 0x07,
+	DB_DEPTH_INFO__READ_SIZE_bit                      = 1 << 3,
+	DB_DEPTH_INFO__ARRAY_MODE_mask                    = 0x0f << 15,
+	DB_DEPTH_INFO__ARRAY_MODE_shift                   = 15,
+	    ARRAY_2D_TILED_THIN1                          = 0x04,
+	TILE_SURFACE_ENABLE_bit                           = 1 << 25,
+	TILE_COMPACT_bit                                  = 1 << 26,
+	ZRANGE_PRECISION_bit                              = 1 << 31,
+    DB_HTILE_DATA_BASE                                    = 0x00028014,
+    DB_STENCIL_CLEAR                                      = 0x00028028,
+	DB_STENCIL_CLEAR__CLEAR_mask                      = 0xff << 0,
+	DB_STENCIL_CLEAR__CLEAR_shift                     = 0,
+	MIN_mask                                          = 0xff << 16,
+	MIN_shift                                         = 16,
+    DB_DEPTH_CLEAR                                        = 0x0002802c,
+    PA_SC_SCREEN_SCISSOR_TL                               = 0x00028030,
+	PA_SC_SCREEN_SCISSOR_TL__TL_X_mask                = 0x7fff << 0,
+	PA_SC_SCREEN_SCISSOR_TL__TL_X_shift               = 0,
+	PA_SC_SCREEN_SCISSOR_TL__TL_Y_mask                = 0x7fff << 16,
+	PA_SC_SCREEN_SCISSOR_TL__TL_Y_shift               = 16,
+    PA_SC_SCREEN_SCISSOR_BR                               = 0x00028034,
+	PA_SC_SCREEN_SCISSOR_BR__BR_X_mask                = 0x7fff << 0,
+	PA_SC_SCREEN_SCISSOR_BR__BR_X_shift               = 0,
+	PA_SC_SCREEN_SCISSOR_BR__BR_Y_mask                = 0x7fff << 16,
+	PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift               = 16,
+    CB_COLOR0_BASE                                        = 0x00028040,
+	CB_COLOR0_BASE_num                                = 8,
+    CB_COLOR0_SIZE                                        = 0x00028060,
+	CB_COLOR0_SIZE_num                                = 8,
+/* 	PITCH_TILE_MAX_mask                               = 0x3ff << 0, */
+/* 	PITCH_TILE_MAX_shift                              = 0, */
+/* 	SLICE_TILE_MAX_mask                               = 0xfffff << 10, */
+/* 	SLICE_TILE_MAX_shift                              = 10, */
+    CB_COLOR0_VIEW                                        = 0x00028080,
+	CB_COLOR0_VIEW_num                                = 8,
+/* 	SLICE_START_mask                                  = 0x7ff << 0, */
+/* 	SLICE_START_shift                                 = 0, */
+/* 	SLICE_MAX_mask                                    = 0x7ff << 13, */
+/* 	SLICE_MAX_shift                                   = 13, */
+    CB_COLOR0_INFO                                        = 0x000280a0,
+	CB_COLOR0_INFO_num                                = 8,
+	ENDIAN_mask                                       = 0x03 << 0,
+	ENDIAN_shift                                      = 0,
+	    ENDIAN_NONE                                   = 0x00,
+	    ENDIAN_8IN16                                  = 0x01,
+	    ENDIAN_8IN32                                  = 0x02,
+	    ENDIAN_8IN64                                  = 0x03,
+	CB_COLOR0_INFO__FORMAT_mask                       = 0x3f << 2,
+	CB_COLOR0_INFO__FORMAT_shift                      = 2,
+	    COLOR_INVALID                                 = 0x00,
+	    COLOR_8                                       = 0x01,
+	    COLOR_4_4                                     = 0x02,
+	    COLOR_3_3_2                                   = 0x03,
+	    COLOR_16                                      = 0x05,
+	    COLOR_16_FLOAT                                = 0x06,
+	    COLOR_8_8                                     = 0x07,
+	    COLOR_5_6_5                                   = 0x08,
+	    COLOR_6_5_5                                   = 0x09,
+	    COLOR_1_5_5_5                                 = 0x0a,
+	    COLOR_4_4_4_4                                 = 0x0b,
+	    COLOR_5_5_5_1                                 = 0x0c,
+	    COLOR_32                                      = 0x0d,
+	    COLOR_32_FLOAT                                = 0x0e,
+	    COLOR_16_16                                   = 0x0f,
+	    COLOR_16_16_FLOAT                             = 0x10,
+	    COLOR_8_24                                    = 0x11,
+	    COLOR_8_24_FLOAT                              = 0x12,
+	    COLOR_24_8                                    = 0x13,
+	    COLOR_24_8_FLOAT                              = 0x14,
+	    COLOR_10_11_11                                = 0x15,
+	    COLOR_10_11_11_FLOAT                          = 0x16,
+	    COLOR_11_11_10                                = 0x17,
+	    COLOR_11_11_10_FLOAT                          = 0x18,
+	    COLOR_2_10_10_10                              = 0x19,
+	    COLOR_8_8_8_8                                 = 0x1a,
+	    COLOR_10_10_10_2                              = 0x1b,
+	    COLOR_X24_8_32_FLOAT                          = 0x1c,
+	    COLOR_32_32                                   = 0x1d,
+	    COLOR_32_32_FLOAT                             = 0x1e,
+	    COLOR_16_16_16_16                             = 0x1f,
+	    COLOR_16_16_16_16_FLOAT                       = 0x20,
+	    COLOR_32_32_32_32                             = 0x22,
+	    COLOR_32_32_32_32_FLOAT                       = 0x23,
+	CB_COLOR0_INFO__ARRAY_MODE_mask                   = 0x0f << 8,
+	CB_COLOR0_INFO__ARRAY_MODE_shift                  = 8,
+	    ARRAY_LINEAR_GENERAL                          = 0x00,
+	    ARRAY_LINEAR_ALIGNED                          = 0x01,
+/* 	    ARRAY_2D_TILED_THIN1                          = 0x04, */
+	NUMBER_TYPE_mask                                  = 0x07 << 12,
+	NUMBER_TYPE_shift                                 = 12,
+	    NUMBER_UNORM                                  = 0x00,
+	    NUMBER_SNORM                                  = 0x01,
+	    NUMBER_USCALED                                = 0x02,
+	    NUMBER_SSCALED                                = 0x03,
+	    NUMBER_UINT                                   = 0x04,
+	    NUMBER_SINT                                   = 0x05,
+	    NUMBER_SRGB                                   = 0x06,
+	    NUMBER_FLOAT                                  = 0x07,
+	CB_COLOR0_INFO__READ_SIZE_bit                     = 1 << 15,
+	COMP_SWAP_mask                                    = 0x03 << 16,
+	COMP_SWAP_shift                                   = 16,
+	    SWAP_STD                                      = 0x00,
+	    SWAP_ALT                                      = 0x01,
+	    SWAP_STD_REV                                  = 0x02,
+	    SWAP_ALT_REV                                  = 0x03,
+	CB_COLOR0_INFO__TILE_MODE_mask                    = 0x03 << 18,
+	CB_COLOR0_INFO__TILE_MODE_shift                   = 18,
+	    TILE_DISABLE                                  = 0x00,
+	    TILE_CLEAR_ENABLE                             = 0x01,
+	    TILE_FRAG_ENABLE                              = 0x02,
+	BLEND_CLAMP_bit                                   = 1 << 20,
+	CLEAR_COLOR_bit                                   = 1 << 21,
+	BLEND_BYPASS_bit                                  = 1 << 22,
+	BLEND_FLOAT32_bit                                 = 1 << 23,
+	SIMPLE_FLOAT_bit                                  = 1 << 24,
+	CB_COLOR0_INFO__ROUND_MODE_bit                    = 1 << 25,
+/* 	TILE_COMPACT_bit                                  = 1 << 26, */
+	SOURCE_FORMAT_bit                                 = 1 << 27,
+    CB_COLOR0_TILE                                        = 0x000280c0,
+	CB_COLOR0_TILE_num                                = 8,
+    CB_COLOR0_FRAG                                        = 0x000280e0,
+	CB_COLOR0_FRAG_num                                = 8,
+    CB_COLOR0_MASK                                        = 0x00028100,
+	CB_COLOR0_MASK_num                                = 8,
+	CMASK_BLOCK_MAX_mask                              = 0xfff << 0,
+	CMASK_BLOCK_MAX_shift                             = 0,
+	FMASK_TILE_MAX_mask                               = 0xfffff << 12,
+	FMASK_TILE_MAX_shift                              = 12,
+    CB_CLEAR_RED                                          = 0x00028120,
+    CB_CLEAR_GREEN                                        = 0x00028124,
+    CB_CLEAR_BLUE                                         = 0x00028128,
+    CB_CLEAR_ALPHA                                        = 0x0002812c,
+    SQ_ALU_CONST_BUFFER_SIZE_PS_0                         = 0x00028140,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_PS_0__DATA_shift         = 0,
+    SQ_ALU_CONST_BUFFER_SIZE_VS_0                         = 0x00028180,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_VS_0__DATA_shift         = 0,
+    SQ_ALU_CONST_BUFFER_SIZE_GS_0                         = 0x000281c0,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0_num                 = 16,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0__DATA_mask          = 0x1ff << 0,
+	SQ_ALU_CONST_BUFFER_SIZE_GS_0__DATA_shift         = 0,
+    PA_SC_WINDOW_OFFSET                                   = 0x00028200,
+	WINDOW_X_OFFSET_mask                              = 0x7fff << 0,
+	WINDOW_X_OFFSET_shift                             = 0,
+	WINDOW_Y_OFFSET_mask                              = 0x7fff << 16,
+	WINDOW_Y_OFFSET_shift                             = 16,
+    PA_SC_WINDOW_SCISSOR_TL                               = 0x00028204,
+	PA_SC_WINDOW_SCISSOR_TL__TL_X_mask                = 0x3fff << 0,
+	PA_SC_WINDOW_SCISSOR_TL__TL_X_shift               = 0,
+	PA_SC_WINDOW_SCISSOR_TL__TL_Y_mask                = 0x3fff << 16,
+	PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift               = 16,
+	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31,
+    PA_SC_WINDOW_SCISSOR_BR                               = 0x00028208,
+	PA_SC_WINDOW_SCISSOR_BR__BR_X_mask                = 0x3fff << 0,
+	PA_SC_WINDOW_SCISSOR_BR__BR_X_shift               = 0,
+	PA_SC_WINDOW_SCISSOR_BR__BR_Y_mask                = 0x3fff << 16,
+	PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift               = 16,
+    PA_SC_CLIPRECT_RULE                                   = 0x0002820c,
+	CLIP_RULE_mask                                    = 0xffff << 0,
+	CLIP_RULE_shift                                   = 0,
+    PA_SC_CLIPRECT_0_TL                                   = 0x00028210,
+	PA_SC_CLIPRECT_0_TL_num                           = 4,
+	PA_SC_CLIPRECT_0_TL_offset                        = 8,
+	PA_SC_CLIPRECT_0_TL__TL_X_mask                    = 0x3fff << 0,
+	PA_SC_CLIPRECT_0_TL__TL_X_shift                   = 0,
+	PA_SC_CLIPRECT_0_TL__TL_Y_mask                    = 0x3fff << 16,
+	PA_SC_CLIPRECT_0_TL__TL_Y_shift                   = 16,
+    PA_SC_CLIPRECT_0_BR                                   = 0x00028214,
+	PA_SC_CLIPRECT_0_BR_num                           = 4,
+	PA_SC_CLIPRECT_0_BR_offset                        = 8,
+	PA_SC_CLIPRECT_0_BR__BR_X_mask                    = 0x3fff << 0,
+	PA_SC_CLIPRECT_0_BR__BR_X_shift                   = 0,
+	PA_SC_CLIPRECT_0_BR__BR_Y_mask                    = 0x3fff << 16,
+	PA_SC_CLIPRECT_0_BR__BR_Y_shift                   = 16,
+    CB_TARGET_MASK                                        = 0x00028238,
+	TARGET0_ENABLE_mask                               = 0x0f << 0,
+	TARGET0_ENABLE_shift                              = 0,
+	TARGET1_ENABLE_mask                               = 0x0f << 4,
+	TARGET1_ENABLE_shift                              = 4,
+	TARGET2_ENABLE_mask                               = 0x0f << 8,
+	TARGET2_ENABLE_shift                              = 8,
+	TARGET3_ENABLE_mask                               = 0x0f << 12,
+	TARGET3_ENABLE_shift                              = 12,
+	TARGET4_ENABLE_mask                               = 0x0f << 16,
+	TARGET4_ENABLE_shift                              = 16,
+	TARGET5_ENABLE_mask                               = 0x0f << 20,
+	TARGET5_ENABLE_shift                              = 20,
+	TARGET6_ENABLE_mask                               = 0x0f << 24,
+	TARGET6_ENABLE_shift                              = 24,
+	TARGET7_ENABLE_mask                               = 0x0f << 28,
+	TARGET7_ENABLE_shift                              = 28,
+    CB_SHADER_MASK                                        = 0x0002823c,
+	OUTPUT0_ENABLE_mask                               = 0x0f << 0,
+	OUTPUT0_ENABLE_shift                              = 0,
+	OUTPUT1_ENABLE_mask                               = 0x0f << 4,
+	OUTPUT1_ENABLE_shift                              = 4,
+	OUTPUT2_ENABLE_mask                               = 0x0f << 8,
+	OUTPUT2_ENABLE_shift                              = 8,
+	OUTPUT3_ENABLE_mask                               = 0x0f << 12,
+	OUTPUT3_ENABLE_shift                              = 12,
+	OUTPUT4_ENABLE_mask                               = 0x0f << 16,
+	OUTPUT4_ENABLE_shift                              = 16,
+	OUTPUT5_ENABLE_mask                               = 0x0f << 20,
+	OUTPUT5_ENABLE_shift                              = 20,
+	OUTPUT6_ENABLE_mask                               = 0x0f << 24,
+	OUTPUT6_ENABLE_shift                              = 24,
+	OUTPUT7_ENABLE_mask                               = 0x0f << 28,
+	OUTPUT7_ENABLE_shift                              = 28,
+    PA_SC_GENERIC_SCISSOR_TL                              = 0x00028240,
+	PA_SC_GENERIC_SCISSOR_TL__TL_X_mask               = 0x3fff << 0,
+	PA_SC_GENERIC_SCISSOR_TL__TL_X_shift              = 0,
+	PA_SC_GENERIC_SCISSOR_TL__TL_Y_mask               = 0x3fff << 16,
+	PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift              = 16,
+/* 	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31, */
+    PA_SC_GENERIC_SCISSOR_BR                              = 0x00028244,
+	PA_SC_GENERIC_SCISSOR_BR__BR_X_mask               = 0x3fff << 0,
+	PA_SC_GENERIC_SCISSOR_BR__BR_X_shift              = 0,
+	PA_SC_GENERIC_SCISSOR_BR__BR_Y_mask               = 0x3fff << 16,
+	PA_SC_GENERIC_SCISSOR_BR__BR_Y_shift              = 16,
+    PA_SC_VPORT_SCISSOR_0_TL                              = 0x00028250,
+	PA_SC_VPORT_SCISSOR_0_TL_num                      = 16,
+	PA_SC_VPORT_SCISSOR_0_TL_offset                   = 8,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_X_mask               = 0x3fff << 0,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift              = 0,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_Y_mask               = 0x3fff << 16,
+	PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift              = 16,
+/* 	WINDOW_OFFSET_DISABLE_bit                         = 1 << 31, */
+    PA_SC_VPORT_SCISSOR_0_BR                              = 0x00028254,
+	PA_SC_VPORT_SCISSOR_0_BR_num                      = 16,
+	PA_SC_VPORT_SCISSOR_0_BR_offset                   = 8,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_X_mask               = 0x3fff << 0,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift              = 0,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_Y_mask               = 0x3fff << 16,
+	PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift              = 16,
+    PA_SC_VPORT_ZMIN_0                                    = 0x000282d0,
+	PA_SC_VPORT_ZMIN_0_num                            = 16,
+	PA_SC_VPORT_ZMIN_0_offset                         = 8,
+    PA_SC_VPORT_ZMAX_0                                    = 0x000282d4,
+	PA_SC_VPORT_ZMAX_0_num                            = 16,
+	PA_SC_VPORT_ZMAX_0_offset                         = 8,
+    SX_MISC                                               = 0x00028350,
+	MULTIPASS_bit                                     = 1 << 0,
+    SQ_VTX_SEMANTIC_0                                     = 0x00028380,
+	SQ_VTX_SEMANTIC_0_num                             = 32,
+/* 	SEMANTIC_ID_mask                                  = 0xff << 0, */
+/* 	SEMANTIC_ID_shift                                 = 0, */
+    VGT_MAX_VTX_INDX                                      = 0x00028400,
+    VGT_MIN_VTX_INDX                                      = 0x00028404,
+    VGT_INDX_OFFSET                                       = 0x00028408,
+    VGT_MULTI_PRIM_IB_RESET_INDX                          = 0x0002840c,
+    SX_ALPHA_TEST_CONTROL                                 = 0x00028410,
+	ALPHA_FUNC_mask                                   = 0x07 << 0,
+	ALPHA_FUNC_shift                                  = 0,
+	    REF_NEVER                                     = 0x00,
+	    REF_LESS                                      = 0x01,
+	    REF_EQUAL                                     = 0x02,
+	    REF_LEQUAL                                    = 0x03,
+	    REF_GREATER                                   = 0x04,
+	    REF_NOTEQUAL                                  = 0x05,
+	    REF_GEQUAL                                    = 0x06,
+	    REF_ALWAYS                                    = 0x07,
+	ALPHA_TEST_ENABLE_bit                             = 1 << 3,
+	ALPHA_TEST_BYPASS_bit                             = 1 << 8,
+    CB_BLEND_RED                                          = 0x00028414,
+    CB_BLEND_GREEN                                        = 0x00028418,
+    CB_BLEND_BLUE                                         = 0x0002841c,
+    CB_BLEND_ALPHA                                        = 0x00028420,
+    CB_FOG_RED                                            = 0x00028424,
+    CB_FOG_GREEN                                          = 0x00028428,
+    CB_FOG_BLUE                                           = 0x0002842c,
+    DB_STENCILREFMASK                                     = 0x00028430,
+	STENCILREF_mask                                   = 0xff << 0,
+	STENCILREF_shift                                  = 0,
+	STENCILMASK_mask                                  = 0xff << 8,
+	STENCILMASK_shift                                 = 8,
+	STENCILWRITEMASK_mask                             = 0xff << 16,
+	STENCILWRITEMASK_shift                            = 16,
+    DB_STENCILREFMASK_BF                                  = 0x00028434,
+	STENCILREF_BF_mask                                = 0xff << 0,
+	STENCILREF_BF_shift                               = 0,
+	STENCILMASK_BF_mask                               = 0xff << 8,
+	STENCILMASK_BF_shift                              = 8,
+	STENCILWRITEMASK_BF_mask                          = 0xff << 16,
+	STENCILWRITEMASK_BF_shift                         = 16,
+    SX_ALPHA_REF                                          = 0x00028438,
+    PA_CL_VPORT_XSCALE_0                                  = 0x0002843c,
+	PA_CL_VPORT_XSCALE_0_num                          = 16,
+	PA_CL_VPORT_XSCALE_0_offset                       = 24,
+    PA_CL_VPORT_XOFFSET_0                                 = 0x00028440,
+	PA_CL_VPORT_XOFFSET_0_num                         = 16,
+	PA_CL_VPORT_XOFFSET_0_offset                      = 24,
+    PA_CL_VPORT_YSCALE_0                                  = 0x00028444,
+	PA_CL_VPORT_YSCALE_0_num                          = 16,
+	PA_CL_VPORT_YSCALE_0_offset                       = 24,
+    PA_CL_VPORT_YOFFSET_0                                 = 0x00028448,
+	PA_CL_VPORT_YOFFSET_0_num                         = 16,
+	PA_CL_VPORT_YOFFSET_0_offset                      = 24,
+    PA_CL_VPORT_ZSCALE_0                                  = 0x0002844c,
+	PA_CL_VPORT_ZSCALE_0_num                          = 16,
+	PA_CL_VPORT_ZSCALE_0_offset                       = 24,
+    PA_CL_VPORT_ZOFFSET_0                                 = 0x00028450,
+	PA_CL_VPORT_ZOFFSET_0_num                         = 16,
+	PA_CL_VPORT_ZOFFSET_0_offset                      = 24,
+    SPI_VS_OUT_ID_0                                       = 0x00028614,
+	SPI_VS_OUT_ID_0_num                               = 10,
+	SEMANTIC_0_mask                                   = 0xff << 0,
+	SEMANTIC_0_shift                                  = 0,
+	SEMANTIC_1_mask                                   = 0xff << 8,
+	SEMANTIC_1_shift                                  = 8,
+	SEMANTIC_2_mask                                   = 0xff << 16,
+	SEMANTIC_2_shift                                  = 16,
+	SEMANTIC_3_mask                                   = 0xff << 24,
+	SEMANTIC_3_shift                                  = 24,
+    SPI_PS_INPUT_CNTL_0                                   = 0x00028644,
+	SPI_PS_INPUT_CNTL_0_num                           = 32,
+	SEMANTIC_mask                                     = 0xff << 0,
+	SEMANTIC_shift                                    = 0,
+	DEFAULT_VAL_mask                                  = 0x03 << 8,
+	DEFAULT_VAL_shift                                 = 8,
+	    X_0_0F                                        = 0x00,
+	FLAT_SHADE_bit                                    = 1 << 10,
+	SEL_CENTROID_bit                                  = 1 << 11,
+	SEL_LINEAR_bit                                    = 1 << 12,
+	CYL_WRAP_mask                                     = 0x0f << 13,
+	CYL_WRAP_shift                                    = 13,
+	PT_SPRITE_TEX_bit                                 = 1 << 17,
+	SEL_SAMPLE_bit                                    = 1 << 18,
+    SPI_VS_OUT_CONFIG                                     = 0x000286c4,
+	VS_PER_COMPONENT_bit                              = 1 << 0,
+	VS_EXPORT_COUNT_mask                              = 0x1f << 1,
+	VS_EXPORT_COUNT_shift                             = 1,
+	VS_EXPORTS_FOG_bit                                = 1 << 8,
+	VS_OUT_FOG_VEC_ADDR_mask                          = 0x1f << 9,
+	VS_OUT_FOG_VEC_ADDR_shift                         = 9,
+    SPI_PS_IN_CONTROL_0                                   = 0x000286cc,
+	NUM_INTERP_mask                                   = 0x3f << 0,
+	NUM_INTERP_shift                                  = 0,
+	POSITION_ENA_bit                                  = 1 << 8,
+	POSITION_CENTROID_bit                             = 1 << 9,
+	POSITION_ADDR_mask                                = 0x1f << 10,
+	POSITION_ADDR_shift                               = 10,
+	PARAM_GEN_mask                                    = 0x0f << 15,
+	PARAM_GEN_shift                                   = 15,
+	PARAM_GEN_ADDR_mask                               = 0x7f << 19,
+	PARAM_GEN_ADDR_shift                              = 19,
+	BARYC_SAMPLE_CNTL_mask                            = 0x03 << 26,
+	BARYC_SAMPLE_CNTL_shift                           = 26,
+	    CENTROIDS_ONLY                                = 0x00,
+	    CENTERS_ONLY                                  = 0x01,
+	    CENTROIDS_AND_CENTERS                         = 0x02,
+	    UNDEF                                         = 0x03,
+	PERSP_GRADIENT_ENA_bit                            = 1 << 28,
+	LINEAR_GRADIENT_ENA_bit                           = 1 << 29,
+	POSITION_SAMPLE_bit                               = 1 << 30,
+	BARYC_AT_SAMPLE_ENA_bit                           = 1 << 31,
+    SPI_PS_IN_CONTROL_1                                   = 0x000286d0,
+	GEN_INDEX_PIX_bit                                 = 1 << 0,
+	GEN_INDEX_PIX_ADDR_mask                           = 0x7f << 1,
+	GEN_INDEX_PIX_ADDR_shift                          = 1,
+	FRONT_FACE_ENA_bit                                = 1 << 8,
+	FRONT_FACE_CHAN_mask                              = 0x03 << 9,
+	FRONT_FACE_CHAN_shift                             = 9,
+	FRONT_FACE_ALL_BITS_bit                           = 1 << 11,
+	FRONT_FACE_ADDR_mask                              = 0x1f << 12,
+	FRONT_FACE_ADDR_shift                             = 12,
+	FOG_ADDR_mask                                     = 0x7f << 17,
+	FOG_ADDR_shift                                    = 17,
+	FIXED_PT_POSITION_ENA_bit                         = 1 << 24,
+	FIXED_PT_POSITION_ADDR_mask                       = 0x1f << 25,
+	FIXED_PT_POSITION_ADDR_shift                      = 25,
+    SPI_INTERP_CONTROL_0                                  = 0x000286d4,
+	FLAT_SHADE_ENA_bit                                = 1 << 0,
+	PNT_SPRITE_ENA_bit                                = 1 << 1,
+	PNT_SPRITE_OVRD_X_mask                            = 0x07 << 2,
+	PNT_SPRITE_OVRD_X_shift                           = 2,
+	    SPI_PNT_SPRITE_SEL_0                          = 0x00,
+	    SPI_PNT_SPRITE_SEL_1                          = 0x01,
+	    SPI_PNT_SPRITE_SEL_S                          = 0x02,
+	    SPI_PNT_SPRITE_SEL_T                          = 0x03,
+	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04,
+	PNT_SPRITE_OVRD_Y_mask                            = 0x07 << 5,
+	PNT_SPRITE_OVRD_Y_shift                           = 5,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_OVRD_Z_mask                            = 0x07 << 8,
+	PNT_SPRITE_OVRD_Z_shift                           = 8,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_OVRD_W_mask                            = 0x07 << 11,
+	PNT_SPRITE_OVRD_W_shift                           = 11,
+/* 	    SPI_PNT_SPRITE_SEL_0                          = 0x00, */
+/* 	    SPI_PNT_SPRITE_SEL_1                          = 0x01, */
+/* 	    SPI_PNT_SPRITE_SEL_S                          = 0x02, */
+/* 	    SPI_PNT_SPRITE_SEL_T                          = 0x03, */
+/* 	    SPI_PNT_SPRITE_SEL_NONE                       = 0x04, */
+	PNT_SPRITE_TOP_1_bit                              = 1 << 14,
+    SPI_INPUT_Z                                           = 0x000286d8,
+	PROVIDE_Z_TO_SPI_bit                              = 1 << 0,
+    SPI_FOG_CNTL                                          = 0x000286dc,
+	PASS_FOG_THROUGH_PS_bit                           = 1 << 0,
+	PIXEL_FOG_FUNC_mask                               = 0x03 << 1,
+	PIXEL_FOG_FUNC_shift                              = 1,
+	    SPI_FOG_NONE                                  = 0x00,
+	    SPI_FOG_EXP                                   = 0x01,
+	    SPI_FOG_EXP2                                  = 0x02,
+	    SPI_FOG_LINEAR                                = 0x03,
+	PIXEL_FOG_SRC_SEL_bit                             = 1 << 3,
+	VS_FOG_CLAMP_DISABLE_bit                          = 1 << 4,
+    SPI_FOG_FUNC_SCALE                                    = 0x000286e0,
+    SPI_FOG_FUNC_BIAS                                     = 0x000286e4,
+    CB_BLEND0_CONTROL                                     = 0x00028780,
+	CB_BLEND0_CONTROL_num                             = 8,
+	COLOR_SRCBLEND_mask                               = 0x1f << 0,
+	COLOR_SRCBLEND_shift                              = 0,
+	COLOR_COMB_FCN_mask                               = 0x07 << 5,
+	COLOR_COMB_FCN_shift                              = 5,
+	COLOR_DESTBLEND_mask                              = 0x1f << 8,
+	COLOR_DESTBLEND_shift                             = 8,
+	OPACITY_WEIGHT_bit                                = 1 << 13,
+	ALPHA_SRCBLEND_mask                               = 0x1f << 16,
+	ALPHA_SRCBLEND_shift                              = 16,
+	ALPHA_COMB_FCN_mask                               = 0x07 << 21,
+	ALPHA_COMB_FCN_shift                              = 21,
+	ALPHA_DESTBLEND_mask                              = 0x1f << 24,
+	ALPHA_DESTBLEND_shift                             = 24,
+	SEPARATE_ALPHA_BLEND_bit                          = 1 << 29,
+    VGT_DMA_BASE_HI                                       = 0x000287e4,
+	VGT_DMA_BASE_HI__BASE_ADDR_mask                   = 0xff << 0,
+	VGT_DMA_BASE_HI__BASE_ADDR_shift                  = 0,
+    VGT_DMA_BASE                                          = 0x000287e8,
+    VGT_DRAW_INITIATOR                                    = 0x000287f0,
+	SOURCE_SELECT_mask                                = 0x03 << 0,
+	SOURCE_SELECT_shift                               = 0,
+	    DI_SRC_SEL_DMA                                = 0x00,
+	    DI_SRC_SEL_IMMEDIATE                          = 0x01,
+	    DI_SRC_SEL_AUTO_INDEX                         = 0x02,
+	    DI_SRC_SEL_RESERVED                           = 0x03,
+	MAJOR_MODE_mask                                   = 0x03 << 2,
+	MAJOR_MODE_shift                                  = 2,
+	    DI_MAJOR_MODE_0                               = 0x00,
+	    DI_MAJOR_MODE_1                               = 0x01,
+	SPRITE_EN_bit                                     = 1 << 4,
+	NOT_EOP_bit                                       = 1 << 5,
+	USE_OPAQUE_bit                                    = 1 << 6,
+    VGT_IMMED_DATA                                        = 0x000287f4,
+    VGT_EVENT_ADDRESS_REG                                 = 0x000287f8,
+	ADDRESS_LOW_mask                                  = 0xfffffff << 0,
+	ADDRESS_LOW_shift                                 = 0,
+    DB_DEPTH_CONTROL                                      = 0x00028800,
+	STENCIL_ENABLE_bit                                = 1 << 0,
+	Z_ENABLE_bit                                      = 1 << 1,
+	Z_WRITE_ENABLE_bit                                = 1 << 2,
+	ZFUNC_mask                                        = 0x07 << 4,
+	ZFUNC_shift                                       = 4,
+	    FRAG_NEVER                                    = 0x00,
+	    FRAG_LESS                                     = 0x01,
+	    FRAG_EQUAL                                    = 0x02,
+	    FRAG_LEQUAL                                   = 0x03,
+	    FRAG_GREATER                                  = 0x04,
+	    FRAG_NOTEQUAL                                 = 0x05,
+	    FRAG_GEQUAL                                   = 0x06,
+	    FRAG_ALWAYS                                   = 0x07,
+	BACKFACE_ENABLE_bit                               = 1 << 7,
+	STENCILFUNC_mask                                  = 0x07 << 8,
+	STENCILFUNC_shift                                 = 8,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	STENCILFAIL_mask                                  = 0x07 << 11,
+	STENCILFAIL_shift                                 = 11,
+	    STENCIL_KEEP                                  = 0x00,
+	    STENCIL_ZERO                                  = 0x01,
+	    STENCIL_REPLACE                               = 0x02,
+	    STENCIL_INCR_CLAMP                            = 0x03,
+	    STENCIL_DECR_CLAMP                            = 0x04,
+	    STENCIL_INVERT                                = 0x05,
+	    STENCIL_INCR_WRAP                             = 0x06,
+	    STENCIL_DECR_WRAP                             = 0x07,
+	STENCILZPASS_mask                                 = 0x07 << 14,
+	STENCILZPASS_shift                                = 14,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZFAIL_mask                                 = 0x07 << 17,
+	STENCILZFAIL_shift                                = 17,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILFUNC_BF_mask                               = 0x07 << 20,
+	STENCILFUNC_BF_shift                              = 20,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	STENCILFAIL_BF_mask                               = 0x07 << 23,
+	STENCILFAIL_BF_shift                              = 23,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZPASS_BF_mask                              = 0x07 << 26,
+	STENCILZPASS_BF_shift                             = 26,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+	STENCILZFAIL_BF_mask                              = 0x07 << 29,
+	STENCILZFAIL_BF_shift                             = 29,
+/* 	    STENCIL_KEEP                                  = 0x00, */
+/* 	    STENCIL_ZERO                                  = 0x01, */
+/* 	    STENCIL_REPLACE                               = 0x02, */
+/* 	    STENCIL_INCR_CLAMP                            = 0x03, */
+/* 	    STENCIL_DECR_CLAMP                            = 0x04, */
+/* 	    STENCIL_INVERT                                = 0x05, */
+/* 	    STENCIL_INCR_WRAP                             = 0x06, */
+/* 	    STENCIL_DECR_WRAP                             = 0x07, */
+    CB_BLEND_CONTROL                                      = 0x00028804,
+/* 	COLOR_SRCBLEND_mask                               = 0x1f << 0, */
+/* 	COLOR_SRCBLEND_shift                              = 0, */
+	    BLEND_ZERO                                    = 0x00,
+	    BLEND_ONE                                     = 0x01,
+	    BLEND_SRC_COLOR                               = 0x02,
+	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03,
+	    BLEND_SRC_ALPHA                               = 0x04,
+	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05,
+	    BLEND_DST_ALPHA                               = 0x06,
+	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07,
+	    BLEND_DST_COLOR                               = 0x08,
+	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09,
+	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a,
+	    BLEND_BOTH_SRC_ALPHA                          = 0x0b,
+	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c,
+	    BLEND_CONSTANT_COLOR                          = 0x0d,
+	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e,
+	    BLEND_SRC1_COLOR                              = 0x0f,
+	    BLEND_INV_SRC1_COLOR                          = 0x10,
+	    BLEND_SRC1_ALPHA                              = 0x11,
+	    BLEND_INV_SRC1_ALPHA                          = 0x12,
+	    BLEND_CONSTANT_ALPHA                          = 0x13,
+	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14,
+/* 	COLOR_COMB_FCN_mask                               = 0x07 << 5, */
+/* 	COLOR_COMB_FCN_shift                              = 5, */
+	    COMB_DST_PLUS_SRC                             = 0x00,
+	    COMB_SRC_MINUS_DST                            = 0x01,
+	    COMB_MIN_DST_SRC                              = 0x02,
+	    COMB_MAX_DST_SRC                              = 0x03,
+	    COMB_DST_MINUS_SRC                            = 0x04,
+/* 	COLOR_DESTBLEND_mask                              = 0x1f << 8, */
+/* 	COLOR_DESTBLEND_shift                             = 8, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	OPACITY_WEIGHT_bit                                = 1 << 13, */
+/* 	ALPHA_SRCBLEND_mask                               = 0x1f << 16, */
+/* 	ALPHA_SRCBLEND_shift                              = 16, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	ALPHA_COMB_FCN_mask                               = 0x07 << 21, */
+/* 	ALPHA_COMB_FCN_shift                              = 21, */
+/* 	    COMB_DST_PLUS_SRC                             = 0x00, */
+/* 	    COMB_SRC_MINUS_DST                            = 0x01, */
+/* 	    COMB_MIN_DST_SRC                              = 0x02, */
+/* 	    COMB_MAX_DST_SRC                              = 0x03, */
+/* 	    COMB_DST_MINUS_SRC                            = 0x04, */
+/* 	ALPHA_DESTBLEND_mask                              = 0x1f << 24, */
+/* 	ALPHA_DESTBLEND_shift                             = 24, */
+/* 	    BLEND_ZERO                                    = 0x00, */
+/* 	    BLEND_ONE                                     = 0x01, */
+/* 	    BLEND_SRC_COLOR                               = 0x02, */
+/* 	    BLEND_ONE_MINUS_SRC_COLOR                     = 0x03, */
+/* 	    BLEND_SRC_ALPHA                               = 0x04, */
+/* 	    BLEND_ONE_MINUS_SRC_ALPHA                     = 0x05, */
+/* 	    BLEND_DST_ALPHA                               = 0x06, */
+/* 	    BLEND_ONE_MINUS_DST_ALPHA                     = 0x07, */
+/* 	    BLEND_DST_COLOR                               = 0x08, */
+/* 	    BLEND_ONE_MINUS_DST_COLOR                     = 0x09, */
+/* 	    BLEND_SRC_ALPHA_SATURATE                      = 0x0a, */
+/* 	    BLEND_BOTH_SRC_ALPHA                          = 0x0b, */
+/* 	    BLEND_BOTH_INV_SRC_ALPHA                      = 0x0c, */
+/* 	    BLEND_CONSTANT_COLOR                          = 0x0d, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_COLOR                = 0x0e, */
+/* 	    BLEND_SRC1_COLOR                              = 0x0f, */
+/* 	    BLEND_INV_SRC1_COLOR                          = 0x10, */
+/* 	    BLEND_SRC1_ALPHA                              = 0x11, */
+/* 	    BLEND_INV_SRC1_ALPHA                          = 0x12, */
+/* 	    BLEND_CONSTANT_ALPHA                          = 0x13, */
+/* 	    BLEND_ONE_MINUS_CONSTANT_ALPHA                = 0x14, */
+/* 	SEPARATE_ALPHA_BLEND_bit                          = 1 << 29, */
+    CB_COLOR_CONTROL                                      = 0x00028808,
+	FOG_ENABLE_bit                                    = 1 << 0,
+	MULTIWRITE_ENABLE_bit                             = 1 << 1,
+	DITHER_ENABLE_bit                                 = 1 << 2,
+	DEGAMMA_ENABLE_bit                                = 1 << 3,
+	SPECIAL_OP_mask                                   = 0x07 << 4,
+	SPECIAL_OP_shift                                  = 4,
+	    SPECIAL_NORMAL                                = 0x00,
+	    SPECIAL_DISABLE                               = 0x01,
+	    SPECIAL_FAST_CLEAR                            = 0x02,
+	    SPECIAL_FORCE_CLEAR                           = 0x03,
+	    SPECIAL_EXPAND_COLOR                          = 0x04,
+	    SPECIAL_EXPAND_TEXTURE                        = 0x05,
+	    SPECIAL_EXPAND_SAMPLES                        = 0x06,
+	    SPECIAL_RESOLVE_BOX                           = 0x07,
+	PER_MRT_BLEND_bit                                 = 1 << 7,
+	TARGET_BLEND_ENABLE_mask                          = 0xff << 8,
+	TARGET_BLEND_ENABLE_shift                         = 8,
+	ROP3_mask                                         = 0xff << 16,
+	ROP3_shift                                        = 16,
+    DB_SHADER_CONTROL                                     = 0x0002880c,
+	Z_EXPORT_ENABLE_bit                               = 1 << 0,
+	STENCIL_REF_EXPORT_ENABLE_bit                     = 1 << 1,
+	Z_ORDER_mask                                      = 0x03 << 4,
+	Z_ORDER_shift                                     = 4,
+	    LATE_Z                                        = 0x00,
+	    EARLY_Z_THEN_LATE_Z                           = 0x01,
+	    RE_Z                                          = 0x02,
+	    EARLY_Z_THEN_RE_Z                             = 0x03,
+	KILL_ENABLE_bit                                   = 1 << 6,
+	COVERAGE_TO_MASK_ENABLE_bit                       = 1 << 7,
+	MASK_EXPORT_ENABLE_bit                            = 1 << 8,
+	DUAL_EXPORT_ENABLE_bit                            = 1 << 9,
+	EXEC_ON_HIER_FAIL_bit                             = 1 << 10,
+	EXEC_ON_NOOP_bit                                  = 1 << 11,
+    PA_CL_CLIP_CNTL                                       = 0x00028810,
+	UCP_ENA_0_bit                                     = 1 << 0,
+	UCP_ENA_1_bit                                     = 1 << 1,
+	UCP_ENA_2_bit                                     = 1 << 2,
+	UCP_ENA_3_bit                                     = 1 << 3,
+	UCP_ENA_4_bit                                     = 1 << 4,
+	UCP_ENA_5_bit                                     = 1 << 5,
+	PS_UCP_Y_SCALE_NEG_bit                            = 1 << 13,
+	PS_UCP_MODE_mask                                  = 0x03 << 14,
+	PS_UCP_MODE_shift                                 = 14,
+	CLIP_DISABLE_bit                                  = 1 << 16,
+	UCP_CULL_ONLY_ENA_bit                             = 1 << 17,
+	BOUNDARY_EDGE_FLAG_ENA_bit                        = 1 << 18,
+	DX_CLIP_SPACE_DEF_bit                             = 1 << 19,
+	DIS_CLIP_ERR_DETECT_bit                           = 1 << 20,
+	VTX_KILL_OR_bit                                   = 1 << 21,
+	DX_LINEAR_ATTR_CLIP_ENA_bit                       = 1 << 24,
+	VTE_VPORT_PROVOKE_DISABLE_bit                     = 1 << 25,
+	ZCLIP_NEAR_DISABLE_bit                            = 1 << 26,
+	ZCLIP_FAR_DISABLE_bit                             = 1 << 27,
+    PA_SU_SC_MODE_CNTL                                    = 0x00028814,
+	CULL_FRONT_bit                                    = 1 << 0,
+	CULL_BACK_bit                                     = 1 << 1,
+	FACE_bit                                          = 1 << 2,
+	POLY_MODE_mask                                    = 0x03 << 3,
+	POLY_MODE_shift                                   = 3,
+	    X_DISABLE_POLY_MODE                           = 0x00,
+	    X_DUAL_MODE                                   = 0x01,
+	POLYMODE_FRONT_PTYPE_mask                         = 0x07 << 5,
+	POLYMODE_FRONT_PTYPE_shift                        = 5,
+	    X_DRAW_POINTS                                 = 0x00,
+	    X_DRAW_LINES                                  = 0x01,
+	    X_DRAW_TRIANGLES                              = 0x02,
+	POLYMODE_BACK_PTYPE_mask                          = 0x07 << 8,
+	POLYMODE_BACK_PTYPE_shift                         = 8,
+/* 	    X_DRAW_POINTS                                 = 0x00, */
+/* 	    X_DRAW_LINES                                  = 0x01, */
+/* 	    X_DRAW_TRIANGLES                              = 0x02, */
+	POLY_OFFSET_FRONT_ENABLE_bit                      = 1 << 11,
+	POLY_OFFSET_BACK_ENABLE_bit                       = 1 << 12,
+	POLY_OFFSET_PARA_ENABLE_bit                       = 1 << 13,
+	VTX_WINDOW_OFFSET_ENABLE_bit                      = 1 << 16,
+	PROVOKING_VTX_LAST_bit                            = 1 << 19,
+	PERSP_CORR_DIS_bit                                = 1 << 20,
+	MULTI_PRIM_IB_ENA_bit                             = 1 << 21,
+    PA_CL_VTE_CNTL                                        = 0x00028818,
+	VPORT_X_SCALE_ENA_bit                             = 1 << 0,
+	VPORT_X_OFFSET_ENA_bit                            = 1 << 1,
+	VPORT_Y_SCALE_ENA_bit                             = 1 << 2,
+	VPORT_Y_OFFSET_ENA_bit                            = 1 << 3,
+	VPORT_Z_SCALE_ENA_bit                             = 1 << 4,
+	VPORT_Z_OFFSET_ENA_bit                            = 1 << 5,
+	VTX_XY_FMT_bit                                    = 1 << 8,
+	VTX_Z_FMT_bit                                     = 1 << 9,
+	VTX_W0_FMT_bit                                    = 1 << 10,
+	PERFCOUNTER_REF_bit                               = 1 << 11,
+    PA_CL_VS_OUT_CNTL                                     = 0x0002881c,
+	CLIP_DIST_ENA_0_bit                               = 1 << 0,
+	CLIP_DIST_ENA_1_bit                               = 1 << 1,
+	CLIP_DIST_ENA_2_bit                               = 1 << 2,
+	CLIP_DIST_ENA_3_bit                               = 1 << 3,
+	CLIP_DIST_ENA_4_bit                               = 1 << 4,
+	CLIP_DIST_ENA_5_bit                               = 1 << 5,
+	CLIP_DIST_ENA_6_bit                               = 1 << 6,
+	CLIP_DIST_ENA_7_bit                               = 1 << 7,
+	CULL_DIST_ENA_0_bit                               = 1 << 8,
+	CULL_DIST_ENA_1_bit                               = 1 << 9,
+	CULL_DIST_ENA_2_bit                               = 1 << 10,
+	CULL_DIST_ENA_3_bit                               = 1 << 11,
+	CULL_DIST_ENA_4_bit                               = 1 << 12,
+	CULL_DIST_ENA_5_bit                               = 1 << 13,
+	CULL_DIST_ENA_6_bit                               = 1 << 14,
+	CULL_DIST_ENA_7_bit                               = 1 << 15,
+	USE_VTX_POINT_SIZE_bit                            = 1 << 16,
+	USE_VTX_EDGE_FLAG_bit                             = 1 << 17,
+	USE_VTX_RENDER_TARGET_INDX_bit                    = 1 << 18,
+	USE_VTX_VIEWPORT_INDX_bit                         = 1 << 19,
+	USE_VTX_KILL_FLAG_bit                             = 1 << 20,
+	VS_OUT_MISC_VEC_ENA_bit                           = 1 << 21,
+	VS_OUT_CCDIST0_VEC_ENA_bit                        = 1 << 22,
+	VS_OUT_CCDIST1_VEC_ENA_bit                        = 1 << 23,
+    PA_CL_NANINF_CNTL                                     = 0x00028820,
+	VTE_XY_INF_DISCARD_bit                            = 1 << 0,
+	VTE_Z_INF_DISCARD_bit                             = 1 << 1,
+	VTE_W_INF_DISCARD_bit                             = 1 << 2,
+	VTE_0XNANINF_IS_0_bit                             = 1 << 3,
+	VTE_XY_NAN_RETAIN_bit                             = 1 << 4,
+	VTE_Z_NAN_RETAIN_bit                              = 1 << 5,
+	VTE_W_NAN_RETAIN_bit                              = 1 << 6,
+	VTE_W_RECIP_NAN_IS_0_bit                          = 1 << 7,
+	VS_XY_NAN_TO_INF_bit                              = 1 << 8,
+	VS_XY_INF_RETAIN_bit                              = 1 << 9,
+	VS_Z_NAN_TO_INF_bit                               = 1 << 10,
+	VS_Z_INF_RETAIN_bit                               = 1 << 11,
+	VS_W_NAN_TO_INF_bit                               = 1 << 12,
+	VS_W_INF_RETAIN_bit                               = 1 << 13,
+	VS_CLIP_DIST_INF_DISCARD_bit                      = 1 << 14,
+	VTE_NO_OUTPUT_NEG_0_bit                           = 1 << 20,
+    SQ_PGM_START_PS                                       = 0x00028840,
+    SQ_PGM_RESOURCES_PS                                   = 0x00028850,
+	NUM_GPRS_mask                                     = 0xff << 0,
+	NUM_GPRS_shift                                    = 0,
+	STACK_SIZE_mask                                   = 0xff << 8,
+	STACK_SIZE_shift                                  = 8,
+	SQ_PGM_RESOURCES_PS__DX10_CLAMP_bit               = 1 << 21,
+	FETCH_CACHE_LINES_mask                            = 0x07 << 24,
+	FETCH_CACHE_LINES_shift                           = 24,
+	UNCACHED_FIRST_INST_bit                           = 1 << 28,
+	CLAMP_CONSTS_bit                                  = 1 << 31,
+    SQ_PGM_EXPORTS_PS                                     = 0x00028854,
+	EXPORT_MODE_mask                                  = 0x1f << 0,
+	EXPORT_MODE_shift                                 = 0,
+    SQ_PGM_START_VS                                       = 0x00028858,
+    SQ_PGM_RESOURCES_VS                                   = 0x00028868,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_VS__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_GS                                       = 0x0002886c,
+    SQ_PGM_RESOURCES_GS                                   = 0x0002887c,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_GS__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_ES                                       = 0x00028880,
+    SQ_PGM_RESOURCES_ES                                   = 0x00028890,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_ES__DX10_CLAMP_bit               = 1 << 21,
+/* 	FETCH_CACHE_LINES_mask                            = 0x07 << 24, */
+/* 	FETCH_CACHE_LINES_shift                           = 24, */
+/* 	UNCACHED_FIRST_INST_bit                           = 1 << 28, */
+    SQ_PGM_START_FS                                       = 0x00028894,
+    SQ_PGM_RESOURCES_FS                                   = 0x000288a4,
+/* 	NUM_GPRS_mask                                     = 0xff << 0, */
+/* 	NUM_GPRS_shift                                    = 0, */
+/* 	STACK_SIZE_mask                                   = 0xff << 8, */
+/* 	STACK_SIZE_shift                                  = 8, */
+	SQ_PGM_RESOURCES_FS__DX10_CLAMP_bit               = 1 << 21,
+    SQ_ESGS_RING_ITEMSIZE                                 = 0x000288a8,
+	ITEMSIZE_mask                                     = 0x7fff << 0,
+	ITEMSIZE_shift                                    = 0,
+    SQ_GSVS_RING_ITEMSIZE                                 = 0x000288ac,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_ESTMP_RING_ITEMSIZE                                = 0x000288b0,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_GSTMP_RING_ITEMSIZE                                = 0x000288b4,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_VSTMP_RING_ITEMSIZE                                = 0x000288b8,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_PSTMP_RING_ITEMSIZE                                = 0x000288bc,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_FBUF_RING_ITEMSIZE                                 = 0x000288c0,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_REDUC_RING_ITEMSIZE                                = 0x000288c4,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_GS_VERT_ITEMSIZE                                   = 0x000288c8,
+/* 	ITEMSIZE_mask                                     = 0x7fff << 0, */
+/* 	ITEMSIZE_shift                                    = 0, */
+    SQ_PGM_CF_OFFSET_PS                                   = 0x000288cc,
+	PGM_CF_OFFSET_mask                                = 0xfffff << 0,
+	PGM_CF_OFFSET_shift                               = 0,
+    SQ_PGM_CF_OFFSET_VS                                   = 0x000288d0,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_GS                                   = 0x000288d4,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_ES                                   = 0x000288d8,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_PGM_CF_OFFSET_FS                                   = 0x000288dc,
+/* 	PGM_CF_OFFSET_mask                                = 0xfffff << 0, */
+/* 	PGM_CF_OFFSET_shift                               = 0, */
+    SQ_VTX_SEMANTIC_CLEAR                                 = 0x000288e0,
+    SQ_ALU_CONST_CACHE_PS_0                               = 0x00028940,
+	SQ_ALU_CONST_CACHE_PS_0_num                       = 16,
+    SQ_ALU_CONST_CACHE_VS_0                               = 0x00028980,
+	SQ_ALU_CONST_CACHE_VS_0_num                       = 16,
+    SQ_ALU_CONST_CACHE_GS_0                               = 0x000289c0,
+	SQ_ALU_CONST_CACHE_GS_0_num                       = 16,
+    PA_SU_POINT_SIZE                                      = 0x00028a00,
+	PA_SU_POINT_SIZE__HEIGHT_mask                     = 0xffff << 0,
+	PA_SU_POINT_SIZE__HEIGHT_shift                    = 0,
+	PA_SU_POINT_SIZE__WIDTH_mask                      = 0xffff << 16,
+	PA_SU_POINT_SIZE__WIDTH_shift                     = 16,
+    PA_SU_POINT_MINMAX                                    = 0x00028a04,
+	MIN_SIZE_mask                                     = 0xffff << 0,
+	MIN_SIZE_shift                                    = 0,
+	MAX_SIZE_mask                                     = 0xffff << 16,
+	MAX_SIZE_shift                                    = 16,
+    PA_SU_LINE_CNTL                                       = 0x00028a08,
+	PA_SU_LINE_CNTL__WIDTH_mask                       = 0xffff << 0,
+	PA_SU_LINE_CNTL__WIDTH_shift                      = 0,
+    PA_SC_LINE_STIPPLE                                    = 0x00028a0c,
+	LINE_PATTERN_mask                                 = 0xffff << 0,
+	LINE_PATTERN_shift                                = 0,
+	REPEAT_COUNT_mask                                 = 0xff << 16,
+	REPEAT_COUNT_shift                                = 16,
+	PATTERN_BIT_ORDER_bit                             = 1 << 28,
+	AUTO_RESET_CNTL_mask                              = 0x03 << 29,
+	AUTO_RESET_CNTL_shift                             = 29,
+    VGT_OUTPUT_PATH_CNTL                                  = 0x00028a10,
+	PATH_SELECT_mask                                  = 0x03 << 0,
+	PATH_SELECT_shift                                 = 0,
+	    VGT_OUTPATH_VTX_REUSE                         = 0x00,
+	    VGT_OUTPATH_TESS_EN                           = 0x01,
+	    VGT_OUTPATH_PASSTHRU                          = 0x02,
+	    VGT_OUTPATH_GS_BLOCK                          = 0x03,
+    VGT_HOS_CNTL                                          = 0x00028a14,
+	TESS_MODE_mask                                    = 0x03 << 0,
+	TESS_MODE_shift                                   = 0,
+    VGT_HOS_MAX_TESS_LEVEL                                = 0x00028a18,
+    VGT_HOS_MIN_TESS_LEVEL                                = 0x00028a1c,
+    VGT_HOS_REUSE_DEPTH                                   = 0x00028a20,
+	REUSE_DEPTH_mask                                  = 0xff << 0,
+	REUSE_DEPTH_shift                                 = 0,
+    VGT_GROUP_PRIM_TYPE                                   = 0x00028a24,
+	VGT_GROUP_PRIM_TYPE__PRIM_TYPE_mask               = 0x1f << 0,
+	VGT_GROUP_PRIM_TYPE__PRIM_TYPE_shift              = 0,
+	    VGT_GRP_3D_POINT                              = 0x00,
+	    VGT_GRP_3D_LINE                               = 0x01,
+	    VGT_GRP_3D_TRI                                = 0x02,
+	    VGT_GRP_3D_RECT                               = 0x03,
+	    VGT_GRP_3D_QUAD                               = 0x04,
+	    VGT_GRP_2D_COPY_RECT_V0                       = 0x05,
+	    VGT_GRP_2D_COPY_RECT_V1                       = 0x06,
+	    VGT_GRP_2D_COPY_RECT_V2                       = 0x07,
+	    VGT_GRP_2D_COPY_RECT_V3                       = 0x08,
+	    VGT_GRP_2D_FILL_RECT                          = 0x09,
+	    VGT_GRP_2D_LINE                               = 0x0a,
+	    VGT_GRP_2D_TRI                                = 0x0b,
+	    VGT_GRP_PRIM_INDEX_LINE                       = 0x0c,
+	    VGT_GRP_PRIM_INDEX_TRI                        = 0x0d,
+	    VGT_GRP_PRIM_INDEX_QUAD                       = 0x0e,
+	    VGT_GRP_3D_LINE_ADJ                           = 0x0f,
+	    VGT_GRP_3D_TRI_ADJ                            = 0x10,
+	RETAIN_ORDER_bit                                  = 1 << 14,
+	RETAIN_QUADS_bit                                  = 1 << 15,
+	PRIM_ORDER_mask                                   = 0x07 << 16,
+	PRIM_ORDER_shift                                  = 16,
+	    VGT_GRP_LIST                                  = 0x00,
+	    VGT_GRP_STRIP                                 = 0x01,
+	    VGT_GRP_FAN                                   = 0x02,
+	    VGT_GRP_LOOP                                  = 0x03,
+	    VGT_GRP_POLYGON                               = 0x04,
+    VGT_GROUP_FIRST_DECR                                  = 0x00028a28,
+	FIRST_DECR_mask                                   = 0x0f << 0,
+	FIRST_DECR_shift                                  = 0,
+    VGT_GROUP_DECR                                        = 0x00028a2c,
+	DECR_mask                                         = 0x0f << 0,
+	DECR_shift                                        = 0,
+    VGT_GROUP_VECT_0_CNTL                                 = 0x00028a30,
+	COMP_X_EN_bit                                     = 1 << 0,
+	COMP_Y_EN_bit                                     = 1 << 1,
+	COMP_Z_EN_bit                                     = 1 << 2,
+	COMP_W_EN_bit                                     = 1 << 3,
+	VGT_GROUP_VECT_0_CNTL__STRIDE_mask                = 0xff << 8,
+	VGT_GROUP_VECT_0_CNTL__STRIDE_shift               = 8,
+	SHIFT_mask                                        = 0xff << 16,
+	SHIFT_shift                                       = 16,
+    VGT_GROUP_VECT_1_CNTL                                 = 0x00028a34,
+/* 	COMP_X_EN_bit                                     = 1 << 0, */
+/* 	COMP_Y_EN_bit                                     = 1 << 1, */
+/* 	COMP_Z_EN_bit                                     = 1 << 2, */
+/* 	COMP_W_EN_bit                                     = 1 << 3, */
+	VGT_GROUP_VECT_1_CNTL__STRIDE_mask                = 0xff << 8,
+	VGT_GROUP_VECT_1_CNTL__STRIDE_shift               = 8,
+/* 	SHIFT_mask                                        = 0xff << 16, */
+/* 	SHIFT_shift                                       = 16, */
+    VGT_GROUP_VECT_0_FMT_CNTL                             = 0x00028a38,
+	X_CONV_mask                                       = 0x0f << 0,
+	X_CONV_shift                                      = 0,
+	    VGT_GRP_INDEX_16                              = 0x00,
+	    VGT_GRP_INDEX_32                              = 0x01,
+	    VGT_GRP_UINT_16                               = 0x02,
+	    VGT_GRP_UINT_32                               = 0x03,
+	    VGT_GRP_SINT_16                               = 0x04,
+	    VGT_GRP_SINT_32                               = 0x05,
+	    VGT_GRP_FLOAT_32                              = 0x06,
+	    VGT_GRP_AUTO_PRIM                             = 0x07,
+	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08,
+	X_OFFSET_mask                                     = 0x0f << 4,
+	X_OFFSET_shift                                    = 4,
+	Y_CONV_mask                                       = 0x0f << 8,
+	Y_CONV_shift                                      = 8,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	Y_OFFSET_mask                                     = 0x0f << 12,
+	Y_OFFSET_shift                                    = 12,
+	Z_CONV_mask                                       = 0x0f << 16,
+	Z_CONV_shift                                      = 16,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	Z_OFFSET_mask                                     = 0x0f << 20,
+	Z_OFFSET_shift                                    = 20,
+	W_CONV_mask                                       = 0x0f << 24,
+	W_CONV_shift                                      = 24,
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+	W_OFFSET_mask                                     = 0x0f << 28,
+	W_OFFSET_shift                                    = 28,
+    VGT_GROUP_VECT_1_FMT_CNTL                             = 0x00028a3c,
+/* 	X_CONV_mask                                       = 0x0f << 0, */
+/* 	X_CONV_shift                                      = 0, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	X_OFFSET_mask                                     = 0x0f << 4, */
+/* 	X_OFFSET_shift                                    = 4, */
+/* 	Y_CONV_mask                                       = 0x0f << 8, */
+/* 	Y_CONV_shift                                      = 8, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	Y_OFFSET_mask                                     = 0x0f << 12, */
+/* 	Y_OFFSET_shift                                    = 12, */
+/* 	Z_CONV_mask                                       = 0x0f << 16, */
+/* 	Z_CONV_shift                                      = 16, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	Z_OFFSET_mask                                     = 0x0f << 20, */
+/* 	Z_OFFSET_shift                                    = 20, */
+/* 	W_CONV_mask                                       = 0x0f << 24, */
+/* 	W_CONV_shift                                      = 24, */
+/* 	    VGT_GRP_INDEX_16                              = 0x00, */
+/* 	    VGT_GRP_INDEX_32                              = 0x01, */
+/* 	    VGT_GRP_UINT_16                               = 0x02, */
+/* 	    VGT_GRP_UINT_32                               = 0x03, */
+/* 	    VGT_GRP_SINT_16                               = 0x04, */
+/* 	    VGT_GRP_SINT_32                               = 0x05, */
+/* 	    VGT_GRP_FLOAT_32                              = 0x06, */
+/* 	    VGT_GRP_AUTO_PRIM                             = 0x07, */
+/* 	    VGT_GRP_FIX_1_23_TO_FLOAT                     = 0x08, */
+/* 	W_OFFSET_mask                                     = 0x0f << 28, */
+/* 	W_OFFSET_shift                                    = 28, */
+    VGT_GS_MODE                                           = 0x00028a40,
+	MODE_mask                                         = 0x03 << 0,
+	MODE_shift                                        = 0,
+	    GS_OFF                                        = 0x00,
+	    GS_SCENARIO_A                                 = 0x01,
+	    GS_SCENARIO_B                                 = 0x02,
+	    GS_SCENARIO_G                                 = 0x03,
+	ES_PASSTHRU_bit                                   = 1 << 2,
+	CUT_MODE_mask                                     = 0x03 << 3,
+	CUT_MODE_shift                                    = 3,
+	    GS_CUT_1024                                   = 0x00,
+	    GS_CUT_512                                    = 0x01,
+	    GS_CUT_256                                    = 0x02,
+	    GS_CUT_128                                    = 0x03,
+    PA_SC_MPASS_PS_CNTL                                   = 0x00028a48,
+	MPASS_PIX_VEC_PER_PASS_mask                       = 0xfffff << 0,
+	MPASS_PIX_VEC_PER_PASS_shift                      = 0,
+	MPASS_PS_ENA_bit                                  = 1 << 31,
+    PA_SC_MODE_CNTL                                       = 0x00028a4c,
+	MSAA_ENABLE_bit                                   = 1 << 0,
+	CLIPRECT_ENABLE_bit                               = 1 << 1,
+	LINE_STIPPLE_ENABLE_bit                           = 1 << 2,
+	MULTI_CHIP_PRIM_DISCARD_ENAB_bit                  = 1 << 3,
+	WALK_ORDER_ENABLE_bit                             = 1 << 4,
+	HALVE_DETAIL_SAMPLE_PERF_bit                      = 1 << 5,
+	WALK_SIZE_bit                                     = 1 << 6,
+	WALK_ALIGNMENT_bit                                = 1 << 7,
+	WALK_ALIGN8_PRIM_FITS_ST_bit                      = 1 << 8,
+	TILE_COVER_NO_SCISSOR_bit                         = 1 << 9,
+	KILL_PIX_POST_HI_Z_bit                            = 1 << 10,
+	KILL_PIX_POST_DETAIL_MASK_bit                     = 1 << 11,
+	MULTI_CHIP_SUPERTILE_ENABLE_bit                   = 1 << 12,
+	TILE_COVER_DISABLE_bit                            = 1 << 13,
+	FORCE_EOV_CNTDWN_ENABLE_bit                       = 1 << 14,
+	FORCE_EOV_TILE_ENABLE_bit                         = 1 << 15,
+	FORCE_EOV_REZ_ENABLE_bit                          = 1 << 16,
+	PS_ITER_SAMPLE_bit                                = 1 << 17,
+    VGT_ENHANCE                                           = 0x00028a50,
+	VGT_ENHANCE__MI_TIMESTAMP_RES_mask                = 0x03 << 0,
+	VGT_ENHANCE__MI_TIMESTAMP_RES_shift               = 0,
+	    X_0_992_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_32   = 0x00,
+	    X_0_496_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_16   = 0x01,
+	    X_0_248_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_8    = 0x02,
+	    X_0_124_CLOCKS_LATENCY_RANGE_IN_STEPS_OF_4    = 0x03,
+	MISC_mask                                         = 0x3fffffff << 2,
+	MISC_shift                                        = 2,
+    VGT_GS_OUT_PRIM_TYPE                                  = 0x00028a6c,
+	OUTPRIM_TYPE_mask                                 = 0x3f << 0,
+	OUTPRIM_TYPE_shift                                = 0,
+	    POINTLIST                                     = 0x00,
+	    LINESTRIP                                     = 0x01,
+	    TRISTRIP                                      = 0x02,
+    VGT_DMA_SIZE                                          = 0x00028a74,
+    VGT_DMA_INDEX_TYPE                                    = 0x00028a7c,
+/* 	INDEX_TYPE_mask                                   = 0x03 << 0, */
+/* 	INDEX_TYPE_shift                                  = 0, */
+	    VGT_INDEX_16                                  = 0x00,
+	    VGT_INDEX_32                                  = 0x01,
+	SWAP_MODE_mask                                    = 0x03 << 2,
+	SWAP_MODE_shift                                   = 2,
+	    VGT_DMA_SWAP_NONE                             = 0x00,
+	    VGT_DMA_SWAP_16_BIT                           = 0x01,
+	    VGT_DMA_SWAP_32_BIT                           = 0x02,
+	    VGT_DMA_SWAP_WORD                             = 0x03,
+    VGT_PRIMITIVEID_EN                                    = 0x00028a84,
+	PRIMITIVEID_EN_bit                                = 1 << 0,
+    VGT_DMA_NUM_INSTANCES                                 = 0x00028a88,
+    VGT_EVENT_INITIATOR                                   = 0x00028a90,
+	EVENT_TYPE_mask                                   = 0x3f << 0,
+	EVENT_TYPE_shift                                  = 0,
+	    CACHE_FLUSH_TS                                = 0x04,
+	    CONTEXT_DONE                                  = 0x05,
+	    CACHE_FLUSH                                   = 0x06,
+	    VIZQUERY_START                                = 0x07,
+	    VIZQUERY_END                                  = 0x08,
+	    SC_WAIT_WC                                    = 0x09,
+	    MPASS_PS_CP_REFETCH                           = 0x0a,
+	    MPASS_PS_RST_START                            = 0x0b,
+	    MPASS_PS_INCR_START                           = 0x0c,
+	    RST_PIX_CNT                                   = 0x0d,
+	    RST_VTX_CNT                                   = 0x0e,
+	    VS_PARTIAL_FLUSH                              = 0x0f,
+	    PS_PARTIAL_FLUSH                              = 0x10,
+	    CACHE_FLUSH_AND_INV_TS_EVENT                  = 0x14,
+	    ZPASS_DONE                                    = 0x15,
+	    CACHE_FLUSH_AND_INV_EVENT                     = 0x16,
+	    PERFCOUNTER_START                             = 0x17,
+	    PERFCOUNTER_STOP                              = 0x18,
+	    PIPELINESTAT_START                            = 0x19,
+	    PIPELINESTAT_STOP                             = 0x1a,
+	    PERFCOUNTER_SAMPLE                            = 0x1b,
+	    FLUSH_ES_OUTPUT                               = 0x1c,
+	    FLUSH_GS_OUTPUT                               = 0x1d,
+	    SAMPLE_PIPELINESTAT                           = 0x1e,
+	    SO_VGTSTREAMOUT_FLUSH                         = 0x1f,
+	    SAMPLE_STREAMOUTSTATS                         = 0x20,
+	    RESET_VTX_CNT                                 = 0x21,
+	    BLOCK_CONTEXT_DONE                            = 0x22,
+	    CR_CONTEXT_DONE                               = 0x23,
+	    VGT_FLUSH                                     = 0x24,
+	    CR_DONE_TS                                    = 0x25,
+	    SQ_NON_EVENT                                  = 0x26,
+	    SC_SEND_DB_VPZ                                = 0x27,
+	    BOTTOM_OF_PIPE_TS                             = 0x28,
+	    DB_CACHE_FLUSH_AND_INV                        = 0x2a,
+	ADDRESS_HI_mask                                   = 0xff << 19,
+	ADDRESS_HI_shift                                  = 19,
+	EXTENDED_EVENT_bit                                = 1 << 27,
+    VGT_MULTI_PRIM_IB_RESET_EN                            = 0x00028a94,
+	RESET_EN_bit                                      = 1 << 0,
+    VGT_INSTANCE_STEP_RATE_0                              = 0x00028aa0,
+    VGT_INSTANCE_STEP_RATE_1                              = 0x00028aa4,
+    VGT_STRMOUT_EN                                        = 0x00028ab0,
+	STREAMOUT_bit                                     = 1 << 0,
+    VGT_REUSE_OFF                                         = 0x00028ab4,
+	REUSE_OFF_bit                                     = 1 << 0,
+    VGT_VTX_CNT_EN                                        = 0x00028ab8,
+	VTX_CNT_EN_bit                                    = 1 << 0,
+    VGT_STRMOUT_BUFFER_SIZE_0                             = 0x00028ad0,
+    VGT_STRMOUT_VTX_STRIDE_0                              = 0x00028ad4,
+	VGT_STRMOUT_VTX_STRIDE_0__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_0__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_0                             = 0x00028ad8,
+    VGT_STRMOUT_BUFFER_OFFSET_0                           = 0x00028adc,
+    VGT_STRMOUT_BUFFER_SIZE_1                             = 0x00028ae0,
+    VGT_STRMOUT_VTX_STRIDE_1                              = 0x00028ae4,
+	VGT_STRMOUT_VTX_STRIDE_1__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_1__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_1                             = 0x00028ae8,
+    VGT_STRMOUT_BUFFER_OFFSET_1                           = 0x00028aec,
+    VGT_STRMOUT_BUFFER_SIZE_2                             = 0x00028af0,
+    VGT_STRMOUT_VTX_STRIDE_2                              = 0x00028af4,
+	VGT_STRMOUT_VTX_STRIDE_2__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_2__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_2                             = 0x00028af8,
+    VGT_STRMOUT_BUFFER_OFFSET_2                           = 0x00028afc,
+    VGT_STRMOUT_BUFFER_SIZE_3                             = 0x00028b00,
+    VGT_STRMOUT_VTX_STRIDE_3                              = 0x00028b04,
+	VGT_STRMOUT_VTX_STRIDE_3__STRIDE_mask             = 0x3ff << 0,
+	VGT_STRMOUT_VTX_STRIDE_3__STRIDE_shift            = 0,
+    VGT_STRMOUT_BUFFER_BASE_3                             = 0x00028b08,
+    VGT_STRMOUT_BUFFER_OFFSET_3                           = 0x00028b0c,
+    VGT_STRMOUT_BASE_OFFSET_0                             = 0x00028b10,
+    VGT_STRMOUT_BASE_OFFSET_1                             = 0x00028b14,
+    VGT_STRMOUT_BASE_OFFSET_2                             = 0x00028b18,
+    VGT_STRMOUT_BASE_OFFSET_3                             = 0x00028b1c,
+    VGT_STRMOUT_BUFFER_EN                                 = 0x00028b20,
+	BUFFER_0_EN_bit                                   = 1 << 0,
+	BUFFER_1_EN_bit                                   = 1 << 1,
+	BUFFER_2_EN_bit                                   = 1 << 2,
+	BUFFER_3_EN_bit                                   = 1 << 3,
+    VGT_STRMOUT_DRAW_OPAQUE_OFFSET                        = 0x00028b28,
+    VGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE            = 0x00028b2c,
+    VGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE                 = 0x00028b30,
+    VGT_STRMOUT_BASE_OFFSET_HI_0                          = 0x00028b44,
+	VGT_STRMOUT_BASE_OFFSET_HI_0__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_0__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_1                          = 0x00028b48,
+	VGT_STRMOUT_BASE_OFFSET_HI_1__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_1__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_2                          = 0x00028b4c,
+	VGT_STRMOUT_BASE_OFFSET_HI_2__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_2__BASE_OFFSET_shift   = 0,
+    VGT_STRMOUT_BASE_OFFSET_HI_3                          = 0x00028b50,
+	VGT_STRMOUT_BASE_OFFSET_HI_3__BASE_OFFSET_mask    = 0x3f << 0,
+	VGT_STRMOUT_BASE_OFFSET_HI_3__BASE_OFFSET_shift   = 0,
+    PA_SC_LINE_CNTL                                       = 0x00028c00,
+	BRES_CNTL_mask                                    = 0xff << 0,
+	BRES_CNTL_shift                                   = 0,
+	USE_BRES_CNTL_bit                                 = 1 << 8,
+	EXPAND_LINE_WIDTH_bit                             = 1 << 9,
+	LAST_PIXEL_bit                                    = 1 << 10,
+    PA_SC_AA_CONFIG                                       = 0x00028c04,
+	MSAA_NUM_SAMPLES_mask                             = 0x03 << 0,
+	MSAA_NUM_SAMPLES_shift                            = 0,
+	AA_MASK_CENTROID_DTMN_bit                         = 1 << 4,
+	MAX_SAMPLE_DIST_mask                              = 0x0f << 13,
+	MAX_SAMPLE_DIST_shift                             = 13,
+    PA_SU_VTX_CNTL                                        = 0x00028c08,
+	PIX_CENTER_bit                                    = 1 << 0,
+	PA_SU_VTX_CNTL__ROUND_MODE_mask                   = 0x03 << 1,
+	PA_SU_VTX_CNTL__ROUND_MODE_shift                  = 1,
+	    X_TRUNCATE                                    = 0x00,
+	    X_ROUND                                       = 0x01,
+	    X_ROUND_TO_EVEN                               = 0x02,
+	    X_ROUND_TO_ODD                                = 0x03,
+	QUANT_MODE_mask                                   = 0x07 << 3,
+	QUANT_MODE_shift                                  = 3,
+	    X_1_16TH                                      = 0x00,
+	    X_1_8TH                                       = 0x01,
+	    X_1_4TH                                       = 0x02,
+	    X_1_2                                         = 0x03,
+	    X_1                                           = 0x04,
+	    X_1_256TH                                     = 0x05,
+    PA_CL_GB_VERT_CLIP_ADJ                                = 0x00028c0c,
+    PA_CL_GB_VERT_DISC_ADJ                                = 0x00028c10,
+    PA_CL_GB_HORZ_CLIP_ADJ                                = 0x00028c14,
+    PA_CL_GB_HORZ_DISC_ADJ                                = 0x00028c18,
+    PA_SC_AA_SAMPLE_LOCS_MCTX                             = 0x00028c1c,
+/* 	S0_X_mask                                         = 0x0f << 0, */
+/* 	S0_X_shift                                        = 0, */
+/* 	S0_Y_mask                                         = 0x0f << 4, */
+/* 	S0_Y_shift                                        = 4, */
+/* 	S1_X_mask                                         = 0x0f << 8, */
+/* 	S1_X_shift                                        = 8, */
+/* 	S1_Y_mask                                         = 0x0f << 12, */
+/* 	S1_Y_shift                                        = 12, */
+/* 	S2_X_mask                                         = 0x0f << 16, */
+/* 	S2_X_shift                                        = 16, */
+/* 	S2_Y_mask                                         = 0x0f << 20, */
+/* 	S2_Y_shift                                        = 20, */
+/* 	S3_X_mask                                         = 0x0f << 24, */
+/* 	S3_X_shift                                        = 24, */
+/* 	S3_Y_mask                                         = 0x0f << 28, */
+/* 	S3_Y_shift                                        = 28, */
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX                      = 0x00028c20,
+/* 	S4_X_mask                                         = 0x0f << 0, */
+/* 	S4_X_shift                                        = 0, */
+/* 	S4_Y_mask                                         = 0x0f << 4, */
+/* 	S4_Y_shift                                        = 4, */
+/* 	S5_X_mask                                         = 0x0f << 8, */
+/* 	S5_X_shift                                        = 8, */
+/* 	S5_Y_mask                                         = 0x0f << 12, */
+/* 	S5_Y_shift                                        = 12, */
+/* 	S6_X_mask                                         = 0x0f << 16, */
+/* 	S6_X_shift                                        = 16, */
+/* 	S6_Y_mask                                         = 0x0f << 20, */
+/* 	S6_Y_shift                                        = 20, */
+/* 	S7_X_mask                                         = 0x0f << 24, */
+/* 	S7_X_shift                                        = 24, */
+/* 	S7_Y_mask                                         = 0x0f << 28, */
+/* 	S7_Y_shift                                        = 28, */
+    CB_CLRCMP_CONTROL                                     = 0x00028c30,
+	CLRCMP_FCN_SRC_mask                               = 0x07 << 0,
+	CLRCMP_FCN_SRC_shift                              = 0,
+	    CLRCMP_DRAW_ALWAYS                            = 0x00,
+	    CLRCMP_DRAW_NEVER                             = 0x01,
+	    CLRCMP_DRAW_ON_NEQ                            = 0x04,
+	    CLRCMP_DRAW_ON_EQ                             = 0x05,
+	CLRCMP_FCN_DST_mask                               = 0x07 << 8,
+	CLRCMP_FCN_DST_shift                              = 8,
+/* 	    CLRCMP_DRAW_ALWAYS                            = 0x00, */
+/* 	    CLRCMP_DRAW_NEVER                             = 0x01, */
+/* 	    CLRCMP_DRAW_ON_NEQ                            = 0x04, */
+/* 	    CLRCMP_DRAW_ON_EQ                             = 0x05, */
+	CLRCMP_FCN_SEL_mask                               = 0x03 << 24,
+	CLRCMP_FCN_SEL_shift                              = 24,
+	    CLRCMP_SEL_DST                                = 0x00,
+	    CLRCMP_SEL_SRC                                = 0x01,
+	    CLRCMP_SEL_AND                                = 0x02,
+    CB_CLRCMP_SRC                                         = 0x00028c34,
+    CB_CLRCMP_DST                                         = 0x00028c38,
+    CB_CLRCMP_MSK                                         = 0x00028c3c,
+    PA_SC_AA_MASK                                         = 0x00028c48,
+    VGT_VERTEX_REUSE_BLOCK_CNTL                           = 0x00028c58,
+	VTX_REUSE_DEPTH_mask                              = 0xff << 0,
+	VTX_REUSE_DEPTH_shift                             = 0,
+    VGT_OUT_DEALLOC_CNTL                                  = 0x00028c5c,
+	DEALLOC_DIST_mask                                 = 0x7f << 0,
+	DEALLOC_DIST_shift                                = 0,
+    DB_RENDER_CONTROL                                     = 0x00028d0c,
+	DEPTH_CLEAR_ENABLE_bit                            = 1 << 0,
+	STENCIL_CLEAR_ENABLE_bit                          = 1 << 1,
+	DEPTH_COPY_bit                                    = 1 << 2,
+	STENCIL_COPY_bit                                  = 1 << 3,
+	RESUMMARIZE_ENABLE_bit                            = 1 << 4,
+	STENCIL_COMPRESS_DISABLE_bit                      = 1 << 5,
+	DEPTH_COMPRESS_DISABLE_bit                        = 1 << 6,
+	COPY_CENTROID_bit                                 = 1 << 7,
+	COPY_SAMPLE_mask                                  = 0x07 << 8,
+	COPY_SAMPLE_shift                                 = 8,
+	ZPASS_INCREMENT_DISABLE_bit                       = 1 << 11,
+    DB_RENDER_OVERRIDE                                    = 0x00028d10,
+	FORCE_HIZ_ENABLE_mask                             = 0x03 << 0,
+	FORCE_HIZ_ENABLE_shift                            = 0,
+	    FORCE_OFF                                     = 0x00,
+	    FORCE_ENABLE                                  = 0x01,
+	    FORCE_DISABLE                                 = 0x02,
+	    FORCE_RESERVED                                = 0x03,
+	FORCE_HIS_ENABLE0_mask                            = 0x03 << 2,
+	FORCE_HIS_ENABLE0_shift                           = 2,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_HIS_ENABLE1_mask                            = 0x03 << 4,
+	FORCE_HIS_ENABLE1_shift                           = 4,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_SHADER_Z_ORDER_bit                          = 1 << 6,
+	FAST_Z_DISABLE_bit                                = 1 << 7,
+	FAST_STENCIL_DISABLE_bit                          = 1 << 8,
+	NOOP_CULL_DISABLE_bit                             = 1 << 9,
+	FORCE_COLOR_KILL_bit                              = 1 << 10,
+	FORCE_Z_READ_bit                                  = 1 << 11,
+	FORCE_STENCIL_READ_bit                            = 1 << 12,
+	FORCE_FULL_Z_RANGE_mask                           = 0x03 << 13,
+	FORCE_FULL_Z_RANGE_shift                          = 13,
+/* 	    FORCE_OFF                                     = 0x00, */
+/* 	    FORCE_ENABLE                                  = 0x01, */
+/* 	    FORCE_DISABLE                                 = 0x02, */
+/* 	    FORCE_RESERVED                                = 0x03, */
+	FORCE_QC_SMASK_CONFLICT_bit                       = 1 << 15,
+	DISABLE_VIEWPORT_CLAMP_bit                        = 1 << 16,
+	IGNORE_SC_ZRANGE_bit                              = 1 << 17,
+    DB_HTILE_SURFACE                                      = 0x00028d24,
+	HTILE_WIDTH_bit                                   = 1 << 0,
+	HTILE_HEIGHT_bit                                  = 1 << 1,
+	LINEAR_bit                                        = 1 << 2,
+	FULL_CACHE_bit                                    = 1 << 3,
+	HTILE_USES_PRELOAD_WIN_bit                        = 1 << 4,
+	PRELOAD_bit                                       = 1 << 5,
+	PREFETCH_WIDTH_mask                               = 0x3f << 6,
+	PREFETCH_WIDTH_shift                              = 6,
+	PREFETCH_HEIGHT_mask                              = 0x3f << 12,
+	PREFETCH_HEIGHT_shift                             = 12,
+    DB_SRESULTS_COMPARE_STATE1                            = 0x00028d2c,
+	COMPAREFUNC1_mask                                 = 0x07 << 0,
+	COMPAREFUNC1_shift                                = 0,
+/* 	    REF_NEVER                                     = 0x00, */
+/* 	    REF_LESS                                      = 0x01, */
+/* 	    REF_EQUAL                                     = 0x02, */
+/* 	    REF_LEQUAL                                    = 0x03, */
+/* 	    REF_GREATER                                   = 0x04, */
+/* 	    REF_NOTEQUAL                                  = 0x05, */
+/* 	    REF_GEQUAL                                    = 0x06, */
+/* 	    REF_ALWAYS                                    = 0x07, */
+	COMPAREVALUE1_mask                                = 0xff << 4,
+	COMPAREVALUE1_shift                               = 4,
+	COMPAREMASK1_mask                                 = 0xff << 12,
+	COMPAREMASK1_shift                                = 12,
+	ENABLE1_bit                                       = 1 << 24,
+    DB_PRELOAD_CONTROL                                    = 0x00028d30,
+	START_X_mask                                      = 0xff << 0,
+	START_X_shift                                     = 0,
+	START_Y_mask                                      = 0xff << 8,
+	START_Y_shift                                     = 8,
+	MAX_X_mask                                        = 0xff << 16,
+	MAX_X_shift                                       = 16,
+	MAX_Y_mask                                        = 0xff << 24,
+	MAX_Y_shift                                       = 24,
+    DB_PREFETCH_LIMIT                                     = 0x00028d34,
+	DEPTH_HEIGHT_TILE_MAX_mask                        = 0x3ff << 0,
+	DEPTH_HEIGHT_TILE_MAX_shift                       = 0,
+    PA_SU_POLY_OFFSET_DB_FMT_CNTL                         = 0x00028df8,
+	POLY_OFFSET_NEG_NUM_DB_BITS_mask                  = 0xff << 0,
+	POLY_OFFSET_NEG_NUM_DB_BITS_shift                 = 0,
+	POLY_OFFSET_DB_IS_FLOAT_FMT_bit                   = 1 << 8,
+    PA_SU_POLY_OFFSET_CLAMP                               = 0x00028dfc,
+    PA_SU_POLY_OFFSET_FRONT_SCALE                         = 0x00028e00,
+    PA_SU_POLY_OFFSET_FRONT_OFFSET                        = 0x00028e04,
+    PA_SU_POLY_OFFSET_BACK_SCALE                          = 0x00028e08,
+    PA_SU_POLY_OFFSET_BACK_OFFSET                         = 0x00028e0c,
+    PA_CL_POINT_X_RAD                                     = 0x00028e10,
+    PA_CL_POINT_Y_RAD                                     = 0x00028e14,
+    PA_CL_POINT_SIZE                                      = 0x00028e18,
+    PA_CL_POINT_CULL_RAD                                  = 0x00028e1c,
+    PA_CL_UCP_0_X                                         = 0x00028e20,
+	PA_CL_UCP_0_X_num                                 = 6,
+	PA_CL_UCP_0_X_offset                              = 16,
+    PA_CL_UCP_0_Y                                         = 0x00028e24,
+	PA_CL_UCP_0_Y_num                                 = 6,
+	PA_CL_UCP_0_Y_offset                              = 16,
+    PA_CL_UCP_0_Z                                         = 0x00028e28,
+	PA_CL_UCP_0_Z_num                                 = 6,
+	PA_CL_UCP_0_Z_offset                              = 16,
+    SQ_ALU_CONSTANT0_0                                    = 0x00030000,
+    SQ_ALU_CONSTANT1_0                                    = 0x00030004,
+    SQ_ALU_CONSTANT2_0                                    = 0x00030008,
+    SQ_ALU_CONSTANT3_0                                    = 0x0003000c,
+    SQ_VTX_CONSTANT_WORD0_0                               = 0x00038000,
+    SQ_TEX_RESOURCE_WORD0_0                               = 0x00038000,
+	DIM_mask                                          = 0x07 << 0,
+	DIM_shift                                         = 0,
+	    SQ_TEX_DIM_1D                                 = 0x00,
+	    SQ_TEX_DIM_2D                                 = 0x01,
+	    SQ_TEX_DIM_3D                                 = 0x02,
+	    SQ_TEX_DIM_CUBEMAP                            = 0x03,
+	    SQ_TEX_DIM_1D_ARRAY                           = 0x04,
+	    SQ_TEX_DIM_2D_ARRAY                           = 0x05,
+	    SQ_TEX_DIM_2D_MSAA                            = 0x06,
+	    SQ_TEX_DIM_2D_ARRAY_MSAA                      = 0x07,
+	SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask           = 0x0f << 3,
+	SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift          = 3,
+	TILE_TYPE_bit                                     = 1 << 7,
+	PITCH_mask                                        = 0x7ff << 8,
+	PITCH_shift                                       = 8,
+	TEX_WIDTH_mask                                    = 0x1fff << 19,
+	TEX_WIDTH_shift                                   = 19,
+    SQ_VTX_CONSTANT_WORD1_0                               = 0x00038004,
+    SQ_TEX_RESOURCE_WORD1_0                               = 0x00038004,
+	TEX_HEIGHT_mask                                   = 0x1fff << 0,
+	TEX_HEIGHT_shift                                  = 0,
+	TEX_DEPTH_mask                                    = 0x1fff << 13,
+	TEX_DEPTH_shift                                   = 13,
+	SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask         = 0x3f << 26,
+	SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift        = 26,
+    SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
+	BASE_ADDRESS_HI_mask                              = 0xff << 0,
+	BASE_ADDRESS_HI_shift                             = 0,
+	SQ_VTX_CONSTANT_WORD2_0__STRIDE_mask              = 0x7ff << 8,
+	SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift             = 8,
+	SQ_VTX_CONSTANT_WORD2_0__CLAMP_X_bit              = 1 << 19,
+	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask         = 0x3f << 20,
+	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift        = 20,
+	SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_mask      = 0x03 << 26,
+	SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift     = 26,
+/* 	    SQ_NUM_FORMAT_NORM                            = 0x00, */
+/* 	    SQ_NUM_FORMAT_INT                             = 0x01, */
+/* 	    SQ_NUM_FORMAT_SCALED                          = 0x02, */
+	SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit      = 1 << 28,
+	SQ_VTX_CONSTANT_WORD2_0__SRF_MODE_ALL_bit         = 1 << 29,
+	SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_mask         = 0x03 << 30,
+	SQ_VTX_CONSTANT_WORD2_0__ENDIAN_SWAP_shift        = 30,
+/* 	    SQ_ENDIAN_NONE                                = 0x00, */
+/* 	    SQ_ENDIAN_8IN16                               = 0x01, */
+/* 	    SQ_ENDIAN_8IN32                               = 0x02, */
+    SQ_TEX_RESOURCE_WORD2_0                               = 0x00038008,
+    SQ_VTX_CONSTANT_WORD3_0                               = 0x0003800c,
+	MEM_REQUEST_SIZE_mask                             = 0x03 << 0,
+	MEM_REQUEST_SIZE_shift                            = 0,
+    SQ_TEX_RESOURCE_WORD3_0                               = 0x0003800c,
+    SQ_TEX_RESOURCE_WORD4_0                               = 0x00038010,
+	FORMAT_COMP_X_mask                                = 0x03 << 0,
+	FORMAT_COMP_X_shift                               = 0,
+	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00,
+	    SQ_FORMAT_COMP_SIGNED                         = 0x01,
+	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02,
+	FORMAT_COMP_Y_mask                                = 0x03 << 2,
+	FORMAT_COMP_Y_shift                               = 2,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	FORMAT_COMP_Z_mask                                = 0x03 << 4,
+	FORMAT_COMP_Z_shift                               = 4,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	FORMAT_COMP_W_mask                                = 0x03 << 6,
+	FORMAT_COMP_W_shift                               = 6,
+/* 	    SQ_FORMAT_COMP_UNSIGNED                       = 0x00, */
+/* 	    SQ_FORMAT_COMP_SIGNED                         = 0x01, */
+/* 	    SQ_FORMAT_COMP_UNSIGNED_BIASED                = 0x02, */
+	SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_mask      = 0x03 << 8,
+	SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_shift     = 8,
+/* 	    SQ_NUM_FORMAT_NORM                            = 0x00, */
+/* 	    SQ_NUM_FORMAT_INT                             = 0x01, */
+/* 	    SQ_NUM_FORMAT_SCALED                          = 0x02, */
+	SQ_TEX_RESOURCE_WORD4_0__SRF_MODE_ALL_bit         = 1 << 10,
+	SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit        = 1 << 11,
+	SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_mask         = 0x03 << 12,
+	SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_shift        = 12,
+/* 	    SQ_ENDIAN_NONE                                = 0x00, */
+/* 	    SQ_ENDIAN_8IN16                               = 0x01, */
+/* 	    SQ_ENDIAN_8IN32                               = 0x02, */
+	REQUEST_SIZE_mask                                 = 0x03 << 14,
+	REQUEST_SIZE_shift                                = 14,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask           = 0x07 << 16,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift          = 16,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask           = 0x07 << 19,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift          = 19,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask           = 0x07 << 22,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift          = 22,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask           = 0x07 << 25,
+	SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift          = 25,
+/* 	    SQ_SEL_X                                      = 0x00, */
+/* 	    SQ_SEL_Y                                      = 0x01, */
+/* 	    SQ_SEL_Z                                      = 0x02, */
+/* 	    SQ_SEL_W                                      = 0x03, */
+/* 	    SQ_SEL_0                                      = 0x04, */
+/* 	    SQ_SEL_1                                      = 0x05, */
+	BASE_LEVEL_mask                                   = 0x0f << 28,
+	BASE_LEVEL_shift                                  = 28,
+    SQ_TEX_RESOURCE_WORD5_0                               = 0x00038014,
+	LAST_LEVEL_mask                                   = 0x0f << 0,
+	LAST_LEVEL_shift                                  = 0,
+	BASE_ARRAY_mask                                   = 0x1fff << 4,
+	BASE_ARRAY_shift                                  = 4,
+	LAST_ARRAY_mask                                   = 0x1fff << 17,
+	LAST_ARRAY_shift                                  = 17,
+    SQ_TEX_RESOURCE_WORD6_0                               = 0x00038018,
+	MPEG_CLAMP_mask                                   = 0x03 << 0,
+	MPEG_CLAMP_shift                                  = 0,
+	    SQ_TEX_MPEG_CLAMP_OFF                         = 0x00,
+	    SQ_TEX_MPEG_9                                 = 0x01,
+	    SQ_TEX_MPEG_10                                = 0x02,
+	PERF_MODULATION_mask                              = 0x07 << 5,
+	PERF_MODULATION_shift                             = 5,
+	INTERLACED_bit                                    = 1 << 8,
+	SQ_TEX_RESOURCE_WORD6_0__TYPE_mask                = 0x03 << 30,
+	SQ_TEX_RESOURCE_WORD6_0__TYPE_shift               = 30,
+	    SQ_TEX_VTX_INVALID_TEXTURE                    = 0x00,
+	    SQ_TEX_VTX_INVALID_BUFFER                     = 0x01,
+	    SQ_TEX_VTX_VALID_TEXTURE                      = 0x02,
+	    SQ_TEX_VTX_VALID_BUFFER                       = 0x03,
+    SQ_VTX_CONSTANT_WORD6_0                               = 0x00038018,
+	SQ_VTX_CONSTANT_WORD6_0__TYPE_mask                = 0x03 << 30,
+	SQ_VTX_CONSTANT_WORD6_0__TYPE_shift               = 30,
+/* 	    SQ_TEX_VTX_INVALID_TEXTURE                    = 0x00, */
+/* 	    SQ_TEX_VTX_INVALID_BUFFER                     = 0x01, */
+/* 	    SQ_TEX_VTX_VALID_TEXTURE                      = 0x02, */
+/* 	    SQ_TEX_VTX_VALID_BUFFER                       = 0x03, */
+    SQ_TEX_SAMPLER_WORD0_0                                = 0x0003c000,
+	SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_mask              = 0x07 << 0,
+	SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift             = 0,
+	    SQ_TEX_WRAP                                   = 0x00,
+	    SQ_TEX_MIRROR                                 = 0x01,
+	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02,
+	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03,
+	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04,
+	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05,
+	    SQ_TEX_CLAMP_BORDER                           = 0x06,
+	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07,
+	CLAMP_Y_mask                                      = 0x07 << 3,
+	CLAMP_Y_shift                                     = 3,
+/* 	    SQ_TEX_WRAP                                   = 0x00, */
+/* 	    SQ_TEX_MIRROR                                 = 0x01, */
+/* 	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02, */
+/* 	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03, */
+/* 	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04, */
+/* 	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05, */
+/* 	    SQ_TEX_CLAMP_BORDER                           = 0x06, */
+/* 	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07, */
+	CLAMP_Z_mask                                      = 0x07 << 6,
+	CLAMP_Z_shift                                     = 6,
+/* 	    SQ_TEX_WRAP                                   = 0x00, */
+/* 	    SQ_TEX_MIRROR                                 = 0x01, */
+/* 	    SQ_TEX_CLAMP_LAST_TEXEL                       = 0x02, */
+/* 	    SQ_TEX_MIRROR_ONCE_LAST_TEXEL                 = 0x03, */
+/* 	    SQ_TEX_CLAMP_HALF_BORDER                      = 0x04, */
+/* 	    SQ_TEX_MIRROR_ONCE_HALF_BORDER                = 0x05, */
+/* 	    SQ_TEX_CLAMP_BORDER                           = 0x06, */
+/* 	    SQ_TEX_MIRROR_ONCE_BORDER                     = 0x07, */
+	XY_MAG_FILTER_mask                                = 0x07 << 9,
+	XY_MAG_FILTER_shift                               = 9,
+	    SQ_TEX_XY_FILTER_POINT                        = 0x00,
+	    SQ_TEX_XY_FILTER_BILINEAR                     = 0x01,
+	    SQ_TEX_XY_FILTER_BICUBIC                      = 0x02,
+	XY_MIN_FILTER_mask                                = 0x07 << 12,
+	XY_MIN_FILTER_shift                               = 12,
+/* 	    SQ_TEX_XY_FILTER_POINT                        = 0x00, */
+/* 	    SQ_TEX_XY_FILTER_BILINEAR                     = 0x01, */
+/* 	    SQ_TEX_XY_FILTER_BICUBIC                      = 0x02, */
+	Z_FILTER_mask                                     = 0x03 << 15,
+	Z_FILTER_shift                                    = 15,
+	    SQ_TEX_Z_FILTER_NONE                          = 0x00,
+	    SQ_TEX_Z_FILTER_POINT                         = 0x01,
+	    SQ_TEX_Z_FILTER_LINEAR                        = 0x02,
+	MIP_FILTER_mask                                   = 0x03 << 17,
+	MIP_FILTER_shift                                  = 17,
+/* 	    SQ_TEX_Z_FILTER_NONE                          = 0x00, */
+/* 	    SQ_TEX_Z_FILTER_POINT                         = 0x01, */
+/* 	    SQ_TEX_Z_FILTER_LINEAR                        = 0x02, */
+	BORDER_COLOR_TYPE_mask                            = 0x03 << 22,
+	BORDER_COLOR_TYPE_shift                           = 22,
+	    SQ_TEX_BORDER_COLOR_TRANS_BLACK               = 0x00,
+	    SQ_TEX_BORDER_COLOR_OPAQUE_BLACK              = 0x01,
+	    SQ_TEX_BORDER_COLOR_OPAQUE_WHITE              = 0x02,
+	    SQ_TEX_BORDER_COLOR_REGISTER                  = 0x03,
+	POINT_SAMPLING_CLAMP_bit                          = 1 << 24,
+	TEX_ARRAY_OVERRIDE_bit                            = 1 << 25,
+	DEPTH_COMPARE_FUNCTION_mask                       = 0x07 << 26,
+	DEPTH_COMPARE_FUNCTION_shift                      = 26,
+	    SQ_TEX_DEPTH_COMPARE_NEVER                    = 0x00,
+	    SQ_TEX_DEPTH_COMPARE_LESS                     = 0x01,
+	    SQ_TEX_DEPTH_COMPARE_EQUAL                    = 0x02,
+	    SQ_TEX_DEPTH_COMPARE_LESSEQUAL                = 0x03,
+	    SQ_TEX_DEPTH_COMPARE_GREATER                  = 0x04,
+	    SQ_TEX_DEPTH_COMPARE_NOTEQUAL                 = 0x05,
+	    SQ_TEX_DEPTH_COMPARE_GREATEREQUAL             = 0x06,
+	    SQ_TEX_DEPTH_COMPARE_ALWAYS                   = 0x07,
+	CHROMA_KEY_mask                                   = 0x03 << 29,
+	CHROMA_KEY_shift                                  = 29,
+	    SQ_TEX_CHROMA_KEY_DISABLED                    = 0x00,
+	    SQ_TEX_CHROMA_KEY_KILL                        = 0x01,
+	    SQ_TEX_CHROMA_KEY_BLEND                       = 0x02,
+	LOD_USES_MINOR_AXIS_bit                           = 1 << 31,
+    SQ_TEX_SAMPLER_WORD1_0                                = 0x0003c004,
+	MIN_LOD_mask                                      = 0x3ff << 0,
+	MIN_LOD_shift                                     = 0,
+	MAX_LOD_mask                                      = 0x3ff << 10,
+	MAX_LOD_shift                                     = 10,
+	SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_mask             = 0xfff << 20,
+	SQ_TEX_SAMPLER_WORD1_0__LOD_BIAS_shift            = 20,
+    SQ_TEX_SAMPLER_WORD2_0                                = 0x0003c008,
+	LOD_BIAS_SEC_mask                                 = 0xfff << 0,
+	LOD_BIAS_SEC_shift                                = 0,
+	MC_COORD_TRUNCATE_bit                             = 1 << 12,
+	SQ_TEX_SAMPLER_WORD2_0__FORCE_DEGAMMA_bit         = 1 << 13,
+	HIGH_PRECISION_FILTER_bit                         = 1 << 14,
+	PERF_MIP_mask                                     = 0x07 << 15,
+	PERF_MIP_shift                                    = 15,
+	PERF_Z_mask                                       = 0x03 << 18,
+	PERF_Z_shift                                      = 18,
+	FETCH_4_bit                                       = 1 << 26,
+	SAMPLE_IS_PCF_bit                                 = 1 << 27,
+	SQ_TEX_SAMPLER_WORD2_0__TYPE_bit                  = 1 << 31,
+    SQ_VTX_BASE_VTX_LOC                                   = 0x0003cff0,
+    SQ_VTX_START_INST_LOC                                 = 0x0003cff4,
+    SQ_LOOP_CONST_DX10_0                                  = 0x0003e200,
+    SQ_LOOP_CONST_0                                       = 0x0003e200,
+	SQ_LOOP_CONST_0__COUNT_mask                       = 0xfff << 0,
+	SQ_LOOP_CONST_0__COUNT_shift                      = 0,
+	INIT_mask                                         = 0xfff << 12,
+	INIT_shift                                        = 12,
+	INC_mask                                          = 0xff << 24,
+	INC_shift                                         = 24,
+    SQ_BOOL_CONST_0                                       = 0x0003e380,
+	SQ_BOOL_CONST_0_num                               = 3,
+
+} ;
+
+#endif /* _AUTOREGS */
+
diff --git a/src/mesa/drivers/dri/r600/r600_reg_r6xx.h b/src/mesa/drivers/dri/r600/r600_reg_r6xx.h
new file mode 100644
index 00000000000..f7702c46de5
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg_r6xx.h
@@ -0,0 +1,492 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_R6xx_H_
+#define _R600_REG_R6xx_H_
+
+/*
+ * Registers for R6xx chips that are not documented yet
+ */
+
+enum {
+
+    MM_INDEX                                              = 0x0000,
+    MM_DATA                                               = 0x0004,
+
+    SRBM_STATUS                                           = 0x0e50,
+	RLC_RQ_PENDING_bit                                = 1 << 3,
+	RCU_RQ_PENDING_bit                                = 1 << 4,
+	GRBM_RQ_PENDING_bit                               = 1 << 5,
+	HI_RQ_PENDING_bit                                 = 1 << 6,
+	IO_EXTERN_SIGNAL_bit                              = 1 << 7,
+	VMC_BUSY_bit                                      = 1 << 8,
+	MCB_BUSY_bit                                      = 1 << 9,
+	MCDZ_BUSY_bit                                     = 1 << 10,
+	MCDY_BUSY_bit                                     = 1 << 11,
+	MCDX_BUSY_bit                                     = 1 << 12,
+	MCDW_BUSY_bit                                     = 1 << 13,
+	SEM_BUSY_bit                                      = 1 << 14,
+	SRBM_STATUS__RLC_BUSY_bit                         = 1 << 15,
+	PDMA_BUSY_bit                                     = 1 << 16,
+	IH_BUSY_bit                                       = 1 << 17,
+	CSC_BUSY_bit                                      = 1 << 20,
+	CMC7_BUSY_bit                                     = 1 << 21,
+	CMC6_BUSY_bit                                     = 1 << 22,
+	CMC5_BUSY_bit                                     = 1 << 23,
+	CMC4_BUSY_bit                                     = 1 << 24,
+	CMC3_BUSY_bit                                     = 1 << 25,
+	CMC2_BUSY_bit                                     = 1 << 26,
+	CMC1_BUSY_bit                                     = 1 << 27,
+	CMC0_BUSY_bit                                     = 1 << 28,
+	BIF_BUSY_bit                                      = 1 << 29,
+	IDCT_BUSY_bit                                     = 1 << 30,
+
+    SRBM_READ_ERROR                                       = 0x0e98,
+	READ_ADDRESS_mask                                 = 0xffff << 2,
+	READ_ADDRESS_shift                                = 2,
+	READ_REQUESTER_HI_bit                             = 1 << 24,
+	READ_REQUESTER_GRBM_bit                           = 1 << 25,
+	READ_REQUESTER_RCU_bit                            = 1 << 26,
+	READ_REQUESTER_RLC_bit                            = 1 << 27,
+	READ_ERROR_bit                                    = 1 << 31,
+
+    SRBM_INT_STATUS                                       = 0x0ea4,
+	RDERR_INT_STAT_bit                                = 1 << 0,
+	GFX_CNTX_SWITCH_INT_STAT_bit                      = 1 << 1,
+    SRBM_INT_ACK                                          = 0x0ea8,
+	RDERR_INT_ACK_bit                                 = 1 << 0,
+	GFX_CNTX_SWITCH_INT_ACK_bit                       = 1 << 1,
+
+    R6XX_MC_VM_FB_LOCATION                                = 0x2180,
+
+    VENDOR_DEVICE_ID                                      = 0x4000,
+
+    D1GRPH_PRIMARY_SURFACE_ADDRESS                        = 0x6110,
+    D1GRPH_PITCH                                          = 0x6120,
+    D1GRPH_Y_END                                          = 0x6138,
+
+    GRBM_STATUS                                           = 0x8010,
+	CMDFIFO_AVAIL_mask                                = 0x1f << 0,
+	CMDFIFO_AVAIL_shift                               = 0,
+	SRBM_RQ_PENDING_bit                               = 1 << 5,
+	CP_RQ_PENDING_bit                                 = 1 << 6,
+	CF_RQ_PENDING_bit                                 = 1 << 7,
+	PF_RQ_PENDING_bit                                 = 1 << 8,
+	GRBM_EE_BUSY_bit                                  = 1 << 10,
+	GRBM_STATUS__VC_BUSY_bit                          = 1 << 11,
+	DB03_CLEAN_bit                                    = 1 << 12,
+	CB03_CLEAN_bit                                    = 1 << 13,
+	VGT_BUSY_NO_DMA_bit                               = 1 << 16,
+	GRBM_STATUS__VGT_BUSY_bit                         = 1 << 17,
+	TA03_BUSY_bit                                     = 1 << 18,
+	GRBM_STATUS__TC_BUSY_bit                          = 1 << 19,
+	SX_BUSY_bit                                       = 1 << 20,
+	SH_BUSY_bit                                       = 1 << 21,
+	SPI03_BUSY_bit                                    = 1 << 22,
+	SMX_BUSY_bit                                      = 1 << 23,
+	SC_BUSY_bit                                       = 1 << 24,
+	PA_BUSY_bit                                       = 1 << 25,
+	DB03_BUSY_bit                                     = 1 << 26,
+	CR_BUSY_bit                                       = 1 << 27,
+	CP_COHERENCY_BUSY_bit                             = 1 << 28,
+	GRBM_STATUS__CP_BUSY_bit                          = 1 << 29,
+	CB03_BUSY_bit                                     = 1 << 30,
+	GUI_ACTIVE_bit                                    = 1 << 31,
+    GRBM_STATUS2                                          = 0x8014,
+	CR_CLEAN_bit                                      = 1 << 0,
+	SMX_CLEAN_bit                                     = 1 << 1,
+	SPI0_BUSY_bit                                     = 1 << 8,
+	SPI1_BUSY_bit                                     = 1 << 9,
+	SPI2_BUSY_bit                                     = 1 << 10,
+	SPI3_BUSY_bit                                     = 1 << 11,
+	TA0_BUSY_bit                                      = 1 << 12,
+	TA1_BUSY_bit                                      = 1 << 13,
+	TA2_BUSY_bit                                      = 1 << 14,
+	TA3_BUSY_bit                                      = 1 << 15,
+	DB0_BUSY_bit                                      = 1 << 16,
+	DB1_BUSY_bit                                      = 1 << 17,
+	DB2_BUSY_bit                                      = 1 << 18,
+	DB3_BUSY_bit                                      = 1 << 19,
+	CB0_BUSY_bit                                      = 1 << 20,
+	CB1_BUSY_bit                                      = 1 << 21,
+	CB2_BUSY_bit                                      = 1 << 22,
+	CB3_BUSY_bit                                      = 1 << 23,
+    GRBM_SOFT_RESET                                       = 0x8020,
+	SOFT_RESET_CP_bit                                 = 1 << 0,
+	SOFT_RESET_CB_bit                                 = 1 << 1,
+	SOFT_RESET_CR_bit                                 = 1 << 2,
+	SOFT_RESET_DB_bit                                 = 1 << 3,
+	SOFT_RESET_PA_bit                                 = 1 << 5,
+	SOFT_RESET_SC_bit                                 = 1 << 6,
+	SOFT_RESET_SMX_bit                                = 1 << 7,
+	SOFT_RESET_SPI_bit                                = 1 << 8,
+	SOFT_RESET_SH_bit                                 = 1 << 9,
+	SOFT_RESET_SX_bit                                 = 1 << 10,
+	SOFT_RESET_TC_bit                                 = 1 << 11,
+	SOFT_RESET_TA_bit                                 = 1 << 12,
+	SOFT_RESET_VC_bit                                 = 1 << 13,
+	SOFT_RESET_VGT_bit                                = 1 << 14,
+	SOFT_RESET_GRBM_GCA_bit                           = 1 << 15,
+
+    WAIT_UNTIL                                            = 0x8040,
+	WAIT_CP_DMA_IDLE_bit                              = 1 << 8,
+	WAIT_CMDFIFO_bit                                  = 1 << 10,
+	WAIT_2D_IDLE_bit                                  = 1 << 14,
+	WAIT_3D_IDLE_bit                                  = 1 << 15,
+	WAIT_2D_IDLECLEAN_bit                             = 1 << 16,
+	WAIT_3D_IDLECLEAN_bit                             = 1 << 17,
+	WAIT_EXTERN_SIG_bit                               = 1 << 19,
+	CMDFIFO_ENTRIES_mask                              = 0x1f << 20,
+	CMDFIFO_ENTRIES_shift                             = 20,
+
+    GRBM_READ_ERROR                                       = 0x8058,
+/* 	READ_ADDRESS_mask                                 = 0xffff << 2, */
+/* 	READ_ADDRESS_shift                                = 2, */
+	READ_REQUESTER_SRBM_bit                           = 1 << 28,
+	READ_REQUESTER_CP_bit                             = 1 << 29,
+	READ_REQUESTER_WU_POLL_bit                        = 1 << 30,
+/* 	READ_ERROR_bit                                    = 1 << 31, */
+
+    SCRATCH_REG0		                          = 0x8500,
+    SCRATCH_REG1		                          = 0x8504,
+    SCRATCH_REG2		                          = 0x8508,
+    SCRATCH_REG3		                          = 0x850c,
+    SCRATCH_REG4		                          = 0x8510,
+    SCRATCH_REG5		                          = 0x8514,
+    SCRATCH_REG6		                          = 0x8518,
+    SCRATCH_REG7		                          = 0x851c,
+    SCRATCH_UMSK		                          = 0x8540,
+    SCRATCH_ADDR		                          = 0x8544,
+
+    CP_COHER_CNTL                                         = 0x85f0,
+	DEST_BASE_0_ENA_bit                               = 1 << 0,
+	DEST_BASE_1_ENA_bit                               = 1 << 1,
+	SO0_DEST_BASE_ENA_bit                             = 1 << 2,
+	SO1_DEST_BASE_ENA_bit                             = 1 << 3,
+	SO2_DEST_BASE_ENA_bit                             = 1 << 4,
+	SO3_DEST_BASE_ENA_bit                             = 1 << 5,
+	CB0_DEST_BASE_ENA_bit                             = 1 << 6,
+	CB1_DEST_BASE_ENA_bit                             = 1 << 7,
+	CB2_DEST_BASE_ENA_bit                             = 1 << 8,
+	CB3_DEST_BASE_ENA_bit                             = 1 << 9,
+	CB4_DEST_BASE_ENA_bit                             = 1 << 10,
+	CB5_DEST_BASE_ENA_bit                             = 1 << 11,
+	CB6_DEST_BASE_ENA_bit                             = 1 << 12,
+	CB7_DEST_BASE_ENA_bit                             = 1 << 13,
+	DB_DEST_BASE_ENA_bit                              = 1 << 14,
+	CR_DEST_BASE_ENA_bit                              = 1 << 15,
+	TC_ACTION_ENA_bit                                 = 1 << 23,
+	VC_ACTION_ENA_bit                                 = 1 << 24,
+	CB_ACTION_ENA_bit                                 = 1 << 25,
+	DB_ACTION_ENA_bit                                 = 1 << 26,
+	SH_ACTION_ENA_bit                                 = 1 << 27,
+	SMX_ACTION_ENA_bit                                = 1 << 28,
+	CR0_ACTION_ENA_bit                                = 1 << 29,
+	CR1_ACTION_ENA_bit                                = 1 << 30,
+	CR2_ACTION_ENA_bit                                = 1 << 31,
+    CP_COHER_SIZE                                         = 0x85f4,
+    CP_COHER_BASE                                         = 0x85f8,
+    CP_COHER_STATUS                                       = 0x85fc,
+	MATCHING_GFX_CNTX_mask                            = 0xff << 0,
+	MATCHING_GFX_CNTX_shift                           = 0,
+	MATCHING_CR_CNTX_mask                             = 0xffff << 8,
+	MATCHING_CR_CNTX_shift                            = 8,
+	STATUS_bit                                        = 1 << 31,
+
+    CP_STALLED_STAT1                                      = 0x8674,
+	RBIU_TO_DMA_NOT_RDY_TO_RCV_bit                    = 1 << 0,
+	RBIU_TO_IBS_NOT_RDY_TO_RCV_bit                    = 1 << 1,
+	RBIU_TO_SEM_NOT_RDY_TO_RCV_bit                    = 1 << 2,
+	RBIU_TO_2DREGS_NOT_RDY_TO_RCV_bit                 = 1 << 3,
+	RBIU_TO_MEMWR_NOT_RDY_TO_RCV_bit                  = 1 << 4,
+	RBIU_TO_MEMRD_NOT_RDY_TO_RCV_bit                  = 1 << 5,
+	RBIU_TO_EOPD_NOT_RDY_TO_RCV_bit                   = 1 << 6,
+	RBIU_TO_RECT_NOT_RDY_TO_RCV_bit                   = 1 << 7,
+	RBIU_TO_STRMO_NOT_RDY_TO_RCV_bit                  = 1 << 8,
+	RBIU_TO_PSTAT_NOT_RDY_TO_RCV_bit                  = 1 << 9,
+	MIU_WAITING_ON_RDREQ_FREE_bit                     = 1 << 16,
+	MIU_WAITING_ON_WRREQ_FREE_bit                     = 1 << 17,
+	MIU_NEEDS_AVAIL_WRREQ_PHASE_bit                   = 1 << 18,
+	RCIU_WAITING_ON_GRBM_FREE_bit                     = 1 << 24,
+	RCIU_WAITING_ON_VGT_FREE_bit                      = 1 << 25,
+	RCIU_STALLED_ON_ME_READ_bit                       = 1 << 26,
+	RCIU_STALLED_ON_DMA_READ_bit                      = 1 << 27,
+	RCIU_HALTED_BY_REG_VIOLATION_bit                  = 1 << 28,
+    CP_STALLED_STAT2                                      = 0x8678,
+	PFP_TO_CSF_NOT_RDY_TO_RCV_bit                     = 1 << 0,
+	PFP_TO_MEQ_NOT_RDY_TO_RCV_bit                     = 1 << 1,
+	PFP_TO_VGT_NOT_RDY_TO_RCV_bit                     = 1 << 2,
+	PFP_HALTED_BY_INSTR_VIOLATION_bit                 = 1 << 3,
+	MULTIPASS_IB_PENDING_IN_PFP_bit                   = 1 << 4,
+	ME_BRUSH_WC_NOT_RDY_TO_RCV_bit                    = 1 << 8,
+	ME_STALLED_ON_BRUSH_LOGIC_bit                     = 1 << 9,
+	CR_CNTX_NOT_AVAIL_TO_ME_bit                       = 1 << 10,
+	GFX_CNTX_NOT_AVAIL_TO_ME_bit                      = 1 << 11,
+	ME_RCIU_NOT_RDY_TO_RCV_bit                        = 1 << 12,
+	ME_TO_CONST_NOT_RDY_TO_RCV_bit                    = 1 << 13,
+	ME_WAITING_DATA_FROM_PFP_bit                      = 1 << 14,
+	ME_WAITING_ON_PARTIAL_FLUSH_bit                   = 1 << 15,
+	RECT_FIFO_NEEDS_CR_RECT_DONE_bit                  = 1 << 16,
+	RECT_FIFO_NEEDS_WR_CONFIRM_bit                    = 1 << 17,
+	EOPD_FIFO_NEEDS_SC_EOP_DONE_bit                   = 1 << 18,
+	EOPD_FIFO_NEEDS_SMX_EOP_DONE_bit                  = 1 << 19,
+	EOPD_FIFO_NEEDS_WR_CONFIRM_bit                    = 1 << 20,
+	EOPD_FIFO_NEEDS_SIGNAL_SEM_bit                    = 1 << 21,
+	SO_NUMPRIM_FIFO_NEEDS_SOADDR_bit                  = 1 << 22,
+	SO_NUMPRIM_FIFO_NEEDS_NUMPRIM_bit                 = 1 << 23,
+	PIPE_STATS_FIFO_NEEDS_SAMPLE_bit                  = 1 << 24,
+	SURF_SYNC_NEEDS_IDLE_CNTXS_bit                    = 1 << 30,
+	SURF_SYNC_NEEDS_ALL_CLEAN_bit                     = 1 << 31,
+    CP_BUSY_STAT                                          = 0x867c,
+	REG_BUS_FIFO_BUSY_bit                             = 1 << 0,
+	RING_FETCHING_DATA_bit                            = 1 << 1,
+	INDR1_FETCHING_DATA_bit                           = 1 << 2,
+	INDR2_FETCHING_DATA_bit                           = 1 << 3,
+	STATE_FETCHING_DATA_bit                           = 1 << 4,
+	PRED_FETCHING_DATA_bit                            = 1 << 5,
+	COHER_CNTR_NEQ_ZERO_bit                           = 1 << 6,
+	PFP_PARSING_PACKETS_bit                           = 1 << 7,
+	ME_PARSING_PACKETS_bit                            = 1 << 8,
+	RCIU_PFP_BUSY_bit                                 = 1 << 9,
+	RCIU_ME_BUSY_bit                                  = 1 << 10,
+	OUTSTANDING_READ_TAGS_bit                         = 1 << 11,
+	SEM_CMDFIFO_NOT_EMPTY_bit                         = 1 << 12,
+	SEM_FAILED_AND_HOLDING_bit                        = 1 << 13,
+	SEM_POLLING_FOR_PASS_bit                          = 1 << 14,
+	_3D_BUSY_bit                                      = 1 << 15,
+	_2D_BUSY_bit                                      = 1 << 16,
+    CP_STAT                                               = 0x8680,
+	CSF_RING_BUSY_bit                                 = 1 << 0,
+	CSF_WPTR_POLL_BUSY_bit                            = 1 << 1,
+	CSF_INDIRECT1_BUSY_bit                            = 1 << 2,
+	CSF_INDIRECT2_BUSY_bit                            = 1 << 3,
+	CSF_STATE_BUSY_bit                                = 1 << 4,
+	CSF_PREDICATE_BUSY_bit                            = 1 << 5,
+	CSF_BUSY_bit                                      = 1 << 6,
+	MIU_RDREQ_BUSY_bit                                = 1 << 7,
+	MIU_WRREQ_BUSY_bit                                = 1 << 8,
+	ROQ_RING_BUSY_bit                                 = 1 << 9,
+	ROQ_INDIRECT1_BUSY_bit                            = 1 << 10,
+	ROQ_INDIRECT2_BUSY_bit                            = 1 << 11,
+	ROQ_STATE_BUSY_bit                                = 1 << 12,
+	ROQ_PREDICATE_BUSY_bit                            = 1 << 13,
+	ROQ_ALIGN_BUSY_bit                                = 1 << 14,
+	PFP_BUSY_bit                                      = 1 << 15,
+	MEQ_BUSY_bit                                      = 1 << 16,
+	ME_BUSY_bit                                       = 1 << 17,
+	QUERY_BUSY_bit                                    = 1 << 18,
+	SEMAPHORE_BUSY_bit                                = 1 << 19,
+	INTERRUPT_BUSY_bit                                = 1 << 20,
+	SURFACE_SYNC_BUSY_bit                             = 1 << 21,
+	DMA_BUSY_bit                                      = 1 << 22,
+	RCIU_BUSY_bit                                     = 1 << 23,
+	CP_STAT__CP_BUSY_bit                              = 1 << 31,
+
+    CP_ME_CNTL                                            = 0x86d8,
+	ME_STATMUX_mask                                   = 0xff << 0,
+	ME_STATMUX_shift                                  = 0,
+	ME_HALT_bit                                       = 1 << 28,
+    CP_ME_STATUS                                          = 0x86dc,
+
+    CP_RB_RPTR                                            = 0x8700,
+	RB_RPTR_mask                                      = 0xfffff << 0,
+	RB_RPTR_shift                                     = 0,
+    CP_RB_WPTR_DELAY                                      = 0x8704,
+	PRE_WRITE_TIMER_mask                              = 0xfffffff << 0,
+	PRE_WRITE_TIMER_shift                             = 0,
+	PRE_WRITE_LIMIT_mask                              = 0x0f << 28,
+	PRE_WRITE_LIMIT_shift                             = 28,
+
+    CP_ROQ_RB_STAT                                        = 0x8780,
+	ROQ_RPTR_PRIMARY_mask                             = 0x3ff << 0,
+	ROQ_RPTR_PRIMARY_shift                            = 0,
+	ROQ_WPTR_PRIMARY_mask                             = 0x3ff << 16,
+	ROQ_WPTR_PRIMARY_shift                            = 16,
+    CP_ROQ_IB1_STAT                                       = 0x8784,
+	ROQ_RPTR_INDIRECT1_mask                           = 0x3ff << 0,
+	ROQ_RPTR_INDIRECT1_shift                          = 0,
+	ROQ_WPTR_INDIRECT1_mask                           = 0x3ff << 16,
+	ROQ_WPTR_INDIRECT1_shift                          = 16,
+    CP_ROQ_IB2_STAT                                       = 0x8788,
+	ROQ_RPTR_INDIRECT2_mask                           = 0x3ff << 0,
+	ROQ_RPTR_INDIRECT2_shift                          = 0,
+	ROQ_WPTR_INDIRECT2_mask                           = 0x3ff << 16,
+	ROQ_WPTR_INDIRECT2_shift                          = 16,
+
+    CP_MEQ_STAT                                           = 0x8794,
+	MEQ_RPTR_mask                                     = 0x3ff << 0,
+	MEQ_RPTR_shift                                    = 0,
+	MEQ_WPTR_mask                                     = 0x3ff << 16,
+	MEQ_WPTR_shift                                    = 16,
+
+    CC_GC_SHADER_PIPE_CONFIG                              = 0x8950,
+	INACTIVE_QD_PIPES_mask                            = 0xff << 8,
+	INACTIVE_QD_PIPES_shift                           = 8,
+	    R6XX_MAX_QD_PIPES                             = 8,
+	INACTIVE_SIMDS_mask                               = 0xff << 16,
+	INACTIVE_SIMDS_shift                              = 16,
+	    R6XX_MAX_SIMDS                                = 8,
+    GC_USER_SHADER_PIPE_CONFIG                            = 0x8954,
+
+    VC_ENHANCE                                            = 0x9714,
+    DB_DEBUG                                              = 0x9830,
+        PREZ_MUST_WAIT_FOR_POSTZ_DONE                     = 1 << 31,
+
+    DB_WATERMARKS                                         = 0x00009838,
+	DEPTH_FREE_mask                                   = 0x1f << 0,
+	DEPTH_FREE_shift                                  = 0,
+	DEPTH_FLUSH_mask                                  = 0x3f << 5,
+	DEPTH_FLUSH_shift                                 = 5,
+	FORCE_SUMMARIZE_mask                              = 0x0f << 11,
+	FORCE_SUMMARIZE_shift                             = 11,
+	DEPTH_PENDING_FREE_mask                           = 0x1f << 15,
+	DEPTH_PENDING_FREE_shift                          = 15,
+	DEPTH_CACHELINE_FREE_mask                         = 0x1f << 20,
+	DEPTH_CACHELINE_FREE_shift                        = 20,
+	EARLY_Z_PANIC_DISABLE_bit                         = 1 << 25,
+	LATE_Z_PANIC_DISABLE_bit                          = 1 << 26,
+	RE_Z_PANIC_DISABLE_bit                            = 1 << 27,
+	DB_EXTRA_DEBUG_mask                               = 0x0f << 28,
+	DB_EXTRA_DEBUG_shift                              = 28,
+
+    CP_RB_BASE                                            = 0xc100,
+    CP_RB_CNTL                                            = 0xc104,
+        RB_BUFSZ_mask                                     = 0x3f << 0,
+    CP_RB_WPTR                                            = 0xc114,
+	RB_WPTR_mask                                      = 0xfffff << 0,
+	RB_WPTR_shift                                     = 0,
+    CP_RB_RPTR_WR                                         = 0xc108,
+	RB_RPTR_WR_mask                                   = 0xfffff << 0,
+	RB_RPTR_WR_shift                                  = 0,
+
+    CP_INT_STATUS                                         = 0xc128,
+	DISABLE_CNTX_SWITCH_INT_STAT_bit                  = 1 << 0,
+	ENABLE_CNTX_SWITCH_INT_STAT_bit                   = 1 << 1,
+	SEM_SIGNAL_INT_STAT_bit                           = 1 << 18,
+	CNTX_BUSY_INT_STAT_bit                            = 1 << 19,
+	CNTX_EMPTY_INT_STAT_bit                           = 1 << 20,
+	WAITMEM_SEM_INT_STAT_bit                          = 1 << 21,
+	PRIV_INSTR_INT_STAT_bit                           = 1 << 22,
+	PRIV_REG_INT_STAT_bit                             = 1 << 23,
+	OPCODE_ERROR_INT_STAT_bit                         = 1 << 24,
+	SCRATCH_INT_STAT_bit                              = 1 << 25,
+	TIME_STAMP_INT_STAT_bit                           = 1 << 26,
+	RESERVED_BIT_ERROR_INT_STAT_bit                   = 1 << 27,
+	DMA_INT_STAT_bit                                  = 1 << 28,
+	IB2_INT_STAT_bit                                  = 1 << 29,
+	IB1_INT_STAT_bit                                  = 1 << 30,
+	RB_INT_STAT_bit                                   = 1 << 31,
+
+//  SX_ALPHA_TEST_CONTROL                                 = 0x00028410,
+	ALPHA_FUNC__REF_NEVER                             = 0,
+	ALPHA_FUNC__REF_ALWAYS                            = 7,
+//  DB_SHADER_CONTROL                                     = 0x0002880c,
+	Z_ORDER__EARLY_Z_THEN_LATE_Z                      = 2,
+//  PA_SU_SC_MODE_CNTL                                    = 0x00028814,
+//	POLY_MODE_mask                                    = 0x03 << 3,
+	POLY_MODE__TRIANGLES = 0, POLY_MODE__DUAL_MODE,
+//	POLYMODE_FRONT_PTYPE_mask                         = 0x07 << 5,
+	POLYMODE_PTYPE__POINTS = 0, POLYMODE_PTYPE__LINES, POLYMODE_PTYPE__TRIANGLES,
+    PA_SC_AA_SAMPLE_LOCS_8S_WD1_M                         = 0x00028c20,
+    DB_SRESULTS_COMPARE_STATE0                            = 0x00028d28,	/* See autoregs: DB_SRESULTS_COMPARE_STATE1 */
+//  DB_SRESULTS_COMPARE_STATE1                            = 0x00028d2c,
+    DB_ALPHA_TO_MASK                                      = 0x00028d44,
+	ALPHA_TO_MASK_ENABLE                              = 1 << 0,
+	ALPHA_TO_MASK_OFFSET0_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET0_shift                       = 8,
+	ALPHA_TO_MASK_OFFSET1_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET1_shift                       = 10,
+	ALPHA_TO_MASK_OFFSET2_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET2_shift                       = 12,
+	ALPHA_TO_MASK_OFFSET3_mask                        = 0x03 << 8,
+	ALPHA_TO_MASK_OFFSET3_shift                       = 14,
+
+//  SQ_VTX_CONSTANT_WORD2_0                               = 0x00038008,
+//    	SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_mask         = 0x3f << 20,
+	FMT_INVALID=0,      FMT_8,          FMT_4_4,            FMT_3_3_2,
+	                    FMT_16=5,       FMT_16_FLOAT,       FMT_8_8,
+	FMT_5_6_5,          FMT_6_5_5,      FMT_1_5_5_5,        FMT_4_4_4_4,
+	FMT_5_5_5_1,        FMT_32,         FMT_32_FLOAT,       FMT_16_16,
+	FMT_16_16_FLOAT=16, FMT_8_24,       FMT_8_24_FLOAT,     FMT_24_8,
+	FMT_24_8_FLOAT,     FMT_10_11_11,   FMT_10_11_11_FLOAT, FMT_11_11_10,
+	FMT_11_11_10_FLOAT, FMT_2_10_10_10, FMT_8_8_8_8,        FMT_10_10_10_2,
+	FMT_X24_8_32_FLOAT, FMT_32_32,      FMT_32_32_FLOAT,    FMT_16_16_16_16,
+	FMT_16_16_16_16_FLOAT=32,           FMT_32_32_32_32=34, FMT_32_32_32_32_FLOAT,
+	                    FMT_1 = 37,                         FMT_GB_GR=39,
+	FMT_BG_RG,          FMT_32_AS_8,    FMT_32_AS_8_8,      FMT_5_9_9_9_SHAREDEXP,
+	FMT_8_8_8,          FMT_16_16_16,   FMT_16_16_16_FLOAT, FMT_32_32_32,
+	FMT_32_32_32_FLOAT=48,
+
+//  High level register file lengths
+    SQ_ALU_CONSTANT                                       = SQ_ALU_CONSTANT0_0,	/* 256 PS, 256 VS */
+    SQ_ALU_CONSTANT_ps_num                                = 256,
+    SQ_ALU_CONSTANT_vs_num                                = 256,
+    SQ_ALU_CONSTANT_all_num                               = 512,
+    SQ_ALU_CONSTANT_offset                                = 16,
+    SQ_ALU_CONSTANT_ps                                    = 0,
+    SQ_ALU_CONSTANT_vs                                    = SQ_ALU_CONSTANT_ps + SQ_ALU_CONSTANT_ps_num,
+    SQ_TEX_RESOURCE                                       = SQ_TEX_RESOURCE_WORD0_0,	/* 160 PS, 160 VS, 16 FS, 160 GS */
+    SQ_TEX_RESOURCE_ps_num                                = 160,
+    SQ_TEX_RESOURCE_vs_num                                = 160,
+    SQ_TEX_RESOURCE_fs_num                                = 16,
+    SQ_TEX_RESOURCE_gs_num                                = 160,
+    SQ_TEX_RESOURCE_all_num                               = 496,
+    SQ_TEX_RESOURCE_offset                                = 28,
+    SQ_TEX_RESOURCE_ps                                    = 0,
+    SQ_TEX_RESOURCE_vs                                    = SQ_TEX_RESOURCE_ps + SQ_TEX_RESOURCE_ps_num,
+    SQ_TEX_RESOURCE_fs                                    = SQ_TEX_RESOURCE_vs + SQ_TEX_RESOURCE_vs_num,
+    SQ_TEX_RESOURCE_gs                                    = SQ_TEX_RESOURCE_fs + SQ_TEX_RESOURCE_fs_num,
+    SQ_VTX_RESOURCE                                       = SQ_VTX_CONSTANT_WORD0_0,	/* 160 PS, 160 VS, 16 FS, 160 GS */
+    SQ_VTX_RESOURCE_ps_num                                = 160,
+    SQ_VTX_RESOURCE_vs_num                                = 160,
+    SQ_VTX_RESOURCE_fs_num                                = 16,
+    SQ_VTX_RESOURCE_gs_num                                = 160,
+    SQ_VTX_RESOURCE_all_num                               = 496,
+    SQ_VTX_RESOURCE_offset                                = 28,
+    SQ_VTX_RESOURCE_ps                                    = 0,
+    SQ_VTX_RESOURCE_vs                                    = SQ_VTX_RESOURCE_ps + SQ_VTX_RESOURCE_ps_num,
+    SQ_VTX_RESOURCE_fs                                    = SQ_VTX_RESOURCE_vs + SQ_VTX_RESOURCE_vs_num,
+    SQ_VTX_RESOURCE_gs                                    = SQ_VTX_RESOURCE_fs + SQ_VTX_RESOURCE_fs_num,
+    SQ_TEX_SAMPLER_WORD                                   = SQ_TEX_SAMPLER_WORD0_0,	/* 18 per PS, VS, GS */
+    SQ_TEX_SAMPLER_WORD_ps_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_vs_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_gs_num                            = 18,
+    SQ_TEX_SAMPLER_WORD_all_num                           = 54,
+    SQ_TEX_SAMPLER_WORD_offset                            = 12,
+    SQ_TEX_SAMPLER_WORD_ps                                = 0,
+    SQ_TEX_SAMPLER_WORD_vs                                = SQ_TEX_SAMPLER_WORD_ps + SQ_TEX_SAMPLER_WORD_ps_num,
+    SQ_TEX_SAMPLER_WORD_gs                                = SQ_TEX_SAMPLER_WORD_vs + SQ_TEX_SAMPLER_WORD_vs_num,
+    SQ_LOOP_CONST                                         = SQ_LOOP_CONST_0,		/* 32 per PS, VS, GS */
+    SQ_LOOP_CONST_ps_num                                  = 32,
+    SQ_LOOP_CONST_vs_num                                  = 32,
+    SQ_LOOP_CONST_gs_num                                  = 32,
+    SQ_LOOP_CONST_all_num                                 = 96,
+    SQ_LOOP_CONST_offset                                  = 4,
+    SQ_LOOP_CONST_ps                                      = 0,
+    SQ_LOOP_CONST_vs                                      = SQ_LOOP_CONST_ps + SQ_LOOP_CONST_ps_num,
+    SQ_LOOP_CONST_gs                                      = SQ_LOOP_CONST_vs + SQ_LOOP_CONST_vs_num,
+} ;
+
+
+#endif
diff --git a/src/mesa/drivers/dri/r600/r600_reg_r7xx.h b/src/mesa/drivers/dri/r600/r600_reg_r7xx.h
new file mode 100644
index 00000000000..e5c01c861aa
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_reg_r7xx.h
@@ -0,0 +1,149 @@
+/*
+ * RadeonHD R6xx, R7xx Register documentation
+ *
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ * Copyright (C) 2008-2009  Matthias Hopf
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef _R600_REG_R7xx_H_
+#define _R600_REG_R7xx_H_
+
+/*
+ * Register update for R7xx chips
+ */
+
+enum {
+
+    R7XX_MC_VM_FB_LOCATION                                = 0x00002024,
+
+//  GRBM_STATUS                                           = 0x00008010,
+	R7XX_TA_BUSY_bit                                  = 1 << 14,
+
+    R7xx_SQ_DYN_GPR_CNTL_PS_FLUSH_REQ                     = 0x00008d8c,
+	RING0_OFFSET_mask                                 = 0xff << 0,
+	RING0_OFFSET_shift                                = 0,
+	ISOLATE_ES_ENABLE_bit                             = 1 << 12,
+	ISOLATE_GS_ENABLE_bit                             = 1 << 13,
+	VS_PC_LIMIT_ENABLE_bit                            = 1 << 14,
+
+//  SQ_ALU_WORD0                                          = 0x00008dfc,
+//	SRC0_SEL_mask                                     = 0x1ff << 0,
+// 	SRC1_SEL_mask                                     = 0x1ff << 13,
+	    R7xx_SQ_ALU_SRC_1_DBL_L                       = 0xf4,
+	    R7xx_SQ_ALU_SRC_1_DBL_M                       = 0xf5,
+	    R7xx_SQ_ALU_SRC_0_5_DBL_L                     = 0xf6,
+	    R7xx_SQ_ALU_SRC_0_5_DBL_M                     = 0xf7,
+// 	INDEX_MODE_mask                                   = 0x07 << 26,
+	    R7xx_SQ_INDEX_GLOBAL                          = 0x05,
+	    R7xx_SQ_INDEX_GLOBAL_AR_X                     = 0x06,
+    R6xx_SQ_ALU_WORD1_OP2                                 = 0x00008dfc,
+    R7xx_SQ_ALU_WORD1_OP2_V2                              = 0x00008dfc,
+	R6xx_FOG_MERGE_bit                                = 1 << 5,
+	R6xx_OMOD_mask                                    = 0x03 << 6,
+	R7xx_OMOD_mask                                    = 0x03 << 5,
+	R6xx_OMOD_shift                                   = 6,
+	R7xx_OMOD_shift                                   = 5,
+	R6xx_SQ_ALU_WORD1_OP2__ALU_INST_mask              = 0x3ff << 8,
+	R7xx_SQ_ALU_WORD1_OP2_V2__ALU_INST_mask           = 0x7ff << 7,
+	R6xx_SQ_ALU_WORD1_OP2__ALU_INST_shift             = 8,
+	R7xx_SQ_ALU_WORD1_OP2_V2__ALU_INST_shift          = 7,
+	    R7xx_SQ_OP2_INST_FREXP_64                     = 0x07,
+	    R7xx_SQ_OP2_INST_ADD_64                       = 0x17,
+	    R7xx_SQ_OP2_INST_MUL_64                       = 0x1b,
+	    R7xx_SQ_OP2_INST_FLT64_TO_FLT32               = 0x1c,
+	    R7xx_SQ_OP2_INST_FLT32_TO_FLT64               = 0x1d,
+	    R7xx_SQ_OP2_INST_LDEXP_64                     = 0x7a,
+	    R7xx_SQ_OP2_INST_FRACT_64                     = 0x7b,
+	    R7xx_SQ_OP2_INST_PRED_SETGT_64                = 0x7c,
+	    R7xx_SQ_OP2_INST_PRED_SETE_64                 = 0x7d,
+	    R7xx_SQ_OP2_INST_PRED_SETGE_64                = 0x7e,
+//  SQ_ALU_WORD1_OP3                                      = 0x00008dfc,
+//	SRC2_SEL_mask                                     = 0x1ff << 0,
+//	    R7xx_SQ_ALU_SRC_1_DBL_L                       = 0xf4,
+//	    R7xx_SQ_ALU_SRC_1_DBL_M                       = 0xf5,
+//	    R7xx_SQ_ALU_SRC_0_5_DBL_L                     = 0xf6,
+//	    R7xx_SQ_ALU_SRC_0_5_DBL_M                     = 0xf7,
+// 	SQ_ALU_WORD1_OP3__ALU_INST_mask                   = 0x1f << 13,
+	    R7xx_SQ_OP3_INST_MULADD_64                    = 0x08,
+	    R7xx_SQ_OP3_INST_MULADD_64_M2                 = 0x09,
+	    R7xx_SQ_OP3_INST_MULADD_64_M4                 = 0x0a,
+	    R7xx_SQ_OP3_INST_MULADD_64_D2                 = 0x0b,
+//  SQ_CF_ALU_WORD1                                       = 0x00008dfc,
+	R6xx_USES_WATERFALL_bit                           = 1 << 25,
+	R7xx_SQ_CF_ALU_WORD1__ALT_CONST_bit               = 1 << 25,
+//  SQ_CF_ALLOC_EXPORT_WORD0                              = 0x00008dfc,
+//	ARRAY_BASE_mask                                   = 0x1fff << 0,
+//	TYPE_mask                                         = 0x03 << 13,
+//	    SQ_EXPORT_PARAM                               = 0x02,
+//	    X_UNUSED_FOR_SX_EXPORTS                       = 0x03,
+//	ELEM_SIZE_mask                                    = 0x03 << 30,
+//  SQ_CF_ALLOC_EXPORT_WORD1                              = 0x00008dfc,
+//	SQ_CF_ALLOC_EXPORT_WORD1__CF_INST_mask            = 0x7f << 23,
+	    R7xx_SQ_CF_INST_MEM_EXPORT                    = 0x3a,
+//  SQ_CF_WORD1                                           = 0x00008dfc,
+//	SQ_CF_WORD1__COUNT_mask                           = 0x07 << 10,
+	R7xx_COUNT_3_bit                                  = 1 << 19,
+//	SQ_CF_WORD1__CF_INST_mask                         = 0x7f << 23,
+	    R7xx_SQ_CF_INST_END_PROGRAM                   = 0x19,
+	    R7xx_SQ_CF_INST_WAIT_ACK                      = 0x1a,
+	    R7xx_SQ_CF_INST_TEX_ACK                       = 0x1b,
+	    R7xx_SQ_CF_INST_VTX_ACK                       = 0x1c,
+	    R7xx_SQ_CF_INST_VTX_TC_ACK                    = 0x1d,
+//  SQ_VTX_WORD0                                          = 0x00008dfc,
+//	VTX_INST_mask                                     = 0x1f << 0,
+	    R7xx_SQ_VTX_INST_MEM                          = 0x02,
+//  SQ_VTX_WORD2                                          = 0x00008dfc,
+	R7xx_SQ_VTX_WORD2__ALT_CONST_bit                  = 1 << 20,
+
+//  SQ_TEX_WORD0                                          = 0x00008dfc,
+//	TEX_INST_mask                                     = 0x1f << 0,
+	    R7xx_X_MEMORY_READ                            = 0x02,
+	    R7xx_SQ_TEX_INST_KEEP_GRADIENTS               = 0x0a,
+	    R7xx_X_FETCH4_LOAD4_INSTRUCTION_FOR_DX10_1    = 0x0f,
+	R7xx_SQ_TEX_WORD0__ALT_CONST_bit                  = 1 << 24,
+
+    R7xx_PA_SC_EDGERULE                                   = 0x00028230,
+    R7xx_SPI_THREAD_GROUPING                              = 0x000286c8,
+	PS_GROUPING_mask                                  = 0x1f << 0,
+	PS_GROUPING_shift                                 = 0,
+	VS_GROUPING_mask                                  = 0x1f << 8,
+	VS_GROUPING_shift                                 = 8,
+	GS_GROUPING_mask                                  = 0x1f << 16,
+	GS_GROUPING_shift                                 = 16,
+	ES_GROUPING_mask                                  = 0x1f << 24,
+	ES_GROUPING_shift                                 = 24,
+    R7xx_CB_SHADER_CONTROL                                = 0x000287a0,
+	RT0_ENABLE_bit                                    = 1 << 0,
+	RT1_ENABLE_bit                                    = 1 << 1,
+	RT2_ENABLE_bit                                    = 1 << 2,
+	RT3_ENABLE_bit                                    = 1 << 3,
+	RT4_ENABLE_bit                                    = 1 << 4,
+	RT5_ENABLE_bit                                    = 1 << 5,
+	RT6_ENABLE_bit                                    = 1 << 6,
+	RT7_ENABLE_bit                                    = 1 << 7,
+//  DB_ALPHA_TO_MASK                                      = 0x00028d44,
+	R7xx_OFFSET_ROUND_bit                             = 1 << 16,
+//  SQ_TEX_SAMPLER_MISC_0                                 = 0x0003d03c,
+	R7xx_TRUNCATE_COORD_bit                           = 1 << 9,
+	R7xx_DISABLE_CUBE_WRAP_bit                        = 1 << 10,
+
+} ;
+
+#endif /* _R600_REG_R7xx_H_ */
diff --git a/src/mesa/drivers/dri/r600/r600_tex.c b/src/mesa/drivers/dri/r600/r600_tex.c
new file mode 100644
index 00000000000..43d9f641af4
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_tex.c
@@ -0,0 +1,437 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <[email protected]>
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/colormac.h"
+#include "main/context.h"
+#include "main/enums.h"
+#include "main/image.h"
+#include "main/mipmap.h"
+#include "main/simple_list.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+
+#include "texmem.h"
+
+#include "r600_context.h"
+#include "r700_state.h"
+#include "radeon_mipmap_tree.h"
+#include "r600_tex.h"
+
+#include "xmlpool.h"
+
+
+static unsigned int translate_wrap_mode(GLenum wrapmode)
+{
+	switch(wrapmode) {
+	case GL_REPEAT: return SQ_TEX_WRAP;
+	case GL_CLAMP: return SQ_TEX_CLAMP_HALF_BORDER;
+	case GL_CLAMP_TO_EDGE: return SQ_TEX_CLAMP_LAST_TEXEL;
+	case GL_CLAMP_TO_BORDER: return SQ_TEX_CLAMP_BORDER;
+	case GL_MIRRORED_REPEAT: return SQ_TEX_MIRROR_ONCE_HALF_BORDER;
+	case GL_MIRROR_CLAMP_EXT: return SQ_TEX_MIRROR;
+	case GL_MIRROR_CLAMP_TO_EDGE_EXT: return SQ_TEX_MIRROR_ONCE_BORDER;
+	case GL_MIRROR_CLAMP_TO_BORDER_EXT: return SQ_TEX_MIRROR_ONCE_LAST_TEXEL;
+	default:
+		_mesa_problem(NULL, "bad wrap mode in %s", __FUNCTION__);
+		return 0;
+	}
+}
+
+
+/**
+ * Update the cached hardware registers based on the current texture wrap modes.
+ *
+ * \param t Texture object whose wrap modes are to be set
+ */
+static void r600UpdateTexWrap(radeonTexObjPtr t)
+{
+	struct gl_texture_object *tObj = &t->base;
+
+        SETfield(t->SQ_TEX_SAMPLER0, translate_wrap_mode(tObj->WrapS),
+                 SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift, SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_mask);
+
+	if (tObj->Target != GL_TEXTURE_1D) {
+		SETfield(t->SQ_TEX_SAMPLER0, translate_wrap_mode(tObj->WrapT),
+			 CLAMP_Y_shift, CLAMP_Y_mask);
+
+		if (tObj->Target == GL_TEXTURE_3D)
+			SETfield(t->SQ_TEX_SAMPLER0, translate_wrap_mode(tObj->WrapR),
+				 CLAMP_Z_shift, CLAMP_Z_mask);
+	}
+}
+
+static void r600SetTexDefaultState(radeonTexObjPtr t)
+{
+        /* Init text object to default states. */
+        t->SQ_TEX_RESOURCE0              = 0;
+        SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_2D, DIM_shift, DIM_mask);
+        SETfield(t->SQ_TEX_RESOURCE0, ARRAY_LINEAR_GENERAL,
+                 SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_shift, SQ_TEX_RESOURCE_WORD0_0__TILE_MODE_mask);
+        CLEARbit(t->SQ_TEX_RESOURCE0, TILE_TYPE_bit);
+
+        t->SQ_TEX_RESOURCE1                = 0;
+        SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+                 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+        t->SQ_TEX_RESOURCE2                = 0;
+        t->SQ_TEX_RESOURCE3                = 0;
+
+        t->SQ_TEX_RESOURCE4                   = 0;
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_X_shift, FORMAT_COMP_X_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_Y_shift, FORMAT_COMP_Y_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_Z_shift, FORMAT_COMP_Z_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_FORMAT_COMP_UNSIGNED,
+                 FORMAT_COMP_W_shift, FORMAT_COMP_W_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_NUM_FORMAT_NORM,
+                 SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_shift, SQ_TEX_RESOURCE_WORD4_0__NUM_FORMAT_ALL_mask);
+        CLEARbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__SRF_MODE_ALL_bit);
+        CLEARbit(t->SQ_TEX_RESOURCE4, SQ_TEX_RESOURCE_WORD4_0__FORCE_DEGAMMA_bit);
+        SETfield(t->SQ_TEX_RESOURCE4, SQ_ENDIAN_NONE,
+                 SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_shift, SQ_TEX_RESOURCE_WORD4_0__ENDIAN_SWAP_mask);
+        SETfield(t->SQ_TEX_RESOURCE4, 1, REQUEST_SIZE_shift, REQUEST_SIZE_mask);
+        t->SQ_TEX_RESOURCE4 |= SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift
+		              |SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift
+		              |SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift
+		              |SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift;
+        SETfield(t->SQ_TEX_RESOURCE4, 0, BASE_LEVEL_shift, BASE_LEVEL_mask); /* mip-maps */
+
+        t->SQ_TEX_RESOURCE5 = 0;
+        t->SQ_TEX_RESOURCE6 = 0;
+
+        SETfield(t->SQ_TEX_RESOURCE6, SQ_TEX_VTX_VALID_TEXTURE,
+                 SQ_TEX_RESOURCE_WORD6_0__TYPE_shift, SQ_TEX_RESOURCE_WORD6_0__TYPE_mask);
+
+        /* Initialize sampler registers */
+        t->SQ_TEX_SAMPLER0                           = 0;
+        t->SQ_TEX_SAMPLER0 |=
+                         SQ_TEX_WRAP << SQ_TEX_SAMPLER_WORD0_0__CLAMP_X_shift
+                        |SQ_TEX_WRAP << CLAMP_Y_shift
+                        |SQ_TEX_WRAP << CLAMP_Z_shift
+                        |SQ_TEX_XY_FILTER_POINT << XY_MAG_FILTER_shift
+                        |SQ_TEX_XY_FILTER_POINT << XY_MIN_FILTER_shift
+                        |SQ_TEX_Z_FILTER_NONE << Z_FILTER_shift
+                        |SQ_TEX_Z_FILTER_NONE << MIP_FILTER_shift
+                        |SQ_TEX_BORDER_COLOR_TRANS_BLACK << BORDER_COLOR_TYPE_shift;
+
+        t->SQ_TEX_SAMPLER1 = 0x7FF << MAX_LOD_shift;
+
+        t->SQ_TEX_SAMPLER2                          = 0;
+        SETbit(t->SQ_TEX_SAMPLER2, SQ_TEX_SAMPLER_WORD2_0__TYPE_bit);
+}
+
+
+static GLuint aniso_filter(GLfloat anisotropy)
+{
+#if 0
+	if (anisotropy >= 16.0) {
+		return R300_TX_MAX_ANISO_16_TO_1;
+	} else if (anisotropy >= 8.0) {
+		return R300_TX_MAX_ANISO_8_TO_1;
+	} else if (anisotropy >= 4.0) {
+		return R300_TX_MAX_ANISO_4_TO_1;
+	} else if (anisotropy >= 2.0) {
+		return R300_TX_MAX_ANISO_2_TO_1;
+	} else {
+		return R300_TX_MAX_ANISO_1_TO_1;
+	}
+#endif
+	return 0;
+}
+
+/**
+ * Set the texture magnification and minification modes.
+ *
+ * \param t Texture whose filter modes are to be set
+ * \param minf Texture minification mode
+ * \param magf Texture magnification mode
+ * \param anisotropy Maximum anisotropy level
+ */
+static void r600SetTexFilter(radeonTexObjPtr t, GLenum minf, GLenum magf, GLfloat anisotropy)
+{
+	/* Force revalidation to account for switches from/to mipmapping. */
+	t->validated = GL_FALSE;
+
+	/* Note that EXT_texture_filter_anisotropic is extremely vague about
+	 * how anisotropic filtering interacts with the "normal" filter modes.
+	 * When anisotropic filtering is enabled, we override min and mag
+	 * filter settings completely. This includes driconf's settings.
+	 */
+	if (anisotropy >= 2.0 && (minf != GL_NEAREST) && (magf != GL_NEAREST)) {
+		/*t->pp_txfilter |= R300_TX_MAG_FILTER_ANISO
+			| R300_TX_MIN_FILTER_ANISO
+			| R300_TX_MIN_FILTER_MIP_LINEAR
+			| aniso_filter(anisotropy);*/
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, "Using maximum anisotropy of %f\n", anisotropy);
+		return;
+	}
+
+	switch (minf) {
+	case GL_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_None,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_None,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_NEAREST_MIPMAP_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Point,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_NEAREST_MIPMAP_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Linear,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_LINEAR_MIPMAP_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Point,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	case GL_LINEAR_MIPMAP_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MIN_FILTER_shift, XY_MIN_FILTER_mask);
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_MipFilter_Linear,
+			 MIP_FILTER_shift, MIP_FILTER_mask);
+		break;
+	}
+
+	/* Note we don't have 3D mipmaps so only use the mag filter setting
+	 * to set the 3D texture filter mode.
+	 */
+	switch (magf) {
+	case GL_NEAREST:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Point,
+			 XY_MAG_FILTER_shift, XY_MAG_FILTER_mask);
+		break;
+	case GL_LINEAR:
+		SETfield(t->SQ_TEX_SAMPLER0, TEX_XYFilter_Linear,
+			 XY_MAG_FILTER_shift, XY_MAG_FILTER_mask);
+		break;
+	}
+}
+
+static void r600SetTexBorderColor(radeonTexObjPtr t, const GLfloat color[4])
+{
+#if 0
+	GLubyte c[4];
+	CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
+	CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
+	CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
+	CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
+	t->pp_border_color = PACK_COLOR_8888(c[3], c[0], c[1], c[2]);
+#endif
+}
+
+/**
+ * Changes variables and flags for a state update, which will happen at the
+ * next UpdateTextureState
+ */
+
+static void r600TexParameter(GLcontext * ctx, GLenum target,
+			     struct gl_texture_object *texObj,
+			     GLenum pname, const GLfloat * params)
+{
+	radeonTexObj* t = radeon_tex_obj(texObj);
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+		fprintf(stderr, "%s( %s )\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr(pname));
+	}
+
+	switch (pname) {
+	case GL_TEXTURE_MIN_FILTER:
+	case GL_TEXTURE_MAG_FILTER:
+	case GL_TEXTURE_MAX_ANISOTROPY_EXT:
+		r600SetTexFilter(t, texObj->MinFilter, texObj->MagFilter, texObj->MaxAnisotropy);
+		break;
+
+	case GL_TEXTURE_WRAP_S:
+	case GL_TEXTURE_WRAP_T:
+	case GL_TEXTURE_WRAP_R:
+		r600UpdateTexWrap(t);
+		break;
+
+	case GL_TEXTURE_BORDER_COLOR:
+		r600SetTexBorderColor(t, texObj->BorderColor);
+		break;
+
+	case GL_TEXTURE_BASE_LEVEL:
+	case GL_TEXTURE_MAX_LEVEL:
+	case GL_TEXTURE_MIN_LOD:
+	case GL_TEXTURE_MAX_LOD:
+		/* This isn't the most efficient solution but there doesn't appear to
+		 * be a nice alternative.  Since there's no LOD clamping,
+		 * we just have to rely on loading the right subset of mipmap levels
+		 * to simulate a clamped LOD.
+		 */
+		if (t->mt) {
+			radeon_miptree_unreference(t->mt);
+			t->mt = 0;
+			t->validated = GL_FALSE;
+		}
+		break;
+
+	case GL_DEPTH_TEXTURE_MODE:
+		if (!texObj->Image[0][texObj->BaseLevel])
+			return;
+		if (texObj->Image[0][texObj->BaseLevel]->TexFormat->BaseFormat
+		    == GL_DEPTH_COMPONENT) {
+			r600SetDepthTexMode(texObj);
+			break;
+		} else {
+			/* If the texture isn't a depth texture, changing this
+			 * state won't cause any changes to the hardware.
+			 * Don't force a flush of texture state.
+			 */
+			return;
+		}
+
+	default:
+		return;
+	}
+}
+
+static void r600DeleteTexture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	context_t* rmesa = R700_CONTEXT(ctx);
+	radeonTexObj* t = radeon_tex_obj(texObj);
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			(void *)texObj,
+			_mesa_lookup_enum_by_nr(texObj->Target));
+	}
+
+	if (rmesa) {
+		int i;
+		radeon_firevertices(&rmesa->radeon);
+
+		for(i = 0; i < R700_MAX_TEXTURE_UNITS; ++i)
+			if (rmesa->hw.textures[i] == t)
+				rmesa->hw.textures[i] = 0;
+	}
+
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = 0;
+	}
+	_mesa_delete_texture_object(ctx, texObj);
+}
+
+/**
+ * Allocate a new texture object.
+ * Called via ctx->Driver.NewTextureObject.
+ * Note: this function will be called during context creation to
+ * allocate the default texture objects.
+ * Fixup MaxAnisotropy according to user preference.
+ */
+static struct gl_texture_object *r600NewTextureObject(GLcontext * ctx,
+						      GLuint name,
+						      GLenum target)
+{
+	context_t* rmesa = R700_CONTEXT(ctx);
+	radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
+
+	if (RADEON_DEBUG & (DEBUG_STATE | DEBUG_TEXTURE)) {
+		fprintf(stderr, "%s( %p (target = %s) )\n", __FUNCTION__,
+			t, _mesa_lookup_enum_by_nr(target));
+	}
+
+	_mesa_initialize_texture_object(&t->base, name, target);
+	t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+	/* Initialize hardware state */
+	r600SetTexDefaultState(t);
+	r600UpdateTexWrap(t);
+	r600SetTexFilter(t, t->base.MinFilter, t->base.MagFilter, t->base.MaxAnisotropy);
+	r600SetTexBorderColor(t, t->base.BorderColor);
+
+	return &t->base;
+}
+
+void r600InitTextureFuncs(struct dd_function_table *functions)
+{
+	/* Note: we only plug in the functions we implement in the driver
+	 * since _mesa_init_driver_functions() was already called.
+	 */
+	functions->NewTextureImage = radeonNewTextureImage;
+	functions->FreeTexImageData = radeonFreeTexImageData;
+	functions->MapTexture = radeonMapTexture;
+	functions->UnmapTexture = radeonUnmapTexture;
+
+	functions->ChooseTextureFormat = radeonChooseTextureFormat_mesa;
+	functions->TexImage1D = radeonTexImage1D;
+	functions->TexImage2D = radeonTexImage2D;
+	functions->TexImage3D = radeonTexImage3D;
+	functions->TexSubImage1D = radeonTexSubImage1D;
+	functions->TexSubImage2D = radeonTexSubImage2D;
+	functions->TexSubImage3D = radeonTexSubImage3D;
+	functions->GetTexImage = radeonGetTexImage;
+	functions->GetCompressedTexImage = radeonGetCompressedTexImage;
+	functions->NewTextureObject = r600NewTextureObject;
+	functions->DeleteTexture = r600DeleteTexture;
+	functions->IsTextureResident = driIsTextureResident;
+
+	functions->TexParameter = r600TexParameter;
+
+	functions->CompressedTexImage2D = radeonCompressedTexImage2D;
+	functions->CompressedTexSubImage2D = radeonCompressedTexSubImage2D;
+
+	functions->GenerateMipmap = radeonGenerateMipmap;
+
+	driInitTextureFormats();
+}
diff --git a/src/mesa/drivers/dri/r300/radeon_ioctl.h b/src/mesa/drivers/dri/r600/r600_tex.h
index 3add775b822..fb0e1a023e1 100644
--- a/src/mesa/drivers/dri/r300/radeon_ioctl.h
+++ b/src/mesa/drivers/dri/r600/r600_tex.h
@@ -32,26 +32,32 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *   Keith Whitwell <[email protected]>
  */
 
-#ifndef __RADEON_IOCTL_H__
-#define __RADEON_IOCTL_H__
-
-#include "main/simple_list.h"
-#include "radeon_dri.h"
-#include "radeon_lock.h"
-
-#include "xf86drm.h"
-#include "drm.h"
-#if 0
-#include "r200context.h"
-#endif
-#include "radeon_drm.h"
-
-extern void radeonCopyBuffer(__DRIdrawablePrivate * drawable,
-			     const drm_clip_rect_t	* rect);
-extern void radeonPageFlip(__DRIdrawablePrivate * drawable);
-extern void radeonFlush(GLcontext * ctx);
-extern void radeonFinish(GLcontext * ctx);
-extern void radeonWaitForIdleLocked(radeonContextPtr radeon);
-extern uint32_t radeonGetAge(radeonContextPtr radeon);
-
-#endif				/* __RADEON_IOCTL_H__ */
+#ifndef __r600_TEX_H__
+#define __r600_TEX_H__
+
+/* TODO : review this after texture load code. */
+#define R700_BLIT_WIDTH_BYTES 1024
+/* The BASE_ADDRESS and MIP_ADDRESS fields are 256-byte-aligned */
+#define R700_TEXTURE_ALIGNMENT_MASK     0x255
+/* Texel pitch is 8 alignment. */
+#define R700_TEXEL_PITCH_ALIGNMENT_MASK 0x7
+
+#define R700_MAX_TEXTURE_UNITS 8 /* TODO : should be 16, lets make it work, review later */
+
+extern void r600SetDepthTexMode(struct gl_texture_object *tObj);
+
+extern void r600SetTexBuffer(__DRIcontext *pDRICtx, GLint target,
+			     __DRIdrawable *dPriv);
+
+extern void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target,
+			      GLint format, __DRIdrawable *dPriv);
+
+extern void r600SetTexOffset(__DRIcontext *pDRICtx, GLint texname,
+			     unsigned long long offset, GLint depth,
+			     GLuint pitch);
+
+extern GLboolean r600ValidateBuffers(GLcontext * ctx);
+
+extern void r600InitTextureFuncs(struct dd_function_table *functions);
+
+#endif				/* __r600_TEX_H__ */
diff --git a/src/mesa/drivers/dri/r600/r600_texstate.c b/src/mesa/drivers/dri/r600/r600_texstate.c
new file mode 100644
index 00000000000..2466aa95950
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r600_texstate.c
@@ -0,0 +1,802 @@
+/*
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/**
+ * \file
+ *
+ * \author Keith Whitwell <[email protected]>
+ *
+ * \todo Enable R300 texture tiling code?
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/texformat.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+#include "main/enums.h"
+
+#include "r600_context.h"
+#include "r700_state.h"
+#include "radeon_mipmap_tree.h"
+#include "r600_tex.h"
+
+void r600UpdateTextureState(GLcontext * ctx);
+
+void r600UpdateTextureState(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	struct gl_texture_unit *texUnit;
+	struct radeon_tex_obj *t;
+	GLuint    unit;
+
+	for (unit = 0; unit < R700_MAX_TEXTURE_UNITS; unit++) {
+		texUnit = &ctx->Texture.Unit[unit];
+		t = radeon_tex_obj(ctx->Texture.Unit[unit]._Current);
+
+		if (texUnit->_ReallyEnabled) {
+			if (!t)
+				continue;
+			r700->textures[unit] = t;
+		}
+	}
+}
+
+static GLboolean r600GetTexFormat(struct gl_texture_object *tObj, GLuint mesa_format)
+{
+	radeonTexObj *t = radeon_tex_obj(tObj);
+
+	t->SQ_TEX_RESOURCE4 &= ~( SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_mask
+				  |SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_mask
+				  |SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_mask
+				  |SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_mask );
+
+	switch (mesa_format) /* This is mesa format. */
+	{
+	case MESA_FORMAT_RGBA8888:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGBA8888_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ARGB8888:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ARGB8888_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGB888:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGB565:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_5_6_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGB565_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_5_6_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ARGB4444:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_4_4_4_4,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ARGB4444_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_4_4_4_4,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ARGB1555:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_1_5_5_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ARGB1555_REV:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_1_5_5_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_AL88:
+	case MESA_FORMAT_AL88_REV: /* TODO : Check this. */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGB332:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_3_3_2,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_A8: /* ZERO, ZERO, ZERO, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_L8: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_I8: /* X, X, X, X */
+	case MESA_FORMAT_CI8:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+		/* YUV422 TODO conversion */  /* X, Y, Z, ONE, G8R8_G8B8 */
+		/*
+		  case MESA_FORMAT_YCBCR:
+		  t->SQ_TEX_RESOURCE1.bitfields.DATA_FORMAT = ;
+		  break;
+		*/
+		/* VUY422 TODO conversion */  /* X, Y, Z, ONE, G8R8_G8B8 */
+		/*
+		  case MESA_FORMAT_YCBCR_REV:
+		  t->SQ_TEX_RESOURCE1.bitfields.DATA_FORMAT = ;
+		  break;
+		*/
+	case MESA_FORMAT_RGB_DXT1: /* not supported yet */
+
+		break;
+	case MESA_FORMAT_RGBA_DXT1: /* not supported yet */
+
+		break;
+	case MESA_FORMAT_RGBA_DXT3: /* not supported yet */
+
+		break;
+	case MESA_FORMAT_RGBA_DXT5: /* not supported yet */
+
+		break;
+	case MESA_FORMAT_RGBA_FLOAT32:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_32_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGBA_FLOAT16:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_16_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGB_FLOAT32: /* X, Y, Z, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_RGB_FLOAT16:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ALPHA_FLOAT32: /* ZERO, ZERO, ZERO, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_ALPHA_FLOAT16: /* ZERO, ZERO, ZERO, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_LUMINANCE_FLOAT32: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_LUMINANCE_FLOAT16: /* X, X, X, ONE */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT32:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_LUMINANCE_ALPHA_FLOAT16:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_INTENSITY_FLOAT32: /* X, X, X, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_32_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_INTENSITY_FLOAT16: /* X, X, X, X */
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_16_FLOAT,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		break;
+	case MESA_FORMAT_Z16:
+	case MESA_FORMAT_Z24_S8:
+	case MESA_FORMAT_Z32:
+		switch (mesa_format) {
+		case MESA_FORMAT_Z16:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_16,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
+		case MESA_FORMAT_Z24_S8:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_24_8,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
+		case MESA_FORMAT_Z32:
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_32,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+			break;
+		};
+		switch (tObj->DepthMode) {
+		case GL_LUMINANCE:  /* X, X, X, ONE */
+			t->SQ_TEX_RESOURCE4 |=
+				(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+				|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+				|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+				|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+			break;
+		case GL_INTENSITY:  /* X, X, X, X */
+			t->SQ_TEX_RESOURCE4 |=
+				(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+				|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+				|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+				|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+			break;
+		case GL_ALPHA:     /* ZERO, ZERO, ZERO, X */
+			t->SQ_TEX_RESOURCE4 |=
+				(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+				|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+				|(SQ_SEL_0 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+				|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+			break;
+		default:
+			return GL_FALSE;
+		}
+		break;
+	default:
+		/* Not supported format */
+		return GL_FALSE;
+	};
+
+	return GL_TRUE;
+}
+
+void r600SetDepthTexMode(struct gl_texture_object *tObj)
+{
+	const GLuint *format;
+	radeonTexObjPtr t;
+
+	if (!tObj)
+		return;
+
+	t = radeon_tex_obj(tObj);
+
+	r600GetTexFormat(tObj, tObj->Image[0][tObj->BaseLevel]->TexFormat->MesaFormat);
+
+}
+
+/**
+ * Compute the cached hardware register values for the given texture object.
+ *
+ * \param rmesa Context pointer
+ * \param t the r300 texture object
+ */
+static void setup_hardware_state(context_t *rmesa, struct gl_texture_object *texObj)
+{
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	const struct gl_texture_image *firstImage;
+	int firstlevel = t->mt ? t->mt->firstLevel : 0;
+	GLuint uTexelPitch;
+
+	firstImage = t->base.Image[0][firstlevel];
+
+	if (!t->image_override) {
+		if (!r600GetTexFormat(texObj, firstImage->TexFormat->MesaFormat)) {
+			_mesa_problem(NULL, "unexpected texture format in %s",
+				      __FUNCTION__);
+			return;
+		}
+	}
+
+	if (t->image_override && t->bo)
+		return;
+
+	switch (texObj->Target) {
+        case GL_TEXTURE_1D:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_1D, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, 0, TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+        case GL_TEXTURE_2D:
+        case GL_TEXTURE_RECTANGLE_NV:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_2D, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, 0, TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+        case GL_TEXTURE_3D:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_3D, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, firstImage->Depth - 1, // ???
+			 TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+        case GL_TEXTURE_CUBE_MAP:
+		SETfield(t->SQ_TEX_RESOURCE0, SQ_TEX_DIM_CUBEMAP, DIM_shift, DIM_mask);
+		SETfield(t->SQ_TEX_RESOURCE1, 0, TEX_DEPTH_shift, TEX_DEPTH_mask);
+		break;
+        default:
+		_mesa_problem(NULL, "unexpected texture target type in %s", __FUNCTION__);
+		return;
+	}
+
+	uTexelPitch = (firstImage->Width + R700_TEXEL_PITCH_ALIGNMENT_MASK)
+		& ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
+
+	SETfield(t->SQ_TEX_RESOURCE0, (uTexelPitch/8)-1, PITCH_shift, PITCH_mask);
+	SETfield(t->SQ_TEX_RESOURCE0, firstImage->Width - 1,
+		 TEX_WIDTH_shift, TEX_WIDTH_mask);
+	SETfield(t->SQ_TEX_RESOURCE1, firstImage->Height - 1,
+		 TEX_HEIGHT_shift, TEX_HEIGHT_mask);
+
+}
+
+/**
+ * Ensure the given texture is ready for rendering.
+ *
+ * Mostly this means populating the texture object's mipmap tree.
+ */
+static GLboolean r600_validate_texture(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	context_t *rmesa = R700_CONTEXT(ctx);
+	radeonTexObj *t = radeon_tex_obj(texObj);
+
+	if (!radeon_validate_texture_miptree(ctx, texObj))
+		return GL_FALSE;
+
+	/* Configure the hardware registers (more precisely, the cached version
+	 * of the hardware registers). */
+	setup_hardware_state(rmesa, texObj);
+
+	t->validated = GL_TRUE;
+	return GL_TRUE;
+}
+
+/**
+ * Ensure all enabled and complete textures are uploaded along with any buffers being used.
+ */
+GLboolean r600ValidateBuffers(GLcontext * ctx)
+{
+	context_t *rmesa = R700_CONTEXT(ctx);
+	struct radeon_renderbuffer *rrb;
+	int i;
+	int ret;
+
+	radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
+
+	rrb = radeon_get_colorbuffer(&rmesa->radeon);
+	/* color buffer */
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
+	}
+
+	/* depth buffer */
+	rrb = radeon_get_depthbuffer(&rmesa->radeon);
+	if (rrb && rrb->bo) {
+		radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+						  rrb->bo, 0,
+						  RADEON_GEM_DOMAIN_VRAM);
+	}
+	
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+		radeonTexObj *t;
+
+		if (!ctx->Texture.Unit[i]._ReallyEnabled)
+			continue;
+
+		if (!r600_validate_texture(ctx, ctx->Texture.Unit[i]._Current)) {
+			_mesa_warning(ctx,
+				      "failed to validate texture for unit %d.\n",
+				      i);
+		}
+		t = radeon_tex_obj(ctx->Texture.Unit[i]._Current);
+		if (t->image_override && t->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+		else if (t->mt->bo)
+			radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs,
+							  t->mt->bo,
+							  RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+	}
+
+	ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, rmesa->radeon.dma.current, RADEON_GEM_DOMAIN_GTT, 0);
+	if (ret)
+		return GL_FALSE;
+	return GL_TRUE;
+}
+
+void r600SetTexOffset(__DRIcontext * pDRICtx, GLint texname,
+		      unsigned long long offset, GLint depth, GLuint pitch)
+{
+	context_t *rmesa = pDRICtx->driverPrivate;
+	struct gl_texture_object *tObj =
+	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
+	uint32_t pitch_val;
+
+	if (!tObj)
+		return;
+
+	t->image_override = GL_TRUE;
+
+	if (!offset)
+		return;
+
+	t->bo = NULL;
+	t->override_offset = offset;
+	pitch_val = pitch;
+	switch (depth) {
+	case 32:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		pitch_val /= 4;
+		break;
+	case 24:
+	default:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		pitch_val /= 4;
+		break;
+	case 16:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_5_6_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		pitch_val /= 2;
+		break;
+	}
+
+	pitch_val = (pitch_val + R700_TEXEL_PITCH_ALIGNMENT_MASK)
+		& ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
+	SETfield(t->SQ_TEX_RESOURCE0, (pitch_val/8)-1, PITCH_shift, PITCH_mask);
+}
+
+void r600SetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format, __DRIdrawable *dPriv)
+{
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	context_t *rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+
+	type = GL_BGRA;
+	format = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate;
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* back & depth buffer are useless free them right away */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[0];
+	if (rb->bo == NULL) {
+		/* Failed to BO for the buffer */
+		return;
+	}
+
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT) {
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+			t->SQ_TEX_RESOURCE4 |=
+				(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+				|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+				|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+				|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		} else {
+			SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+				 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+			t->SQ_TEX_RESOURCE4 |=
+				(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+				|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+				|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+				|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		}
+		pitch_val /= 4;
+		break;
+	case 3:
+	default:
+		// FMT_8_8_8 ???
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_8_8_8_8,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_W << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		pitch_val /= 4;
+		break;
+	case 2:
+		SETfield(t->SQ_TEX_RESOURCE1, FMT_5_6_5,
+			 SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_shift, SQ_TEX_RESOURCE_WORD1_0__DATA_FORMAT_mask);
+
+		t->SQ_TEX_RESOURCE4 |=
+			(SQ_SEL_Z << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_X_shift)
+			|(SQ_SEL_Y << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Y_shift)
+			|(SQ_SEL_X << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_Z_shift)
+			|(SQ_SEL_1 << SQ_TEX_RESOURCE_WORD4_0__DST_SEL_W_shift);
+		pitch_val /= 2;
+		break;
+	}
+
+	pitch_val = (pitch_val + R700_TEXEL_PITCH_ALIGNMENT_MASK)
+		& ~R700_TEXEL_PITCH_ALIGNMENT_MASK;
+
+	SETfield(t->SQ_TEX_RESOURCE0, (pitch_val/8)-1, PITCH_shift, PITCH_mask);
+	SETfield(t->SQ_TEX_RESOURCE0, rb->base.Width - 1,
+		 TEX_WIDTH_shift, TEX_WIDTH_mask);
+	SETfield(t->SQ_TEX_RESOURCE1, rb->base.Height - 1,
+		 TEX_HEIGHT_shift, TEX_HEIGHT_mask);
+
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
+}
+
+void r600SetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+        r600SetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.c b/src/mesa/drivers/dri/r600/r700_assembler.c
new file mode 100644
index 00000000000..1d41c5cf785
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_assembler.c
@@ -0,0 +1,4071 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/mtypes.h"
+#include "main/imports.h"
+
+#include "r600_context.h"
+#include "r700_debug.h"
+
+#include "r700_assembler.h"
+
+BITS addrmode_PVSDST(PVSDST * pPVSDST)
+{
+	return pPVSDST->addrmode0 | ((BITS)pPVSDST->addrmode1 << 1);
+}
+
+void setaddrmode_PVSDST(PVSDST * pPVSDST, BITS addrmode) 
+{
+	pPVSDST->addrmode0 = addrmode & 1;
+	pPVSDST->addrmode1 = (addrmode >> 1) & 1;
+}
+
+void nomask_PVSDST(PVSDST * pPVSDST) 
+{
+	pPVSDST->writex = pPVSDST->writey = pPVSDST->writez = pPVSDST->writew = 1;
+}
+
+BITS addrmode_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	return pPVSSRC->addrmode0 | ((BITS)pPVSSRC->addrmode1 << 1);
+}
+
+void setaddrmode_PVSSRC(PVSSRC* pPVSSRC, BITS addrmode) 
+{
+	pPVSSRC->addrmode0 = addrmode & 1;
+	pPVSSRC->addrmode1 = (addrmode >> 1) & 1;
+}
+
+
+void setswizzle_PVSSRC(PVSSRC* pPVSSRC, BITS swz) 
+{
+	pPVSSRC->swizzlex = 
+	pPVSSRC->swizzley = 
+	pPVSSRC->swizzlez = 
+	pPVSSRC->swizzlew = swz;
+}
+
+void noswizzle_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->swizzlex = SQ_SEL_X;
+	pPVSSRC->swizzley = SQ_SEL_Y;
+	pPVSSRC->swizzlez = SQ_SEL_Z;
+	pPVSSRC->swizzlew = SQ_SEL_W;
+}
+
+void
+swizzleagain_PVSSRC(PVSSRC * pPVSSRC, BITS x, BITS y, BITS z, BITS w)
+{
+    switch (x) 
+    {
+        case SQ_SEL_X: x = pPVSSRC->swizzlex; 
+            break;
+        case SQ_SEL_Y: x = pPVSSRC->swizzley; 
+            break;
+        case SQ_SEL_Z: x = pPVSSRC->swizzlez; 
+            break;
+        case SQ_SEL_W: x = pPVSSRC->swizzlew; 
+            break;
+        default:;
+    }
+
+    switch (y) 
+    {
+        case SQ_SEL_X: y = pPVSSRC->swizzlex; 
+            break;
+        case SQ_SEL_Y: y = pPVSSRC->swizzley; 
+            break;
+        case SQ_SEL_Z: y = pPVSSRC->swizzlez; 
+            break;
+        case SQ_SEL_W: y = pPVSSRC->swizzlew; 
+            break;
+        default:;
+    }
+
+    switch (z) 
+    {
+        case SQ_SEL_X: z = pPVSSRC->swizzlex; 
+            break;
+        case SQ_SEL_Y: z = pPVSSRC->swizzley; 
+            break;
+        case SQ_SEL_Z: z = pPVSSRC->swizzlez; 
+            break;
+        case SQ_SEL_W: z = pPVSSRC->swizzlew; 
+            break;
+        default:;
+    }
+
+    switch (w) 
+    {
+        case SQ_SEL_X: w = pPVSSRC->swizzlex; 
+            break;
+        case SQ_SEL_Y: w = pPVSSRC->swizzley; 
+            break;
+        case SQ_SEL_Z: w = pPVSSRC->swizzlez; 
+            break;
+        case SQ_SEL_W: w = pPVSSRC->swizzlew; 
+            break;
+        default:;
+    }
+
+    pPVSSRC->swizzlex = x;
+    pPVSSRC->swizzley = y;
+    pPVSSRC->swizzlez = z;
+    pPVSSRC->swizzlew = w;
+}
+
+void neg_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->negx = 1;
+	pPVSSRC->negy = 1;
+	pPVSSRC->negz = 1;
+	pPVSSRC->negw = 1;
+}
+
+void noneg_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->negx = 0;
+	pPVSSRC->negy = 0;
+	pPVSSRC->negz = 0;
+	pPVSSRC->negw = 0;
+}
+
+// negate argument (for SUB instead of ADD and alike)
+void flipneg_PVSSRC(PVSSRC* pPVSSRC) 
+{
+	pPVSSRC->negx = !pPVSSRC->negx;
+	pPVSSRC->negy = !pPVSSRC->negy;
+	pPVSSRC->negz = !pPVSSRC->negz;
+	pPVSSRC->negw = !pPVSSRC->negw;
+}
+
+void zerocomp_PVSSRC(PVSSRC* pPVSSRC, int c) 
+{
+	switch (c) 
+	{
+		case 0: pPVSSRC->swizzlex = SQ_SEL_0; pPVSSRC->negx = 0; break;
+		case 1: pPVSSRC->swizzley = SQ_SEL_0; pPVSSRC->negy = 0; break;
+		case 2: pPVSSRC->swizzlez = SQ_SEL_0; pPVSSRC->negz = 0; break;
+		case 3: pPVSSRC->swizzlew = SQ_SEL_0; pPVSSRC->negw = 0; break;
+		default:;
+	} 
+}
+
+void onecomp_PVSSRC(PVSSRC* pPVSSRC, int c) 
+{
+	switch (c) 
+	{
+		case 0: pPVSSRC->swizzlex = SQ_SEL_1; pPVSSRC->negx = 0; break;
+		case 1: pPVSSRC->swizzley = SQ_SEL_1; pPVSSRC->negy = 0; break;
+		case 2: pPVSSRC->swizzlez = SQ_SEL_1; pPVSSRC->negz = 0; break;
+		case 3: pPVSSRC->swizzlew = SQ_SEL_1; pPVSSRC->negw = 0; break;
+		default:;
+	} 
+}
+
+BITS is_misc_component_exported(VAP_OUT_VTX_FMT_0* pOutVTXFmt0)  
+{
+	  return (pOutVTXFmt0->point_size            |
+			  pOutVTXFmt0->edge_flag             |
+			  pOutVTXFmt0->rta_index             |
+			  pOutVTXFmt0->kill_flag             |
+			  pOutVTXFmt0->viewport_index);
+}
+
+BITS is_depth_component_exported(OUT_FRAGMENT_FMT_0* pFPOutFmt) 
+{
+	  return (pFPOutFmt->depth            | 
+			  pFPOutFmt->stencil_ref      | 
+			  pFPOutFmt->mask             | 
+			  pFPOutFmt->coverage_to_mask);
+}
+
+GLboolean is_reduction_opcode(PVSDWORD* dest)
+{
+    if (dest->dst.op3 == 0) 
+    {
+        if ( (dest->dst.opcode == SQ_OP2_INST_DOT4 || dest->dst.opcode == SQ_OP2_INST_DOT4_IEEE) ) 
+        {
+            return GL_TRUE;
+        }
+    }
+    return GL_FALSE;
+}
+
+GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size)
+{
+    GLuint format = FMT_INVALID;
+	GLuint uiElemSize = 0;
+
+    switch (eType)
+    {
+        case GL_BYTE:
+        case GL_UNSIGNED_BYTE:
+			uiElemSize = 1;
+            switch(nChannels)
+            {
+                case 1:
+                    format = FMT_8; break;
+                case 2:
+                    format = FMT_8_8; break;
+                case 3:
+                    format = FMT_8_8_8; break;
+                case 4:
+                    format = FMT_8_8_8_8; break;
+                default:
+                    break;
+            }
+            break;
+
+        case GL_UNSIGNED_SHORT:
+        case GL_SHORT:
+			uiElemSize = 2;
+            switch(nChannels)
+            {
+                case 1:
+                    format = FMT_16; break;
+                case 2:
+                    format = FMT_16_16; break;
+                case 3:
+                    format = FMT_16_16_16; break;
+                case 4:
+                    format = FMT_16_16_16_16; break;
+                default:
+                    break;
+            }
+            break;
+
+        case GL_UNSIGNED_INT:
+        case GL_INT:
+			uiElemSize = 4;
+            switch(nChannels)
+            {
+                case 1:
+                    format = FMT_32; break;
+                case 2:
+                    format = FMT_32_32; break;
+                case 3:
+                    format = FMT_32_32_32; break;
+                case 4:
+                    format = FMT_32_32_32_32; break;
+                default:
+                    break;
+            }
+            break;
+
+        case GL_FLOAT:
+			uiElemSize = 4;
+			switch(nChannels)
+            {
+                case 1:
+                    format = FMT_32_FLOAT; break;
+                case 2:
+                    format = FMT_32_32_FLOAT; break;
+                case 3:
+                    format = FMT_32_32_32_FLOAT; break;
+                case 4:
+                    format = FMT_32_32_32_32_FLOAT; break;
+                default:
+                    break;
+            }
+			break;
+        case GL_DOUBLE:
+			uiElemSize = 8;
+            switch(nChannels)
+            {
+                case 1:
+                    format = FMT_32_FLOAT; break;
+                case 2:
+                    format = FMT_32_32_FLOAT; break;
+                case 3:
+                    format = FMT_32_32_32_FLOAT; break;
+                case 4:
+                    format = FMT_32_32_32_32_FLOAT; break;
+                default:
+                    break;
+            }
+            break;
+        default:
+			;
+            //GL_ASSERT_NO_CASE();
+    }
+
+    if(NULL != pClient_size)
+    {
+	    *pClient_size = uiElemSize * nChannels;
+    }
+
+    return(format);
+}
+
+unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm)
+{
+    if(pAsm->D.dst.op3)
+    {
+        return 3;
+    }
+
+    switch (pAsm->D.dst.opcode)
+    {
+    case SQ_OP2_INST_ADD:                          
+    case SQ_OP2_INST_MUL: 
+    case SQ_OP2_INST_MAX:
+    case SQ_OP2_INST_MIN:
+    //case SQ_OP2_INST_MAX_DX10:
+    //case SQ_OP2_INST_MIN_DX10:
+    case SQ_OP2_INST_SETGT:
+    case SQ_OP2_INST_SETGE:
+    case SQ_OP2_INST_PRED_SETE:
+    case SQ_OP2_INST_PRED_SETGT:
+    case SQ_OP2_INST_PRED_SETGE:
+    case SQ_OP2_INST_PRED_SETNE:
+    case SQ_OP2_INST_DOT4:
+    case SQ_OP2_INST_DOT4_IEEE:
+        return 2;  
+
+    case SQ_OP2_INST_MOV: 
+    case SQ_OP2_INST_FRACT:
+    case SQ_OP2_INST_FLOOR:
+    case SQ_OP2_INST_KILLGT:
+    case SQ_OP2_INST_EXP_IEEE:
+    case SQ_OP2_INST_LOG_CLAMPED:
+    case SQ_OP2_INST_LOG_IEEE:
+    case SQ_OP2_INST_RECIP_IEEE:
+    case SQ_OP2_INST_RECIPSQRT_IEEE:
+    case SQ_OP2_INST_FLT_TO_INT:
+    case SQ_OP2_INST_SIN:
+    case SQ_OP2_INST_COS:
+        return 1;
+        
+    default: r700_error(TODO_ASM_NEEDIMPINST, 
+                        "Need instruction operand number. \n");;
+    };
+
+    return 3;
+}
+
+int Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700_Shader* pShader)
+{
+    GLuint i;
+
+    Init_R700_Shader(pShader);
+    pAsm->pR700Shader = pShader;
+    pAsm->currentShaderType = spt;
+
+	pAsm->cf_last_export_ptr   = NULL;
+
+	pAsm->cf_current_export_clause_ptr = NULL;
+	pAsm->cf_current_alu_clause_ptr    = NULL;
+	pAsm->cf_current_tex_clause_ptr    = NULL;
+	pAsm->cf_current_vtx_clause_ptr    = NULL;
+	pAsm->cf_current_cf_clause_ptr     = NULL;
+
+	// No clause has been created yet
+	pAsm->cf_current_clause_type = CF_EMPTY_CLAUSE;
+
+	pAsm->number_of_colorandz_exports = 0;
+	pAsm->number_of_exports           = 0;
+	pAsm->number_of_export_opcodes    = 0;
+
+
+	pAsm->D.bits = 0;
+	pAsm->S[0].bits = 0;
+	pAsm->S[1].bits = 0;
+	pAsm->S[2].bits = 0;
+
+	pAsm->uLastPosUpdate = 0; 
+	
+	*(BITS *) &pAsm->fp_stOutFmt0 = 0;
+
+	pAsm->uIIns = 0;
+	pAsm->uOIns = 0;
+	pAsm->number_used_registers = 0;
+	pAsm->uUsedConsts = 256; 
+
+
+	// Fragment programs
+	pAsm->uBoolConsts = 0;
+	pAsm->uIntConsts = 0;
+	pAsm->uInsts = 0;
+	pAsm->uConsts = 0;
+
+	pAsm->FCSP = 0;
+	pAsm->fc_stack[0].type = FC_NONE;
+
+	pAsm->branch_depth     = 0;
+	pAsm->max_branch_depth = 0;
+
+	pAsm->aArgSubst[0] =
+	pAsm->aArgSubst[1] =
+	pAsm->aArgSubst[2] =
+	pAsm->aArgSubst[3] = (-1);
+
+	pAsm->uOutputs = 0;
+
+	for (i=0; i<NUMBER_OF_OUTPUT_COLORS; i++) 
+	{
+		pAsm->color_export_register_number[i] = (-1);
+	}
+
+
+	pAsm->depth_export_register_number = (-1);
+	pAsm->stencil_export_register_number = (-1);
+	pAsm->coverage_to_mask_export_register_number = (-1);
+	pAsm->mask_export_register_number = (-1);
+
+	pAsm->starting_export_register_number = 0;
+	pAsm->starting_vfetch_register_number = 0;
+	pAsm->starting_temp_register_number   = 0;
+	pAsm->uFirstHelpReg = 0;
+
+
+	pAsm->input_position_is_used = GL_FALSE;
+	pAsm->input_normal_is_used   = GL_FALSE;
+
+
+	for (i=0; i<NUMBER_OF_INPUT_COLORS; i++) 
+	{
+		pAsm->input_color_is_used[ i ] = GL_FALSE;
+	}
+
+	for (i=0; i<NUMBER_OF_TEXTURE_UNITS; i++) 
+	{
+		pAsm->input_texture_unit_is_used[ i ] = GL_FALSE;
+	}
+
+	for (i=0; i<VERT_ATTRIB_MAX; i++) 
+	{
+		pAsm->vfetch_instruction_ptr_array[ i ] = NULL;
+	}
+
+	pAsm->number_of_inputs = 0;
+
+	return 0;
+}
+
+GLboolean IsTex(gl_inst_opcode Opcode)
+{
+    if( (OPCODE_TEX==Opcode) || (OPCODE_TXP==Opcode) || (OPCODE_TXB==Opcode) )
+    {
+        return GL_TRUE;
+    }
+    return GL_FALSE;
+}
+
+GLboolean IsAlu(gl_inst_opcode Opcode)
+{
+    //TODO : more for fc and ex for higher spec.
+    if( IsTex(Opcode) )
+    {
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+int check_current_clause(r700_AssemblerBase* pAsm,
+					     CF_CLAUSE_TYPE      new_clause_type)
+{
+	if (pAsm->cf_current_clause_type != new_clause_type) 
+	{	//Close last open clause
+		switch (pAsm->cf_current_clause_type) 
+		{
+		case CF_ALU_CLAUSE:
+			if ( pAsm->cf_current_alu_clause_ptr != NULL) 
+            {
+                pAsm->cf_current_alu_clause_ptr = NULL;
+            }
+			break;
+		case CF_VTX_CLAUSE:
+			if ( pAsm->cf_current_vtx_clause_ptr != NULL) 
+            {
+                pAsm->cf_current_vtx_clause_ptr = NULL;
+            }
+			break;
+		case CF_TEX_CLAUSE:
+			if ( pAsm->cf_current_tex_clause_ptr != NULL) 
+            {
+                pAsm->cf_current_tex_clause_ptr = NULL;
+            }
+			break;
+		case CF_EXPORT_CLAUSE:
+			if ( pAsm->cf_current_export_clause_ptr != NULL) 
+            {
+                pAsm->cf_current_export_clause_ptr = NULL;
+            }
+			break;
+		case CF_OTHER_CLAUSE:
+			if ( pAsm->cf_current_cf_clause_ptr != NULL) 
+            {
+                pAsm->cf_current_cf_clause_ptr = NULL;
+            }
+			break;
+		case CF_EMPTY_CLAUSE:
+			break;
+		default:
+            r700_error(ERROR_ASM_VTX_CLAUSE,
+                       "Unknown CF_CLAUSE_TYPE (%d) in check_current_clause. \n", (int) new_clause_type);
+			return GL_FALSE;
+		}
+
+        pAsm->cf_current_clause_type = CF_EMPTY_CLAUSE;
+
+		// Create new clause
+        switch (new_clause_type) 
+	    {
+        case CF_ALU_CLAUSE:
+            pAsm->cf_current_clause_type = CF_ALU_CLAUSE;
+            break;
+        case CF_VTX_CLAUSE:
+            pAsm->cf_current_clause_type = CF_VTX_CLAUSE;
+            break;
+        case CF_TEX_CLAUSE:        
+            pAsm->cf_current_clause_type = CF_TEX_CLAUSE;
+            break;
+        case CF_EXPORT_CLAUSE:
+            {
+                R700ControlFlowSXClause* pR700ControlFlowSXClause 
+                            = (R700ControlFlowSXClause*) CALLOC_STRUCT(R700ControlFlowSXClause); 
+            
+                // Add new export instruction to control flow program        
+                if (pR700ControlFlowSXClause != 0) 
+                {
+                    pAsm->cf_current_export_clause_ptr = pR700ControlFlowSXClause;
+                    Init_R700ControlFlowSXClause(pR700ControlFlowSXClause);
+                    AddCFInstruction( pAsm->pR700Shader, 
+                                      (R700ControlFlowInstruction *)pR700ControlFlowSXClause );
+                }
+                else 
+                {
+                    r700_error(ERROR_ASM_ALLOCEXPORTCF,
+                               "Error allocating new EXPORT CF instruction in check_current_clause. \n");
+                    return GL_FALSE;
+                }
+                pAsm->cf_current_clause_type = CF_EXPORT_CLAUSE;
+            }
+            break;
+        case CF_EMPTY_CLAUSE:
+            break;
+        case CF_OTHER_CLAUSE:
+            pAsm->cf_current_clause_type = CF_OTHER_CLAUSE;
+            break;
+        default:
+            r700_error(ERROR_ASM_UNKOWNCLAUSE,
+                       "Unknown CF_CLAUSE_TYPE (%d) in check_current_clause. \n", (int) new_clause_type);
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean add_vfetch_instruction(r700_AssemblerBase*     pAsm,
+								 R700VertexInstruction*  vertex_instruction_ptr)
+{
+	if( GL_FALSE == check_current_clause(pAsm,  CF_VTX_CLAUSE) )
+	{
+		return GL_FALSE;
+	}
+
+    if( pAsm->cf_current_vtx_clause_ptr == NULL ||
+        ( (pAsm->cf_current_vtx_clause_ptr != NULL) && 
+         (pAsm->cf_current_vtx_clause_ptr->m_Word1.f.count >= GetCFMaxInstructions(pAsm->cf_current_vtx_clause_ptr->m_ShaderInstType)-1) 
+        ) ) 
+    { 
+		// Create new Vfetch control flow instruction for this new clause
+		pAsm->cf_current_vtx_clause_ptr = (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);
+
+		if (pAsm->cf_current_vtx_clause_ptr != NULL) 
+		{
+			Init_R700ControlFlowGenericClause(pAsm->cf_current_vtx_clause_ptr);
+			AddCFInstruction( pAsm->pR700Shader, 
+                              (R700ControlFlowInstruction *)pAsm->cf_current_vtx_clause_ptr );
+		}
+		else 
+		{
+            r700_error(ERROR_ASM_ALLOCVTXCF, "Could not allocate a new VFetch CF instruction.");
+			return GL_FALSE;
+		}
+
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.pop_count        = 0x0;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.cf_const         = 0x0;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.count            = 0x0;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_VTX;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+		LinkVertexInstruction(pAsm->cf_current_vtx_clause_ptr, vertex_instruction_ptr );
+	}
+	else
+	{
+		pAsm->cf_current_vtx_clause_ptr->m_Word1.f.count++;
+	}
+
+	AddVTXInstruction(pAsm->pR700Shader, vertex_instruction_ptr);
+
+	return GL_TRUE;
+}
+
+GLboolean add_tex_instruction(r700_AssemblerBase*     pAsm,
+                              R700TextureInstruction* tex_instruction_ptr)
+{ 
+    if ( GL_FALSE == check_current_clause(pAsm, CF_TEX_CLAUSE) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( pAsm->cf_current_tex_clause_ptr == NULL ||
+         ( (pAsm->cf_current_tex_clause_ptr != NULL) && 
+           (pAsm->cf_current_tex_clause_ptr->m_Word1.f.count >= GetCFMaxInstructions(pAsm->cf_current_tex_clause_ptr->m_ShaderInstType)-1) 
+         ) ) 
+    {
+        // new tex cf instruction for this new clause  
+        pAsm->cf_current_tex_clause_ptr = (R700ControlFlowGenericClause*) CALLOC_STRUCT(R700ControlFlowGenericClause);
+
+		if (pAsm->cf_current_tex_clause_ptr != NULL) 
+		{
+			Init_R700ControlFlowGenericClause(pAsm->cf_current_tex_clause_ptr);
+			AddCFInstruction( pAsm->pR700Shader, 
+                              (R700ControlFlowInstruction *)pAsm->cf_current_tex_clause_ptr );
+		}
+		else 
+		{
+            r700_error(ERROR_ASM_ALLOCTEXCF, "Could not allocate a new TEX CF instruction.");
+			return GL_FALSE;
+		}
+        
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.pop_count        = 0x0;
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.cf_const         = 0x0;
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.cond             = SQ_CF_COND_ACTIVE;
+
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_TEX;
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.barrier          = 0x0;   //0x1;
+    }
+    else 
+    {        
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.count++;
+    }
+
+    // If this clause constains any TEX instruction that is dependent on a previous instruction, 
+    // set the barrier bit
+    if( pAsm->pInstDeps[pAsm->uiCurInst].nDstDep > (-1) )
+    {
+        pAsm->cf_current_tex_clause_ptr->m_Word1.f.barrier = 0x1;  
+    }
+
+    if(NULL == pAsm->cf_current_tex_clause_ptr->m_pLinkedTEXInstruction)
+    {
+        pAsm->cf_current_tex_clause_ptr->m_pLinkedTEXInstruction = tex_instruction_ptr;
+        tex_instruction_ptr->m_pLinkedGenericClause = pAsm->cf_current_tex_clause_ptr;
+    }
+
+    AddTEXInstruction(pAsm->pR700Shader, tex_instruction_ptr);
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_vfetch_instruction(r700_AssemblerBase* pAsm,
+								GLuint gl_client_id,
+                                GLuint destination_register,
+								GLuint number_of_elements,
+                                GLenum dataElementType,
+								VTX_FETCH_METHOD* pFetchMethod)
+{
+    GLuint client_size_inbyte;
+	GLuint data_format;
+    GLuint mega_fetch_count;
+	GLuint is_mega_fetch_flag;
+
+	R700VertexGenericFetch*   vfetch_instruction_ptr;
+	R700VertexGenericFetch*   assembled_vfetch_instruction_ptr = pAsm->vfetch_instruction_ptr_array[ gl_client_id ];
+
+	if (assembled_vfetch_instruction_ptr == NULL) 
+	{
+		vfetch_instruction_ptr = (R700VertexGenericFetch*) CALLOC_STRUCT(R700VertexGenericFetch);
+		if (vfetch_instruction_ptr == NULL) 
+		{
+			return GL_FALSE;
+		}
+        Init_R700VertexGenericFetch(vfetch_instruction_ptr);
+    }
+	else 
+	{
+		vfetch_instruction_ptr = assembled_vfetch_instruction_ptr;
+	}
+
+	data_format = GetSurfaceFormat(dataElementType, number_of_elements, &client_size_inbyte);
+
+	if(GL_TRUE == pFetchMethod->bEnableMini) //More conditions here
+	{
+		//TODO : mini fetch
+	}
+	else
+	{
+		mega_fetch_count = MEGA_FETCH_BYTES - 1;
+		is_mega_fetch_flag       = 0x1;
+		pFetchMethod->mega_fetch_remainder = MEGA_FETCH_BYTES - client_size_inbyte;
+	}
+
+	vfetch_instruction_ptr->m_Word0.f.vtx_inst         = SQ_VTX_INST_FETCH;
+	vfetch_instruction_ptr->m_Word0.f.fetch_type       = SQ_VTX_FETCH_VERTEX_DATA;
+	vfetch_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
+
+	vfetch_instruction_ptr->m_Word0.f.buffer_id        = gl_client_id;
+	vfetch_instruction_ptr->m_Word0.f.src_gpr          = 0x0; 
+	vfetch_instruction_ptr->m_Word0.f.src_rel          = SQ_ABSOLUTE;
+	vfetch_instruction_ptr->m_Word0.f.src_sel_x        = SQ_SEL_X;
+	vfetch_instruction_ptr->m_Word0.f.mega_fetch_count = mega_fetch_count;
+
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_x        = (number_of_elements < 1) ? SQ_SEL_0 : SQ_SEL_X;
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_y        = (number_of_elements < 2) ? SQ_SEL_0 : SQ_SEL_Y;
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_z        = (number_of_elements < 3) ? SQ_SEL_0 : SQ_SEL_Z;
+	vfetch_instruction_ptr->m_Word1.f.dst_sel_w        = (number_of_elements < 4) ? SQ_SEL_1 : SQ_SEL_W;
+
+	vfetch_instruction_ptr->m_Word1.f.use_const_fields = 1;
+
+	// Destination register
+	vfetch_instruction_ptr->m_Word1_GPR.f.dst_gpr = destination_register; 
+	vfetch_instruction_ptr->m_Word1_GPR.f.dst_rel = SQ_ABSOLUTE;
+
+	vfetch_instruction_ptr->m_Word2.f.offset              = 0;
+	vfetch_instruction_ptr->m_Word2.f.const_buf_no_stride = 0x0;
+
+	vfetch_instruction_ptr->m_Word2.f.mega_fetch          = is_mega_fetch_flag;
+
+	if (assembled_vfetch_instruction_ptr == NULL) 
+	{
+		if ( GL_FALSE == add_vfetch_instruction(pAsm, (R700VertexInstruction *)vfetch_instruction_ptr) ) 
+        {   
+			return GL_FALSE;
+		}
+
+		if (pAsm->vfetch_instruction_ptr_array[ gl_client_id ] != NULL) 
+		{
+			return GL_FALSE;
+		}
+		else 
+		{
+			pAsm->vfetch_instruction_ptr_array[ gl_client_id ] = vfetch_instruction_ptr;
+		}
+	}
+
+	return GL_TRUE;
+}
+
+GLuint gethelpr(r700_AssemblerBase* pAsm) 
+{
+    GLuint r = pAsm->uHelpReg;
+    pAsm->uHelpReg++;
+    if (pAsm->uHelpReg > pAsm->number_used_registers)
+    {
+        pAsm->number_used_registers = pAsm->uHelpReg;
+	}
+    return r;
+}
+void resethelpr(r700_AssemblerBase* pAsm) 
+{
+    pAsm->uHelpReg = pAsm->uFirstHelpReg;
+}
+
+void checkop_init(r700_AssemblerBase* pAsm)
+{
+    resethelpr(pAsm);
+    pAsm->aArgSubst[0] =
+    pAsm->aArgSubst[1] =
+    pAsm->aArgSubst[2] =
+    pAsm->aArgSubst[3] = -1;
+}
+
+GLboolean mov_temp(r700_AssemblerBase* pAsm, int src)
+{
+    GLuint tmp = gethelpr(pAsm);
+
+    //mov src to temp helper gpr.
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+  
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = tmp;
+
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    if( GL_FALSE == assemble_src(pAsm, src, 0) )
+    {
+        return GL_FALSE;
+    }
+
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+    noneg_PVSSRC(&(pAsm->S[0].src));
+   
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->aArgSubst[1 + src] = tmp;
+
+    return GL_TRUE;
+}
+
+GLboolean checkop1(r700_AssemblerBase* pAsm)
+{
+    checkop_init(pAsm);
+    return GL_TRUE;
+}
+
+GLboolean checkop2(r700_AssemblerBase* pAsm)
+{
+    GLboolean bSrcConst[2];
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    checkop_init(pAsm);
+
+    if( (pILInst->SrcReg[0].File == PROGRAM_CONSTANT)    ||
+        (pILInst->SrcReg[0].File == PROGRAM_LOCAL_PARAM) ||
+        (pILInst->SrcReg[0].File == PROGRAM_ENV_PARAM)   ||
+        (pILInst->SrcReg[0].File == PROGRAM_STATE_VAR) )
+    {
+        bSrcConst[0] = GL_TRUE;
+    }
+    else
+    {
+        bSrcConst[0] = GL_FALSE;
+    }
+    if( (pILInst->SrcReg[1].File == PROGRAM_CONSTANT)    ||
+        (pILInst->SrcReg[1].File == PROGRAM_LOCAL_PARAM) ||
+        (pILInst->SrcReg[1].File == PROGRAM_ENV_PARAM)   ||
+        (pILInst->SrcReg[1].File == PROGRAM_STATE_VAR) )
+    {
+        bSrcConst[1] = GL_TRUE;
+    }
+    else
+    {
+        bSrcConst[1] = GL_FALSE;
+    }
+
+    if( (bSrcConst[0] == GL_TRUE) && (bSrcConst[1] == GL_TRUE) )
+    {
+        if(pILInst->SrcReg[0].Index != pILInst->SrcReg[1].Index)
+        {
+            if( GL_FALSE == mov_temp(pAsm, 1) )
+            {
+                return GL_FALSE;
+            }
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean checkop3(r700_AssemblerBase* pAsm)
+{
+    GLboolean bSrcConst[3];
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    checkop_init(pAsm);
+
+    if( (pILInst->SrcReg[0].File == PROGRAM_CONSTANT)    ||
+        (pILInst->SrcReg[0].File == PROGRAM_LOCAL_PARAM) ||
+        (pILInst->SrcReg[0].File == PROGRAM_ENV_PARAM)   ||
+        (pILInst->SrcReg[0].File == PROGRAM_STATE_VAR) )
+    {
+        bSrcConst[0] = GL_TRUE;
+    }
+    else
+    {
+        bSrcConst[0] = GL_FALSE;
+    }
+    if( (pILInst->SrcReg[1].File == PROGRAM_CONSTANT)    ||
+        (pILInst->SrcReg[1].File == PROGRAM_LOCAL_PARAM) ||
+        (pILInst->SrcReg[1].File == PROGRAM_ENV_PARAM)   ||
+        (pILInst->SrcReg[1].File == PROGRAM_STATE_VAR) )
+    {
+        bSrcConst[1] = GL_TRUE;
+    }
+    else
+    {
+        bSrcConst[1] = GL_FALSE;
+    }
+    if( (pILInst->SrcReg[2].File == PROGRAM_CONSTANT)    ||
+        (pILInst->SrcReg[2].File == PROGRAM_LOCAL_PARAM) ||
+        (pILInst->SrcReg[2].File == PROGRAM_ENV_PARAM)   ||
+        (pILInst->SrcReg[2].File == PROGRAM_STATE_VAR) )
+    {
+        bSrcConst[2] = GL_TRUE;
+    }
+    else
+    {
+        bSrcConst[2] = GL_FALSE;
+    }
+
+    if( (GL_TRUE == bSrcConst[0]) && 
+        (GL_TRUE == bSrcConst[1]) && 
+        (GL_TRUE == bSrcConst[2]) ) 
+    {
+        if( GL_FALSE == mov_temp(pAsm, 1) )
+        {
+            return GL_FALSE;
+        }
+        if( GL_FALSE == mov_temp(pAsm, 2) )
+        {
+            return GL_FALSE;
+        }
+
+        return GL_TRUE;
+    }
+    else if( (GL_TRUE == bSrcConst[0]) && 
+             (GL_TRUE == bSrcConst[1]) ) 
+    {
+        if(pILInst->SrcReg[0].Index != pILInst->SrcReg[1].Index)    
+	    {
+            if( GL_FALSE == mov_temp(pAsm, 1) )
+            {
+                return 1;
+            }
+        }
+
+        return GL_TRUE;
+    }
+    else if ( (GL_TRUE == bSrcConst[0]) && 
+              (GL_TRUE == bSrcConst[2]) )  
+    {
+        if(pILInst->SrcReg[0].Index != pILInst->SrcReg[2].Index)     
+	    {
+            if( GL_FALSE == mov_temp(pAsm, 2) )
+            {
+                return GL_FALSE;
+            }
+        }
+
+        return GL_TRUE;
+    }
+    else if( (GL_TRUE == bSrcConst[1]) && 
+             (GL_TRUE == bSrcConst[2]) ) 
+    {
+        if(pILInst->SrcReg[1].Index != pILInst->SrcReg[2].Index)
+	    {
+            if( GL_FALSE == mov_temp(pAsm, 2) )
+            {
+                return GL_FALSE;
+            }
+        }
+
+        return GL_TRUE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_src(r700_AssemblerBase *pAsm,
+                       int src, 
+                       int fld)
+{
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    if (fld == -1)
+    {
+        fld = src;
+    }
+
+    if(pAsm->aArgSubst[1+src] >= 0) 
+    {
+        setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);
+        pAsm->S[fld].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[fld].src.reg   = pAsm->aArgSubst[1+src];
+    }
+    else 
+    {
+        switch (pILInst->SrcReg[src].File)
+        {
+        case PROGRAM_TEMPORARY:
+            setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);
+            pAsm->S[fld].src.rtype = SRC_REG_TEMPORARY;
+            pAsm->S[fld].src.reg = pILInst->SrcReg[src].Index + pAsm->starting_temp_register_number;
+            break;
+        case PROGRAM_CONSTANT:
+        case PROGRAM_LOCAL_PARAM:
+        case PROGRAM_ENV_PARAM:
+        case PROGRAM_STATE_VAR:
+            if (1 == pILInst->SrcReg[src].RelAddr)
+            {
+                setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_RELATIVE_A0);
+            }
+            else
+            {
+                setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);              
+            }
+
+            pAsm->S[fld].src.rtype = SRC_REG_CONSTANT;
+            pAsm->S[fld].src.reg   = pILInst->SrcReg[src].Index;
+            break;      
+        case PROGRAM_INPUT:
+            setaddrmode_PVSSRC(&(pAsm->S[fld].src), ADDR_ABSOLUTE);
+            pAsm->S[fld].src.rtype = SRC_REG_INPUT;
+            switch (pAsm->currentShaderType)
+            {
+            case SPT_FP:
+                pAsm->S[fld].src.reg = pAsm->uiFP_AttributeMap[pILInst->SrcReg[src].Index];
+                break;
+            case SPT_VP:
+                pAsm->S[fld].src.reg = pAsm->ucVP_AttributeMap[pILInst->SrcReg[src].Index];
+                break;
+            }
+            break;      
+        default:
+            r700_error(ERROR_ASM_SRCARGUMENT, "Invalid source argument type");          
+            return GL_FALSE;
+        }
+    } 
+
+    pAsm->S[fld].src.swizzlex = pILInst->SrcReg[src].Swizzle & 0x7;
+    pAsm->S[fld].src.swizzley = (pILInst->SrcReg[src].Swizzle >> 3) & 0x7;
+    pAsm->S[fld].src.swizzlez = (pILInst->SrcReg[src].Swizzle >> 6) & 0x7;
+    pAsm->S[fld].src.swizzlew = (pILInst->SrcReg[src].Swizzle >> 9) & 0x7;
+
+    pAsm->S[fld].src.negx = pILInst->SrcReg[src].Negate & 0x1;
+    pAsm->S[fld].src.negy = (pILInst->SrcReg[src].Negate >> 1) & 0x1;
+    pAsm->S[fld].src.negz = (pILInst->SrcReg[src].Negate >> 2) & 0x1;
+    pAsm->S[fld].src.negw = (pILInst->SrcReg[src].Negate >> 3) & 0x1;
+     
+    return GL_TRUE;
+}
+
+GLboolean assemble_dst(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+    switch (pILInst->DstReg.File) 
+    {
+    case PROGRAM_TEMPORARY:
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg = pILInst->DstReg.Index + pAsm->starting_temp_register_number;
+        break;
+    case PROGRAM_ADDRESS:
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_A0;
+        pAsm->D.dst.reg = 0;
+        break;
+    case PROGRAM_OUTPUT:
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_OUT;
+        switch (pAsm->currentShaderType)
+        {
+        case SPT_FP:
+            pAsm->D.dst.reg = pAsm->uiFP_OutputMap[pILInst->DstReg.Index];
+            break;
+        case SPT_VP:
+            pAsm->D.dst.reg = pAsm->ucVP_OutputMap[pILInst->DstReg.Index];
+            break;
+        }
+        break;   
+    default:
+        r700_error(ERROR_ASM_DSTARGUMENT, "Invalid destination output argument type");
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.writex = pILInst->DstReg.WriteMask & 0x1;
+    pAsm->D.dst.writey = (pILInst->DstReg.WriteMask >> 1) & 0x1;
+    pAsm->D.dst.writez = (pILInst->DstReg.WriteMask >> 2) & 0x1;
+    pAsm->D.dst.writew = (pILInst->DstReg.WriteMask >> 3) & 0x1;
+  
+    return GL_TRUE;
+}
+
+GLboolean tex_dst(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    if(PROGRAM_TEMPORARY == pILInst->DstReg.File)
+    {
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = pAsm->pILInst[pAsm->uiCurInst].DstReg.Index + pAsm->starting_temp_register_number;
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    }
+    else if(PROGRAM_OUTPUT == pILInst->DstReg.File)
+    {
+        pAsm->D.dst.rtype = DST_REG_OUT;
+        switch (pAsm->currentShaderType)
+        {
+        case SPT_FP:
+            pAsm->D.dst.reg = pAsm->uiFP_OutputMap[pILInst->DstReg.Index];
+            break;
+        case SPT_VP:
+            pAsm->D.dst.reg = pAsm->ucVP_OutputMap[pILInst->DstReg.Index];
+            break;
+        }
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    }
+    else 
+    {
+        r700_error(ERROR_ASM_DSTARGUMENT, "Invalid destination output argument type");
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.writex = pILInst->DstReg.WriteMask & 0x1;
+    pAsm->D.dst.writey = (pILInst->DstReg.WriteMask >> 1) & 0x1;
+    pAsm->D.dst.writez = (pILInst->DstReg.WriteMask >> 2) & 0x1;
+    pAsm->D.dst.writew = (pILInst->DstReg.WriteMask >> 3) & 0x1;
+  
+    return GL_TRUE;
+}
+
+GLboolean tex_src(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+ 
+    GLboolean bValidTexCoord = GL_FALSE;
+
+    switch (pILInst->SrcReg[0].File)
+    {
+    case PROGRAM_TEMPORARY:
+        bValidTexCoord = GL_TRUE;
+
+        pAsm->S[0].src.reg   = pILInst->SrcReg[0].Index + pAsm->starting_temp_register_number;
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+
+        break;
+    case PROGRAM_INPUT:
+        switch (pILInst->SrcReg[0].Index)
+        {
+        case FRAG_ATTRIB_COL0:
+        case FRAG_ATTRIB_COL1:
+        case FRAG_ATTRIB_TEX0:
+        case FRAG_ATTRIB_TEX1:
+        case FRAG_ATTRIB_TEX2:
+        case FRAG_ATTRIB_TEX3:
+        case FRAG_ATTRIB_TEX4:
+        case FRAG_ATTRIB_TEX5:
+        case FRAG_ATTRIB_TEX6:
+        case FRAG_ATTRIB_TEX7:
+            bValidTexCoord = GL_TRUE;
+
+            pAsm->S[0].src.reg   = pAsm->uiFP_AttributeMap[pILInst->SrcReg[0].Index];
+            pAsm->S[0].src.rtype = SRC_REG_INPUT;
+        }
+        break;
+    }
+
+    if(GL_TRUE == bValidTexCoord)
+    { 
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    }
+    else
+    {
+        r700_error(ERROR_ASM_BADTEXSRC, "Invalid source texcoord for TEX instruction");
+        return GL_FALSE;
+    }
+
+    pAsm->S[0].src.swizzlex = pILInst->SrcReg[0].Swizzle & 0x7;
+    pAsm->S[0].src.swizzley = (pILInst->SrcReg[0].Swizzle >> 3) & 0x7;
+    pAsm->S[0].src.swizzlez = (pILInst->SrcReg[0].Swizzle >> 6) & 0x7;
+    pAsm->S[0].src.swizzlew = (pILInst->SrcReg[0].Swizzle >> 9) & 0x7;
+
+    pAsm->S[0].src.negx = pILInst->SrcReg[0].Negate & 0x1;
+    pAsm->S[0].src.negy = (pILInst->SrcReg[0].Negate >> 1) & 0x1;
+    pAsm->S[0].src.negz = (pILInst->SrcReg[0].Negate >> 2) & 0x1;
+    pAsm->S[0].src.negw = (pILInst->SrcReg[0].Negate >> 3) & 0x1;
+     
+    return GL_TRUE;
+}
+
+GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm)
+{
+    PVSSRC *   texture_coordinate_source;
+    PVSSRC *   texture_unit_source;
+    
+    R700TextureInstruction* tex_instruction_ptr = (R700TextureInstruction*) CALLOC_STRUCT(R700TextureInstruction);
+	if (tex_instruction_ptr == NULL) 
+	{
+		return GL_FALSE;
+	}
+    Init_R700TextureInstruction(tex_instruction_ptr);
+
+    texture_coordinate_source = &(pAsm->S[0].src);
+    texture_unit_source       = &(pAsm->S[1].src);
+
+    tex_instruction_ptr->m_Word0.f.tex_inst         = pAsm->D.dst.opcode;
+    tex_instruction_ptr->m_Word0.f.bc_frac_mode     = 0x0;
+    tex_instruction_ptr->m_Word0.f.fetch_whole_quad = 0x0;
+
+    tex_instruction_ptr->m_Word0.f.resource_id      = texture_unit_source->reg;
+
+    tex_instruction_ptr->m_Word1.f.lod_bias     = 0x0;
+    tex_instruction_ptr->m_Word1.f.coord_type_x = SQ_TEX_NORMALIZED;
+    tex_instruction_ptr->m_Word1.f.coord_type_y = SQ_TEX_NORMALIZED;
+    tex_instruction_ptr->m_Word1.f.coord_type_z = SQ_TEX_NORMALIZED;
+    tex_instruction_ptr->m_Word1.f.coord_type_w = SQ_TEX_NORMALIZED;
+
+    tex_instruction_ptr->m_Word2.f.offset_x   = 0x0;
+    tex_instruction_ptr->m_Word2.f.offset_y   = 0x0;
+    tex_instruction_ptr->m_Word2.f.offset_z   = 0x0;
+
+    tex_instruction_ptr->m_Word2.f.sampler_id = texture_unit_source->reg;
+
+    // dst
+    if ( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) || 
+         (pAsm->D.dst.rtype == DST_REG_OUT) ) 
+    {
+        tex_instruction_ptr->m_Word0.f.src_gpr    = texture_coordinate_source->reg;
+        tex_instruction_ptr->m_Word0.f.src_rel    = SQ_ABSOLUTE;
+
+        tex_instruction_ptr->m_Word1.f.dst_gpr    = pAsm->D.dst.reg;
+        tex_instruction_ptr->m_Word1.f.dst_rel    = SQ_ABSOLUTE;
+
+        tex_instruction_ptr->m_Word1.f.dst_sel_x  = (pAsm->D.dst.writex ? texture_unit_source->swizzlex : SQ_SEL_MASK);
+        tex_instruction_ptr->m_Word1.f.dst_sel_y  = (pAsm->D.dst.writey ? texture_unit_source->swizzley : SQ_SEL_MASK);
+        tex_instruction_ptr->m_Word1.f.dst_sel_z  = (pAsm->D.dst.writez ? texture_unit_source->swizzlez : SQ_SEL_MASK);
+        tex_instruction_ptr->m_Word1.f.dst_sel_w  = (pAsm->D.dst.writew ? texture_unit_source->swizzlew : SQ_SEL_MASK);
+
+
+        tex_instruction_ptr->m_Word2.f.src_sel_x  = texture_coordinate_source->swizzlex;
+        tex_instruction_ptr->m_Word2.f.src_sel_y  = texture_coordinate_source->swizzley;
+        tex_instruction_ptr->m_Word2.f.src_sel_z  = texture_coordinate_source->swizzlez;
+        tex_instruction_ptr->m_Word2.f.src_sel_w  = texture_coordinate_source->swizzlew;
+    }
+    else 
+    {
+        r700_error(ERROR_ASM_TEXDSTBADTYPE, "Only temp destination registers supported for TEX dest regs.");
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == add_tex_instruction(pAsm, tex_instruction_ptr) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+void initialize(r700_AssemblerBase *pAsm)
+{
+    GLuint cycle, component;
+
+    for (cycle=0; cycle<NUMBER_OF_CYCLES; cycle++) 
+    {
+        for (component=0; component<NUMBER_OF_COMPONENTS; component++) 
+        {
+            pAsm->hw_gpr[cycle][component] = (-1);
+        }
+    }
+    for (component=0; component<NUMBER_OF_COMPONENTS; component++) 
+    {
+        pAsm->hw_cfile_addr[component] = (-1);
+        pAsm->hw_cfile_chan[component] = (-1);
+    }
+}
+
+GLboolean assemble_alu_src(R700ALUInstruction*  alu_instruction_ptr,
+                           int                  source_index,
+                           PVSSRC*              pSource,
+                           BITS                 scalar_channel_index)
+{
+    BITS src_sel;
+    BITS src_rel;
+    BITS src_chan;
+    BITS src_neg;
+
+    //--------------------------------------------------------------------------
+    // Source for operands src0, src1. 
+    // Values [0,127] correspond to GPR[0..127]. 
+    // Values [256,511] correspond to cfile constants c[0..255]. 
+
+    //--------------------------------------------------------------------------
+    // Other special values are shown in the list below.
+
+    // 248	SQ_ALU_SRC_0: special constant 0.0.
+    // 249	SQ_ALU_SRC_1: special constant 1.0 float.
+
+    // 250	SQ_ALU_SRC_1_INT: special constant 1 integer.
+    // 251	SQ_ALU_SRC_M_1_INT: special constant -1 integer.
+
+    // 252	SQ_ALU_SRC_0_5: special constant 0.5 float.
+    // 253	SQ_ALU_SRC_LITERAL: literal constant.
+
+    // 254	SQ_ALU_SRC_PV: previous vector result.
+    // 255	SQ_ALU_SRC_PS: previous scalar result.
+    //--------------------------------------------------------------------------
+
+    BITS channel_swizzle;
+    switch (scalar_channel_index) 
+    {
+        case 0: channel_swizzle = pSource->swizzlex; break;
+        case 1: channel_swizzle = pSource->swizzley; break;
+        case 2: channel_swizzle = pSource->swizzlez; break;
+        case 3: channel_swizzle = pSource->swizzlew; break;
+        default: channel_swizzle = SQ_SEL_MASK; break;
+    }
+
+    if(channel_swizzle == SQ_SEL_0) 
+    {
+        src_sel = SQ_ALU_SRC_0; 
+    }
+    else if (channel_swizzle == SQ_SEL_1) 
+    {
+        src_sel = SQ_ALU_SRC_1; 
+    }
+    else 
+    {
+        if ( (pSource->rtype == SRC_REG_TEMPORARY) || 
+             (pSource->rtype == SRC_REG_INPUT)
+        ) 
+        {
+            src_sel = pSource->reg;
+        }
+        else if (pSource->rtype == SRC_REG_CONSTANT)
+        {
+            src_sel = pSource->reg + CFILE_REGISTER_OFFSET;            
+        }
+        else
+        {
+            r700_error(ERROR_ASM_ALUSRCBADTYPE, "Source (%d) register type (%d) not one of TEMP, INPUT, or CONSTANT.", 
+                     source_index, pSource->rtype);
+            return GL_FALSE;
+        }
+    }
+
+    if( ADDR_ABSOLUTE == addrmode_PVSSRC(pSource) ) 
+    {
+        src_rel = SQ_ABSOLUTE;
+    }
+    else 
+    {
+        src_rel = SQ_RELATIVE;
+    }
+
+    switch (channel_swizzle) 
+    {
+        case SQ_SEL_X: 
+            src_chan = SQ_CHAN_X; 
+            break;
+        case SQ_SEL_Y: 
+            src_chan = SQ_CHAN_Y; 
+            break;
+        case SQ_SEL_Z: 
+            src_chan = SQ_CHAN_Z; 
+            break;
+        case SQ_SEL_W: 
+            src_chan = SQ_CHAN_W; 
+            break;
+        case SQ_SEL_0:
+        case SQ_SEL_1:
+            // Does not matter since src_sel controls
+            src_chan = SQ_CHAN_X; 
+            break;
+        default:
+            r700_error(ERROR_ASM_ALUSRCSELECT, "Unknown source select value (%d) in assemble_alu_src().");
+            return GL_FALSE;
+            break;
+    }
+
+    switch (scalar_channel_index) 
+    {
+        case 0: src_neg = pSource->negx; break;
+        case 1: src_neg = pSource->negy; break;
+        case 2: src_neg = pSource->negz; break;
+        case 3: src_neg = pSource->negw; break;
+        default: src_neg = 0; break;
+    }
+
+    switch (source_index) 
+    {
+        case 0:
+            alu_instruction_ptr->m_Word0.f.src0_sel  = src_sel;
+            alu_instruction_ptr->m_Word0.f.src0_rel  = src_rel;
+            alu_instruction_ptr->m_Word0.f.src0_chan = src_chan;
+            alu_instruction_ptr->m_Word0.f.src0_neg  = src_neg;
+            break;
+        case 1:
+            alu_instruction_ptr->m_Word0.f.src1_sel  = src_sel;
+            alu_instruction_ptr->m_Word0.f.src1_rel  = src_rel;
+            alu_instruction_ptr->m_Word0.f.src1_chan = src_chan;
+            alu_instruction_ptr->m_Word0.f.src1_neg  = src_neg;
+            break;
+        case 2:
+            alu_instruction_ptr->m_Word1_OP3.f.src2_sel  = src_sel;
+            alu_instruction_ptr->m_Word1_OP3.f.src2_rel  = src_rel;
+            alu_instruction_ptr->m_Word1_OP3.f.src2_chan = src_chan;
+            alu_instruction_ptr->m_Word1_OP3.f.src2_neg  = src_neg;
+            break;
+        default:
+            r700_error(ERROR_ASM_ALUSRCNUMBER, "Only three sources allowed in ALU opcodes.");
+          return GL_FALSE;
+          break;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
+                              R700ALUInstruction* alu_instruction_ptr,
+                              GLuint              contiguous_slots_needed)
+{
+    if( GL_FALSE == check_current_clause(pAsm, CF_ALU_CLAUSE) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( pAsm->cf_current_alu_clause_ptr == NULL ||
+         ( (pAsm->cf_current_alu_clause_ptr != NULL) && 
+           (pAsm->cf_current_alu_clause_ptr->m_Word1.f.count >= (GetCFMaxInstructions(pAsm->cf_current_alu_clause_ptr->m_ShaderInstType)-contiguous_slots_needed-1) )
+         ) ) 
+    {
+
+        //new cf inst for this clause
+        pAsm->cf_current_alu_clause_ptr = (R700ControlFlowALUClause*) CALLOC_STRUCT(R700ControlFlowALUClause);
+            
+        // link the new cf to cf segment    
+        if(NULL != pAsm->cf_current_alu_clause_ptr) 
+        {
+            Init_R700ControlFlowALUClause(pAsm->cf_current_alu_clause_ptr);
+			AddCFInstruction( pAsm->pR700Shader, 
+                              (R700ControlFlowInstruction *)pAsm->cf_current_alu_clause_ptr );            
+        }
+        else 
+        {
+            r700_error(ERROR_ASM_ALLOCALUCF, "Could not allocate a new ALU CF instruction.");
+            return GL_FALSE;
+        }
+
+        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_bank0 = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_bank1 = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word0.f.kcache_mode0 = SQ_CF_KCACHE_NOP;
+
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_mode1 = SQ_CF_KCACHE_NOP;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr0 = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.kcache_addr1 = 0x0;
+
+        //cf_current_alu_clause_ptr->m_Word1.f.count           = number_of_scalar_operations - 1;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count           = 0x0;
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.cf_inst         = SQ_CF_INST_ALU;
+
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.whole_quad_mode = 0x0;
+
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.barrier         = 0x1;
+    }
+    else 
+    {
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.count++;
+    }
+
+    // If this clause constains any instruction that is forward dependent on a TEX instruction, 
+    // set the whole_quad_mode for this clause
+    if ( pAsm->pInstDeps[pAsm->uiCurInst].nDstDep > (-1) ) 
+    {
+        pAsm->cf_current_alu_clause_ptr->m_Word1.f.whole_quad_mode = 0x1;   
+    }
+
+    if (pAsm->cf_current_alu_clause_ptr->m_Word1.f.count >= (GetCFMaxInstructions(pAsm->cf_current_alu_clause_ptr->m_ShaderInstType)-1) ) 
+    {
+        alu_instruction_ptr->m_Word0.f.last = 1;
+    }
+
+    if(NULL == pAsm->cf_current_alu_clause_ptr->m_pLinkedALUInstruction)
+    {
+        pAsm->cf_current_alu_clause_ptr->m_pLinkedALUInstruction = alu_instruction_ptr;
+        alu_instruction_ptr->m_pLinkedALUClause = pAsm->cf_current_alu_clause_ptr;
+    }
+    
+    AddALUInstruction(pAsm->pR700Shader, alu_instruction_ptr);
+
+    return GL_TRUE;
+}
+
+void get_src_properties(R700ALUInstruction*  alu_instruction_ptr,
+                        int                  source_index,
+                        BITS*                psrc_sel,
+                        BITS*                psrc_rel,
+                        BITS*                psrc_chan,
+                        BITS*                psrc_neg)
+{
+    switch (source_index) 
+    {
+        case 0:
+            *psrc_sel  = alu_instruction_ptr->m_Word0.f.src0_sel ;
+            *psrc_rel  = alu_instruction_ptr->m_Word0.f.src0_rel ;
+            *psrc_chan = alu_instruction_ptr->m_Word0.f.src0_chan;
+            *psrc_neg  = alu_instruction_ptr->m_Word0.f.src0_neg ;
+            break;
+
+        case 1:
+            *psrc_sel  = alu_instruction_ptr->m_Word0.f.src1_sel ;
+            *psrc_rel  = alu_instruction_ptr->m_Word0.f.src1_rel ;
+            *psrc_chan = alu_instruction_ptr->m_Word0.f.src1_chan;
+            *psrc_neg  = alu_instruction_ptr->m_Word0.f.src1_neg ;
+            break;
+
+        case 2:
+            *psrc_sel  = alu_instruction_ptr->m_Word1_OP3.f.src2_sel;
+            *psrc_rel  = alu_instruction_ptr->m_Word1_OP3.f.src2_rel;
+            *psrc_chan = alu_instruction_ptr->m_Word1_OP3.f.src2_chan;
+            *psrc_neg  = alu_instruction_ptr->m_Word1_OP3.f.src2_neg;
+            break;
+    }
+}
+
+int is_cfile(BITS sel) 
+{
+    if (sel > 255 && sel < 512) 
+    {
+        return 1;
+    }
+    return 0;
+}
+
+int is_const(BITS sel) 
+{
+    if (is_cfile(sel)) 
+    {
+        return 1;
+    }
+    else if(sel >= SQ_ALU_SRC_0 && sel <= SQ_ALU_SRC_LITERAL) 
+    {
+        return 1;
+    }
+    return 0;
+}
+
+int is_gpr(BITS sel) 
+{
+    if (sel >= 0 && sel < 128) 
+    {
+        return 1;
+    }
+    return 0;
+}
+
+const GLuint BANK_SWIZZLE_VEC[8] = {SQ_ALU_VEC_210,  //000
+                                    SQ_ALU_VEC_120,  //001
+                                    SQ_ALU_VEC_102,  //010
+
+                                    SQ_ALU_VEC_201,  //011
+                                    SQ_ALU_VEC_012,  //100
+                                    SQ_ALU_VEC_021,  //101
+
+                                    SQ_ALU_VEC_012,  //110
+                                    SQ_ALU_VEC_012}; //111
+
+const GLuint BANK_SWIZZLE_SCL[8] = {SQ_ALU_SCL_210,  //000
+                                    SQ_ALU_SCL_122,  //001 
+                                    SQ_ALU_SCL_122,  //010
+
+                                    SQ_ALU_SCL_221,  //011
+                                    SQ_ALU_SCL_212,  //100
+                                    SQ_ALU_SCL_122,  //101
+
+                                    SQ_ALU_SCL_122,  //110
+                                    SQ_ALU_SCL_122}; //111
+
+GLboolean reserve_cfile(r700_AssemblerBase* pAsm, 
+                        GLuint sel, 
+                        GLuint chan)
+{
+    int res_match = (-1);
+    int res_empty = (-1);
+
+    GLint res;
+
+    for (res=3; res>=0; res--) 
+    {
+        if(pAsm->hw_cfile_addr[ res] < 0)  
+        {
+            res_empty = res;
+        }
+        else if( (pAsm->hw_cfile_addr[res] == (int)sel)
+                 &&
+                 (pAsm->hw_cfile_chan[ res ] == (int) chan) ) 
+        {
+            res_match = res;
+        }
+    }
+
+    if(res_match >= 0) 
+    {
+        // Read for this scalar component already reserved, nothing to do here.
+        ;
+    }
+    else if(res_empty >= 0) 
+    {
+        pAsm->hw_cfile_addr[ res_empty ] = sel;
+        pAsm->hw_cfile_chan[ res_empty ] = chan;
+    }
+    else 
+    {
+        r700_error(ERROR_ASM_CONSTCHANNEL, "All cfile read ports are used, cannot reference C$sel, channel $chan.");
+        return GL_FALSE;
+    }
+    return GL_TRUE;
+}
+
+GLboolean reserve_gpr(r700_AssemblerBase* pAsm, GLuint sel, GLuint chan, GLuint cycle)
+{
+    if(pAsm->hw_gpr[cycle][chan] < 0) 
+    {
+        pAsm->hw_gpr[cycle][chan] = sel;
+    }
+    else if(pAsm->hw_gpr[cycle][chan] != (int)sel) 
+    {
+        r700_error(ERROR_ASM_BADGPRRESERVE, "Another scalar operation has already used GPR read port for given channel");
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean cycle_for_scalar_bank_swizzle(const int swiz, const int sel, GLuint* pCycle)
+{
+    switch (swiz) 
+    {
+        case SQ_ALU_SCL_210:
+            {
+                int table[3] = {2,	1,	0};
+                *pCycle = table[sel];
+                return GL_TRUE;
+            }
+            break;
+        case SQ_ALU_SCL_122:
+            {
+                int table[3] = {1,	2,	2};
+                *pCycle = table[sel];
+                return GL_TRUE;
+            }
+            break;
+        case SQ_ALU_SCL_212:
+            {	
+                int table[3] = {2,	1,	2};
+                *pCycle = table[sel];
+                return GL_TRUE;
+            }
+            break;
+        case SQ_ALU_SCL_221:
+            {
+                int table[3] = {2, 2, 1};
+                *pCycle = table[sel];
+                return GL_TRUE;
+            }
+            break;
+        default:
+            r700_error(ERROR_ASM_BADSCALARBZ, "Bad Scalar bank swizzle value");
+            break;
+    }
+
+    return GL_FALSE;
+}
+
+GLboolean cycle_for_vector_bank_swizzle(const int swiz, const int sel, GLuint* pCycle)
+{
+    switch (swiz) 
+    {
+        case SQ_ALU_VEC_012:
+            {
+                int table[3] = {0, 1, 2};
+                *pCycle = table[sel];
+            }
+            break;
+        case SQ_ALU_VEC_021:
+            {
+                int table[3] = {0, 2,	1};
+                *pCycle = table[sel];
+            }
+            break;        
+        case SQ_ALU_VEC_120:
+            {
+                int table[3] = {1, 2,	0};
+                *pCycle = table[sel];
+            }
+            break;
+        case SQ_ALU_VEC_102:
+            {
+                int table[3] = {1, 0,	2};
+                *pCycle = table[sel];
+            }
+            break;
+        case SQ_ALU_VEC_201:
+            {
+                int table[3] = {2, 0,	1};
+                *pCycle = table[sel];
+            }
+            break;
+        case SQ_ALU_VEC_210:
+            {
+                int table[3] = {2, 1,	0};
+                *pCycle = table[sel];
+            }
+            break;
+        default:
+            r700_error(ERROR_ASM_BADVECTORBZ, "Bad Vec bank swizzle value");
+            return GL_FALSE;
+            break;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean check_scalar(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr)
+{
+    GLuint cycle;
+    GLuint bank_swizzle;
+    GLuint const_count = 0;
+
+    BITS sel;
+    BITS chan;
+    BITS rel;
+    BITS neg;
+
+    GLuint src;
+
+    BITS src_sel [3] = {0,0,0};
+    BITS src_chan[3] = {0,0,0};
+    BITS src_rel [3] = {0,0,0};
+    BITS src_neg [3] = {0,0,0};
+
+    GLuint swizzle_key;
+
+    GLuint number_of_operands = r700GetNumOperands(pAsm);
+
+    for (src=0; src<number_of_operands; src++) 
+    {
+        get_src_properties(alu_instruction_ptr,
+                           src,
+                           &(src_sel[src]), 
+                           &(src_rel[src]), 
+                           &(src_chan[src]), 
+                           &(src_neg[src]) );
+    }
+
+
+    swizzle_key = ( (is_const( src_sel[0] ) ? 4 : 0) + 
+                    (is_const( src_sel[1] ) ? 2 : 0) + 
+                    (is_const( src_sel[2] ) ? 1 : 0) );
+  
+    alu_instruction_ptr->m_Word1.f.bank_swizzle = BANK_SWIZZLE_SCL[ swizzle_key ];
+
+    for (src=0; src<number_of_operands; src++) 
+    {
+        sel  = src_sel [src];
+        chan = src_chan[src];
+        rel  = src_rel [src];
+        neg  = src_neg [src];
+
+        if (is_const( sel )) 
+        {
+            // Any constant, including literal and inline constants
+            const_count++;
+
+            if (is_cfile( sel )) 
+            {
+                reserve_cfile(pAsm, sel, chan);
+            }
+
+        }
+    }
+
+    for (src=0; src<number_of_operands; src++) 
+    {
+        sel  = src_sel [src];
+        chan = src_chan[src];
+        rel  = src_rel [src];
+        neg  = src_neg [src];
+
+        if( is_gpr(sel) ) 
+        {
+            bank_swizzle = alu_instruction_ptr->m_Word1.f.bank_swizzle;
+
+            if( GL_FALSE == cycle_for_scalar_bank_swizzle(bank_swizzle, src, &cycle) )
+            {
+                return GL_FALSE;
+            }
+
+            if(cycle < const_count) 
+            {
+                if( GL_FALSE == reserve_gpr(pAsm, sel, chan, cycle) )
+                {
+                    return GL_FALSE;
+                }
+            }
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean check_vector(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr)
+{
+    GLuint cycle;
+    GLuint bank_swizzle;
+    GLuint const_count = 0;
+
+    GLuint src;
+
+    BITS sel;
+    BITS chan;
+    BITS rel;
+    BITS neg;
+
+    BITS src_sel [3] = {0,0,0};
+    BITS src_chan[3] = {0,0,0};
+    BITS src_rel [3] = {0,0,0};
+    BITS src_neg [3] = {0,0,0};
+
+    GLuint swizzle_key;
+
+    GLuint number_of_operands = r700GetNumOperands(pAsm);
+
+    for (src=0; src<number_of_operands; src++) 
+    {
+        get_src_properties(alu_instruction_ptr,
+                           src,
+                           &(src_sel[src]), 
+                           &(src_rel[src]), 
+                           &(src_chan[src]), 
+                           &(src_neg[src]) );
+    }
+
+
+    swizzle_key = ( (is_const( src_sel[0] ) ? 4 : 0) + 
+                           (is_const( src_sel[1] ) ? 2 : 0) + 
+                           (is_const( src_sel[2] ) ? 1 : 0) 
+                         );
+
+    alu_instruction_ptr->m_Word1.f.bank_swizzle = BANK_SWIZZLE_VEC[swizzle_key];
+
+    for (src=0; src<number_of_operands; src++) 
+    {
+        sel  = src_sel [src];
+        chan = src_chan[src];
+        rel  = src_rel [src];
+        neg  = src_neg [src];
+
+
+        bank_swizzle = alu_instruction_ptr->m_Word1.f.bank_swizzle;
+
+        if( is_gpr(sel) ) 
+        {
+            if( GL_FALSE == cycle_for_vector_bank_swizzle(bank_swizzle, src, &cycle) )
+            {
+                return GL_FALSE;
+            }
+
+            if ( (src  == 1)          && 
+                 (sel  == src_sel[0]) &&
+                 (chan == src_chan[0]) ) 
+            {        
+            }
+            else 
+            {
+                if( GL_FALSE == reserve_gpr(pAsm, sel, chan, cycle) )
+                {
+                    return GL_FALSE;
+                }
+            }
+        }
+        else if( is_const(sel) ) 
+        {                  
+            const_count++;
+
+            if( is_cfile(sel) ) 
+            {        
+                if( GL_FALSE == reserve_cfile(pAsm, sel, chan) )
+                {
+                    return GL_FALSE;
+                }
+            }
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm)
+{
+    GLuint    number_of_scalar_operations;
+    GLboolean is_single_scalar_operation;
+    GLuint    scalar_channel_index;
+
+    PVSSRC * pcurrent_source;
+    int    current_source_index;
+    GLuint contiguous_slots_needed;
+
+    GLuint    uNumSrc = r700GetNumOperands(pAsm);
+    GLuint    channel_swizzle, j;
+    GLuint    chan_counter[4] = {0, 0, 0, 0};
+    PVSSRC *  pSource[3];
+    GLboolean bSplitInst = GL_FALSE;
+
+    if (1 == pAsm->D.dst.math) 
+    {
+        is_single_scalar_operation = GL_TRUE;
+        number_of_scalar_operations = 1;
+    }
+    else 
+    {
+        is_single_scalar_operation = GL_FALSE;
+        number_of_scalar_operations = 4;
+        
+        /* check read port, only very preliminary algorithm, not count in 
+           src0/1 same comp case and prev slot repeat case; also not count relative
+           addressing. TODO: improve performance. */
+        for(j=0; j<uNumSrc; j++)
+        {
+            pSource[j] = &(pAsm->S[j].src);
+        }
+        for(scalar_channel_index=0; scalar_channel_index<4; scalar_channel_index++) 
+        {
+            for(j=0; j<uNumSrc; j++) 
+            {
+                switch (scalar_channel_index) 
+                {
+                    case 0: channel_swizzle = pSource[j]->swizzlex; break;
+                    case 1: channel_swizzle = pSource[j]->swizzley; break;
+                    case 2: channel_swizzle = pSource[j]->swizzlez; break;
+                    case 3: channel_swizzle = pSource[j]->swizzlew; break;
+                    default: channel_swizzle = SQ_SEL_MASK; break;
+                }
+                if ( ((pSource[j]->rtype == SRC_REG_TEMPORARY) || 
+                     (pSource[j]->rtype == SRC_REG_INPUT))
+                     && (channel_swizzle <= SQ_SEL_W) )
+                {                    
+                    chan_counter[channel_swizzle]++;                        
+                }
+            }
+        }
+        if(   (chan_counter[SQ_SEL_X] > 3)
+           || (chan_counter[SQ_SEL_Y] > 3)
+           || (chan_counter[SQ_SEL_Z] > 3)
+           || (chan_counter[SQ_SEL_W] > 3) ) /* each chan bank has only 3 ports. */
+        {
+            bSplitInst = GL_TRUE;
+        }
+    }
+
+    contiguous_slots_needed = 0;
+
+    if(GL_TRUE == is_reduction_opcode(&(pAsm->D)) ) 
+    {
+        contiguous_slots_needed = 4;
+    }
+
+    initialize(pAsm);    
+
+    for (scalar_channel_index=0;
+            scalar_channel_index < number_of_scalar_operations; 
+                scalar_channel_index++) 
+    {
+        R700ALUInstruction* alu_instruction_ptr = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+        if (alu_instruction_ptr == NULL) 
+		{
+			return GL_FALSE;
+		}
+        Init_R700ALUInstruction(alu_instruction_ptr);
+        
+        //src 0
+        current_source_index = 0;
+        pcurrent_source = &(pAsm->S[0].src);
+
+        if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
+                                         current_source_index,
+                                         pcurrent_source, 
+                                         scalar_channel_index) )     
+        {
+            return GL_FALSE;
+        }
+   
+        if (pAsm->D.dst.math == 0) 
+        {            
+            // Process source 1            
+            current_source_index = 1;
+            pcurrent_source = &(pAsm->S[current_source_index].src);
+
+            if (GL_FALSE == assemble_alu_src(alu_instruction_ptr,
+                                             current_source_index,
+                                             pcurrent_source, 
+                                             scalar_channel_index) ) 
+            {
+                return GL_FALSE;
+            }
+        }
+
+        //other bits
+        alu_instruction_ptr->m_Word0.f.index_mode = SQ_INDEX_LOOP;
+
+        if(   (is_single_scalar_operation == GL_TRUE) 
+           || (GL_TRUE == bSplitInst) )
+        {
+            alu_instruction_ptr->m_Word0.f.last = 1;
+        }
+        else 
+        {
+            alu_instruction_ptr->m_Word0.f.last = (scalar_channel_index == 3) ?  1 : 0;
+        }
+
+        alu_instruction_ptr->m_Word0.f.pred_sel                = 0x0;
+        alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;  
+        alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
+
+        // dst
+        if( (pAsm->D.dst.rtype == DST_REG_TEMPORARY) || 
+            (pAsm->D.dst.rtype == DST_REG_OUT) ) 
+        {
+            alu_instruction_ptr->m_Word1.f.dst_gpr  = pAsm->D.dst.reg;
+        }
+        else 
+        {
+            r700_error(ERROR_ASM_ALUDSTBADTYPE, "Only temp destination registers supported for ALU dest regs.");
+            return GL_FALSE;
+        }
+
+        alu_instruction_ptr->m_Word1.f.dst_rel  = SQ_ABSOLUTE;  //D.rtype
+
+        if ( is_single_scalar_operation == GL_TRUE ) 
+        {
+            // Override scalar_channel_index since only one scalar value will be written
+            if(pAsm->D.dst.writex) 
+            {
+                scalar_channel_index = 0;
+            }
+            else if(pAsm->D.dst.writey) 
+            {
+                scalar_channel_index = 1;
+            }
+            else if(pAsm->D.dst.writez) 
+            {
+                scalar_channel_index = 2;
+            }
+            else if(pAsm->D.dst.writew) 
+            {
+                scalar_channel_index = 3;
+            }
+        }
+
+        alu_instruction_ptr->m_Word1.f.dst_chan = scalar_channel_index;
+
+        alu_instruction_ptr->m_Word1.f.clamp    = pAsm->pILInst[pAsm->uiCurInst].SaturateMode;
+
+        if (pAsm->D.dst.op3) 
+        {            
+            //op3
+
+            alu_instruction_ptr->m_Word1_OP3.f.alu_inst = pAsm->D.dst.opcode;
+
+            //There's 3rd src for op3
+            current_source_index = 2;
+            pcurrent_source = &(pAsm->S[current_source_index].src);
+
+            if ( GL_FALSE == assemble_alu_src(alu_instruction_ptr,
+                                              current_source_index,
+                                              pcurrent_source, 
+                                              scalar_channel_index) ) 
+            {
+                return GL_FALSE;
+            }
+        }
+        else 
+        {
+            //op2
+            if (pAsm->bR6xx)
+            {
+                alu_instruction_ptr->m_Word1_OP2.f6.alu_inst           = pAsm->D.dst.opcode;
+
+                alu_instruction_ptr->m_Word1_OP2.f6.src0_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f6.src1_abs           = 0x0;
+
+                //alu_instruction_ptr->m_Word1_OP2.f6.update_execute_mask = 0x0;
+                //alu_instruction_ptr->m_Word1_OP2.f6.update_pred         = 0x0;
+                switch (scalar_channel_index) 
+                {
+                    case 0: 
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writex; 
+                        break;
+                    case 1: 
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writey; 
+                        break;
+                    case 2: 
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writez; 
+                        break;
+                    case 3: 
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = pAsm->D.dst.writew; 
+                        break;
+                    default: 
+                        alu_instruction_ptr->m_Word1_OP2.f6.write_mask = SQ_SEL_MASK; 
+                        break;
+                }            
+                alu_instruction_ptr->m_Word1_OP2.f6.omod               = SQ_ALU_OMOD_OFF;
+            }
+            else
+            {
+                alu_instruction_ptr->m_Word1_OP2.f.alu_inst           = pAsm->D.dst.opcode;
+
+                alu_instruction_ptr->m_Word1_OP2.f.src0_abs           = 0x0;
+                alu_instruction_ptr->m_Word1_OP2.f.src1_abs           = 0x0;
+
+                //alu_instruction_ptr->m_Word1_OP2.f.update_execute_mask = 0x0;
+                //alu_instruction_ptr->m_Word1_OP2.f.update_pred         = 0x0;
+                switch (scalar_channel_index) 
+                {
+                    case 0: 
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writex; 
+                        break;
+                    case 1: 
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writey; 
+                        break;
+                    case 2: 
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writez; 
+                        break;
+                    case 3: 
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = pAsm->D.dst.writew; 
+                        break;
+                    default: 
+                        alu_instruction_ptr->m_Word1_OP2.f.write_mask = SQ_SEL_MASK; 
+                        break;
+                }            
+                alu_instruction_ptr->m_Word1_OP2.f.omod               = SQ_ALU_OMOD_OFF;
+            }
+        }
+
+        if(GL_FALSE == add_alu_instruction(pAsm, alu_instruction_ptr, contiguous_slots_needed) )
+        {
+            return GL_FALSE;
+        }
+
+        /*
+         * Judge the type of current instruction, is it vector or scalar 
+         * instruction.
+         */        
+        if (is_single_scalar_operation) 
+        {
+            if(GL_FALSE == check_scalar(pAsm, alu_instruction_ptr) )
+            {
+                return GL_FALSE;
+            }
+        }
+        else 
+        {
+            if(GL_FALSE == check_vector(pAsm, alu_instruction_ptr) )
+            {
+                return 1;
+            }
+        }
+
+        contiguous_slots_needed = 0;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean next_ins(r700_AssemblerBase *pAsm)
+{
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+    if( GL_TRUE == IsTex(pILInst->Opcode) )
+    {
+        if( GL_FALSE == assemble_tex_instruction(pAsm) ) 
+        {
+            r700_error(ERROR_ASM_TEXINSTRUCTION, "Error assembling TEX instruction");
+            return GL_FALSE;
+        }
+    }
+    else 
+    {   //ALU      
+        if( GL_FALSE == assemble_alu_instruction(pAsm) ) 
+        {
+            r700_error(ERROR_ASM_TEXINSTRUCTION, "Error assembling ALU instruction");
+            return GL_FALSE;
+        }
+    } 
+      
+    if(pAsm->D.dst.rtype == DST_REG_OUT) 
+    {
+        if(pAsm->D.dst.op3) 
+        {        
+            // There is no mask for OP3 instructions, so all channels are written        
+            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] = 0xF;
+        }
+        else 
+        {
+            pAsm->pucOutMask[pAsm->D.dst.reg - pAsm->starting_export_register_number] 
+               |= (unsigned char)pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask;
+        }
+    }
+    
+    //reset for next inst.
+    pAsm->D.bits    = 0;
+    pAsm->S[0].bits = 0;
+    pAsm->S[1].bits = 0;
+    pAsm->S[2].bits = 0;
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode)
+{
+    BITS tmp;
+
+    checkop1(pAsm);
+
+    tmp = gethelpr(pAsm);
+
+    // opcode  tmp.x,    a.x
+    // MOV     dst,      tmp.x
+
+    pAsm->D.dst.opcode = opcode;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype  = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg    = tmp;
+    pAsm->D.dst.writex = 1;
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    // Now replicate result to all necessary channels in destination
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_ABS(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MAX;  
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+ 
+    pAsm->S[1].bits = pAsm->S[0].bits;
+    flipneg_PVSSRC(&(pAsm->S[1].src));
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_ADD(r700_AssemblerBase *pAsm)
+{
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
+ 
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+ 
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if(pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_SUB)
+    {
+        flipneg_PVSSRC(&(pAsm->S[1].src));
+    }
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_BAD(char *opcode_str) 
+{
+    r700_error(TODO_ASM_NEEDIMPINST, "Not yet implemented instruction (%s)", opcode_str);
+    return GL_FALSE;
+}
+
+GLboolean assemble_CMP(r700_AssemblerBase *pAsm)
+{
+    int tmp;
+
+    if( GL_FALSE == checkop3(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP3_INST_CNDGE;
+    pAsm->D.dst.op3     = 1;  
+
+    tmp = (-1);
+
+    if(0xF != pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
+    {
+        //OP3 has no support for write mask
+        tmp = gethelpr(pAsm);
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = tmp;
+
+        nomask_PVSDST(&(pAsm->D.dst));
+    }
+    else 
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+              
+    if( GL_FALSE == assemble_src(pAsm, 2, 1) )  
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, 2) ) 
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if (0xF != pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask) 
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+        //tmp for source
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = tmp;
+
+        noneg_PVSSRC(&(pAsm->S[0].src));
+        noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_COS(r700_AssemblerBase *pAsm)
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_COS);
+}
+ 
+GLboolean assemble_DOT(r700_AssemblerBase *pAsm)
+{
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+        return GL_FALSE;
+    }
+ 
+    pAsm->D.dst.opcode = SQ_OP2_INST_DOT4;  
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if(OPCODE_DP3 == pAsm->pILInst[pAsm->uiCurInst].Opcode)
+    {
+        zerocomp_PVSSRC(&(pAsm->S[0].src), 3);
+        zerocomp_PVSSRC(&(pAsm->S[1].src), 3);
+    }
+    else if(pAsm->pILInst[pAsm->uiCurInst].Opcode == OPCODE_DPH) 
+    {
+        onecomp_PVSSRC(&(pAsm->S[1].src), 3);
+    } 
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_DST(r700_AssemblerBase *pAsm)
+{
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    onecomp_PVSSRC(&(pAsm->S[0].src), 0);
+    onecomp_PVSSRC(&(pAsm->S[0].src), 3);
+
+    onecomp_PVSSRC(&(pAsm->S[1].src), 0);
+    onecomp_PVSSRC(&(pAsm->S[1].src), 2);
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_EX2(r700_AssemblerBase *pAsm)
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_EXP_IEEE);
+}
+ 
+GLboolean assemble_FLR(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_FLOOR;  
+
+    if ( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_FLR_INT(r700_AssemblerBase *pAsm)
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_FLT_TO_INT);
+}
+
+GLboolean assemble_FRC(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_FRACT; 
+
+    if ( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_KIL(r700_AssemblerBase *pAsm)
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_KILLGT;  
+  
+    if ( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.writex = 0;
+    pAsm->D.dst.writey = 0;
+    pAsm->D.dst.writez = 0;
+    pAsm->D.dst.writew = 0;
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = 0;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_0);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+
+    if(PROGRAM_TEMPORARY == pAsm->pILInst[pAsm->uiCurInst].DstReg.File)
+    {
+        pAsm->S[1].src.reg = pAsm->pILInst[pAsm->uiCurInst].DstReg.Index + pAsm->starting_temp_register_number;
+    }
+    else
+    {   //PROGRAM_OUTPUT
+        pAsm->S[1].src.reg = pAsm->uiFP_OutputMap[pAsm->pILInst[pAsm->uiCurInst].DstReg.Index];
+    }
+  
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noswizzle_PVSSRC(&(pAsm->S[1].src));
+  
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->pR700Shader->killIsUsed = GL_TRUE;
+    
+    return GL_TRUE;
+}
+
+GLboolean assemble_LG2(r700_AssemblerBase *pAsm) 
+{ 
+    return assemble_math_function(pAsm, SQ_OP2_INST_LOG_IEEE);
+}
+
+GLboolean assemble_LRP(r700_AssemblerBase *pAsm) 
+{
+    BITS tmp;
+
+    if( GL_FALSE == checkop3(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    tmp = gethelpr(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_ADD;
+
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = tmp;
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    nomask_PVSDST(&(pAsm->D.dst));
+
+          
+    if( GL_FALSE == assemble_src(pAsm, 1, 0) ) 
+    {
+	    return GL_FALSE;
+    }
+
+    if ( GL_FALSE == assemble_src(pAsm, 2, 1) )   
+    {
+	    return GL_FALSE;
+    }
+
+    neg_PVSSRC(&(pAsm->S[1].src));
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+	    return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = tmp;
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+
+    if( GL_FALSE == assemble_src(pAsm, 0, 1) ) 
+    {
+        return GL_FALSE;
+    }
+    if( GL_FALSE == assemble_src(pAsm, 2, -1) ) 
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = tmp;
+    noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_MAD(struct r700_AssemblerBase *pAsm) 
+{
+    int tmp, ii;
+    GLboolean bReplaceDst = GL_FALSE;
+    struct prog_instruction *pILInst = &(pAsm->pILInst[pAsm->uiCurInst]);
+
+	if( GL_FALSE == checkop3(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+	pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;  
+	pAsm->D.dst.op3     = 1; 
+
+	tmp = (-1);
+
+    if(PROGRAM_TEMPORARY == pILInst->DstReg.File)
+    {   /* TODO : more investigation on MAD src and dst using same register */
+        for(ii=0; ii<3; ii++)
+        {
+            if(   (PROGRAM_TEMPORARY == pILInst->SrcReg[ii].File)
+               && (pILInst->DstReg.Index == pILInst->SrcReg[ii].Index) )
+            {
+                bReplaceDst = GL_TRUE;
+                break;
+            }
+        }
+    }
+    if(0xF != pILInst->DstReg.WriteMask)
+    {   /* OP3 has no support for write mask */
+        bReplaceDst = GL_TRUE;
+    }
+
+	if(GL_TRUE == bReplaceDst)
+    {
+        tmp = gethelpr(pAsm);
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = tmp;
+
+        nomask_PVSDST(&(pAsm->D.dst));
+    }
+    else 
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+              
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )  
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 2, -1) ) 
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+	if (GL_TRUE == bReplaceDst) 
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+        //tmp for source
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = tmp;
+
+        noneg_PVSSRC(&(pAsm->S[0].src));
+        noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+/* LIT dst, src */
+GLboolean assemble_LIT(r700_AssemblerBase *pAsm)
+{
+    unsigned int dstReg;
+    unsigned int dstType;
+    unsigned int srcReg;
+    unsigned int srcType;
+    checkop1(pAsm);
+    int tmp = gethelpr(pAsm);
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+    dstReg  = pAsm->D.dst.reg;
+    dstType = pAsm->D.dst.rtype;
+    srcReg  = pAsm->S[0].src.reg;
+    srcType = pAsm->S[0].src.rtype;
+
+    /* dst.xw, <- 1.0  */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_MOV;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 1;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 1;
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_1;
+    pAsm->S[0].src.swizzley = SQ_SEL_1;
+    pAsm->S[0].src.swizzlez = SQ_SEL_1;
+    pAsm->S[0].src.swizzlew = SQ_SEL_1;
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* dst.y = max(src.x, 0.0) */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_MAX;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 0;
+    pAsm->D.dst.writey   = 1;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 0;
+    pAsm->S[0].src.rtype = srcType;
+    pAsm->S[0].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_X;
+    pAsm->S[0].src.swizzley = SQ_SEL_X;
+    pAsm->S[0].src.swizzlez = SQ_SEL_X;
+    pAsm->S[0].src.swizzlew = SQ_SEL_X;
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[1].src.reg   = tmp;
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[1].src));
+    pAsm->S[1].src.swizzlex = SQ_SEL_0;
+    pAsm->S[1].src.swizzley = SQ_SEL_0;
+    pAsm->S[1].src.swizzlez = SQ_SEL_0;
+    pAsm->S[1].src.swizzlew = SQ_SEL_0;
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* before: dst.w = log(src.y)
+     * after : dst.x = log(src.y)
+     * why change dest register is that dst.w has been initialized as 1 before
+     */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_LOG_CLAMPED;
+    pAsm->D.dst.math     = 1;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 1;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 0;
+    pAsm->S[0].src.rtype = srcType;
+    pAsm->S[0].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_Y;
+    pAsm->S[0].src.swizzley = SQ_SEL_Y;
+    pAsm->S[0].src.swizzlez = SQ_SEL_Y;
+    pAsm->S[0].src.swizzlew = SQ_SEL_Y;
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* before: tmp.x = amd MUL_LIT(src.w, dst.w, src.x ) */
+    /* after : tmp.x = amd MUL_LIT(src.w, dst.x, src.x ) */
+    pAsm->D.dst.opcode   = SQ_OP3_INST_MUL_LIT;
+    pAsm->D.dst.op3      = 1;
+    pAsm->D.dst.rtype    = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg      = tmp;
+    pAsm->D.dst.writex   = 1;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 0;
+    pAsm->D.dst.writew   = 0;
+
+    pAsm->S[0].src.rtype = srcType;
+    pAsm->S[0].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_W;
+    pAsm->S[0].src.swizzley = SQ_SEL_W;
+    pAsm->S[0].src.swizzlez = SQ_SEL_W;
+    pAsm->S[0].src.swizzlew = SQ_SEL_W;
+
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[1].src.reg   = dstReg;
+    setaddrmode_PVSSRC(&(pAsm->S[1].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[1].src));
+    pAsm->S[1].src.swizzlex = SQ_SEL_X;
+    pAsm->S[1].src.swizzley = SQ_SEL_X;
+    pAsm->S[1].src.swizzlez = SQ_SEL_X;
+    pAsm->S[1].src.swizzlew = SQ_SEL_X;
+
+    pAsm->S[2].src.rtype = srcType;
+    pAsm->S[2].src.reg   = srcReg;
+    setaddrmode_PVSSRC(&(pAsm->S[2].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[2].src));
+    pAsm->S[2].src.swizzlex = SQ_SEL_X;
+    pAsm->S[2].src.swizzley = SQ_SEL_X;
+    pAsm->S[2].src.swizzlez = SQ_SEL_X;
+    pAsm->S[2].src.swizzlew = SQ_SEL_X;
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    /* dst.z = exp(tmp.x) */
+    pAsm->D.dst.opcode   = SQ_OP2_INST_EXP_IEEE;
+    pAsm->D.dst.math     = 1;
+    pAsm->D.dst.rtype    = dstType;
+    pAsm->D.dst.reg      = dstReg;
+    pAsm->D.dst.writex   = 0;
+    pAsm->D.dst.writey   = 0;
+    pAsm->D.dst.writez   = 1;
+    pAsm->D.dst.writew   = 0;
+
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+    pAsm->S[0].src.swizzlex = SQ_SEL_X;
+    pAsm->S[0].src.swizzley = SQ_SEL_X;
+    pAsm->S[0].src.swizzlez = SQ_SEL_X;
+    pAsm->S[0].src.swizzlew = SQ_SEL_X;
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_MAX(r700_AssemblerBase *pAsm) 
+{
+	if( GL_FALSE == checkop2(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	pAsm->D.dst.opcode = SQ_OP2_INST_MAX; 
+	
+	if( GL_FALSE == assemble_dst(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == next_ins(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_MIN(r700_AssemblerBase *pAsm) 
+{
+	if( GL_FALSE == checkop2(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	pAsm->D.dst.opcode = SQ_OP2_INST_MIN;  
+
+	if( GL_FALSE == assemble_dst(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+	{
+		return GL_FALSE;
+	}
+ 
+	if( GL_FALSE == next_ins(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_MOV(r700_AssemblerBase *pAsm) 
+{
+    checkop1(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if (GL_FALSE == assemble_dst(pAsm))
+    {
+        return GL_FALSE;
+    }
+
+    if (GL_FALSE == assemble_src(pAsm, 0, -1))
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_MUL(r700_AssemblerBase *pAsm) 
+{
+	if( GL_FALSE == checkop2(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+	if( GL_FALSE == assemble_dst(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == next_ins(pAsm) ) 
+	{
+		return GL_FALSE;
+	}
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_POW(r700_AssemblerBase *pAsm) 
+{
+    BITS tmp;
+
+    checkop1(pAsm);
+
+    tmp = gethelpr(pAsm);
+
+    // LG2 tmp.x,     a.swizzle
+    pAsm->D.dst.opcode = SQ_OP2_INST_LOG_IEEE;  
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    // MUL tmp.x,     tmp.x, b.swizzle
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    // EX2 dst.mask,          tmp.x
+    // EX2 tmp.x,             tmp.x
+    pAsm->D.dst.opcode = SQ_OP2_INST_EXP_IEEE;
+    pAsm->D.dst.math = 1;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[0].src.reg = tmp;
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    // Now replicate result to all necessary channels in destination
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = tmp;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), SQ_SEL_X);
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_RCP(r700_AssemblerBase *pAsm) 
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_RECIP_IEEE);
+}
+ 
+GLboolean assemble_RSQ(r700_AssemblerBase *pAsm) 
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_RECIPSQRT_IEEE);
+}
+ 
+GLboolean assemble_SIN(r700_AssemblerBase *pAsm) 
+{
+    return assemble_math_function(pAsm, SQ_OP2_INST_SIN);
+}
+ 
+GLboolean assemble_SCS(r700_AssemblerBase *pAsm) 
+{
+    BITS tmp;
+
+	checkop1(pAsm);
+
+	tmp = gethelpr(pAsm);
+
+	// COS tmp.x,    a.x
+	pAsm->D.dst.opcode = SQ_OP2_INST_COS;
+	pAsm->D.dst.math = 1;
+
+	setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+	pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+	pAsm->D.dst.reg = tmp;
+	pAsm->D.dst.writex = 1;
+
+	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+	{
+		return GL_FALSE;
+	}
+
+	if ( GL_FALSE == next_ins(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	// SIN tmp.y,    a.x
+	pAsm->D.dst.opcode = SQ_OP2_INST_SIN;
+	pAsm->D.dst.math = 1;
+
+	setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+	pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+	pAsm->D.dst.reg = tmp;
+	pAsm->D.dst.writey = 1;
+
+	if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+	{
+		return GL_FALSE;
+	}
+
+	if( GL_FALSE == next_ins(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	// MOV dst.mask,     tmp
+	pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+	if( GL_FALSE == assemble_dst(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+	setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+	pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
+	pAsm->S[0].src.reg = tmp;
+
+	noswizzle_PVSSRC(&(pAsm->S[0].src));
+	pAsm->S[0].src.swizzlez = SQ_SEL_0;
+	pAsm->S[0].src.swizzlew = SQ_SEL_0;
+
+	if ( GL_FALSE == next_ins(pAsm) )
+	{
+		return GL_FALSE;
+	}
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_SGE(r700_AssemblerBase *pAsm) 
+{
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+	    return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_SETGE;  
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+	    return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+	    return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+	    return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+	    return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_SLT(r700_AssemblerBase *pAsm) 
+{
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+	    return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_SETGT;  
+
+    if( GL_FALSE == assemble_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+                
+    if( GL_FALSE == assemble_src(pAsm, 0, 1) )  
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, 0) )  
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_STP(r700_AssemblerBase *pAsm) 
+{
+    return GL_TRUE;
+}
+ 
+GLboolean assemble_TEX(r700_AssemblerBase *pAsm) 
+{
+    GLboolean src_const;
+
+    switch (pAsm->pILInst[pAsm->uiCurInst].SrcReg[0].File)
+    {
+    case PROGRAM_CONSTANT:
+    case PROGRAM_LOCAL_PARAM:
+    case PROGRAM_ENV_PARAM:
+    case PROGRAM_STATE_VAR:
+        src_const = GL_TRUE;
+    case PROGRAM_TEMPORARY:
+    case PROGRAM_INPUT:
+        src_const = GL_FALSE;
+    }
+
+    if (GL_TRUE == src_const) 
+    {
+        r700_error(TODO_ASM_CONSTTEXADDR, "TODO: Texture coordinates from a constant register not supported.");
+        return GL_FALSE;
+    }
+
+    switch (pAsm->pILInst[pAsm->uiCurInst].Opcode) 
+    {
+        case OPCODE_TEX:
+            pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;            
+            break;
+        case OPCODE_TXB:            
+            r700_error(TODO_ASM_TXB, "do not support TXB yet");
+            return GL_FALSE;
+            break;
+        case OPCODE_TXP:            
+            /* TODO : tex proj version : divid first 3 components by 4th */ 
+            pAsm->D.dst.opcode = SQ_TEX_INST_SAMPLE;
+            break;
+        default:
+            r700_error(ERROR_ASM_BADTEXINST, "Internal error: bad texture op (not TEX)");
+            return GL_FALSE;
+            break;
+    }
+
+    // Set src1 to tex unit id
+    pAsm->S[1].src.reg   = pAsm->pILInst[pAsm->uiCurInst].TexSrcUnit;
+    pAsm->S[1].src.rtype = SRC_REG_TEMPORARY;
+
+    //No sw info from mesa compiler, so hard code here.
+    pAsm->S[1].src.swizzlex = SQ_SEL_X;
+    pAsm->S[1].src.swizzley = SQ_SEL_Y;
+    pAsm->S[1].src.swizzlez = SQ_SEL_Z;
+    pAsm->S[1].src.swizzlew = SQ_SEL_W;
+
+    if( GL_FALSE == tex_dst(pAsm) )
+    {
+        return GL_FALSE;
+    }
+ 
+    if( GL_FALSE == tex_src(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    if ( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_XPD(r700_AssemblerBase *pAsm) 
+{
+    BITS tmp;
+
+    if( GL_FALSE == checkop2(pAsm) )
+    {
+	    return GL_FALSE;
+    }
+
+    tmp = gethelpr(pAsm);
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MUL;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = tmp;
+    nomask_PVSDST(&(pAsm->D.dst));
+  
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+ 
+    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Z, SQ_SEL_X, SQ_SEL_Y, SQ_SEL_0);
+    swizzleagain_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Y, SQ_SEL_Z, SQ_SEL_X, SQ_SEL_0);
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->D.dst.opcode = SQ_OP3_INST_MULADD;
+    pAsm->D.dst.op3    = 1;
+
+    if(0xF != pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask)
+    {
+        tmp = gethelpr(pAsm);
+
+        setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+        pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+        pAsm->D.dst.reg   = tmp;
+
+        nomask_PVSDST(&(pAsm->D.dst));
+    }
+    else 
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 0, -1) )
+    {
+        return GL_FALSE;
+    }
+
+    if( GL_FALSE == assemble_src(pAsm, 1, -1) )
+    {
+        return GL_FALSE;
+    }
+ 
+    swizzleagain_PVSSRC(&(pAsm->S[0].src), SQ_SEL_Y, SQ_SEL_Z, SQ_SEL_X, SQ_SEL_0);
+    swizzleagain_PVSSRC(&(pAsm->S[1].src), SQ_SEL_Z, SQ_SEL_X, SQ_SEL_Y, SQ_SEL_0);
+
+    // result1 + (neg) result0
+    setaddrmode_PVSSRC(&(pAsm->S[2].src),ADDR_ABSOLUTE);
+    pAsm->S[2].src.rtype = SRC_REG_TEMPORARY;
+    pAsm->S[2].src.reg   = tmp;
+
+    neg_PVSSRC(&(pAsm->S[2].src));
+    noswizzle_PVSSRC(&(pAsm->S[2].src));
+
+    if( GL_FALSE == next_ins(pAsm) ) 
+    {
+        return GL_FALSE;
+    }
+
+
+    if(0xF != pAsm->pILInst[pAsm->uiCurInst].DstReg.WriteMask) 
+    {
+        if( GL_FALSE == assemble_dst(pAsm) )
+        {
+            return GL_FALSE;
+        }
+
+        pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+        // Use tmp as source
+        setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+        pAsm->S[0].src.rtype = SRC_REG_TEMPORARY;
+        pAsm->S[0].src.reg   = tmp;
+
+        noneg_PVSSRC(&(pAsm->S[0].src));
+        noswizzle_PVSSRC(&(pAsm->S[0].src));
+
+        if( GL_FALSE == next_ins(pAsm) )
+        {
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean assemble_EXPORT(r700_AssemblerBase *pAsm)
+{
+    return GL_TRUE;
+}
+
+GLboolean assemble_IF(r700_AssemblerBase *pAsm)
+{
+    return GL_TRUE;
+}
+
+GLboolean assemble_ENDIF(r700_AssemblerBase *pAsm)
+{
+    return GL_TRUE;
+}
+
+GLboolean AssembleInstr(GLuint uiNumberInsts,
+                        struct prog_instruction *pILInst, 
+						r700_AssemblerBase *pR700AsmCode)
+{
+    GLuint i;
+
+    pR700AsmCode->pILInst = pILInst;
+	for(i=0; i<uiNumberInsts; i++)
+    {
+        pR700AsmCode->uiCurInst = i;
+
+        switch (pILInst[i].Opcode)
+        {
+        case OPCODE_ABS: 
+            if ( GL_FALSE == assemble_ABS(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_ADD: 
+        case OPCODE_SUB: 
+            if ( GL_FALSE == assemble_ADD(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_ARL: 
+            r700_error(TODO_ASM_NEEDIMPINST, "Not yet implemented instruction OPCODE_ARL ");
+            //if ( GL_FALSE == assemble_BAD("ARL") ) 
+                return GL_FALSE;
+            break;
+        case OPCODE_ARR: 
+            r700_error(TODO_ASM_NEEDIMPINST, "Not yet implemented instruction OPCODE_ARR ");
+            //if ( GL_FALSE == assemble_BAD("ARR") ) 
+                return GL_FALSE;
+            break;
+
+        case OPCODE_CMP: 
+            if ( GL_FALSE == assemble_CMP(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_COS: 
+            if ( GL_FALSE == assemble_COS(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_DP3: 
+        case OPCODE_DP4: 
+        case OPCODE_DPH: 
+            if ( GL_FALSE == assemble_DOT(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_DST: 
+            if ( GL_FALSE == assemble_DST(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_EX2: 
+            if ( GL_FALSE == assemble_EX2(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_EXP: 
+            r700_error(TODO_ASM_NEEDIMPINST, "Not yet implemented instruction OPCODE_EXP ");
+            //if ( GL_FALSE == assemble_BAD("EXP") ) 
+                return GL_FALSE;
+            break; // approx of EX2
+
+        case OPCODE_FLR:     
+            if ( GL_FALSE == assemble_FLR(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        //case OP_FLR_INT: 
+        //    if ( GL_FALSE == assemble_FLR_INT() ) 
+        //        return GL_FALSE;
+        //    break;  
+
+        case OPCODE_FRC: 
+            if ( GL_FALSE == assemble_FRC(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_KIL: 
+            if ( GL_FALSE == assemble_KIL(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;
+        case OPCODE_LG2: 
+            if ( GL_FALSE == assemble_LG2(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_LIT:
+            if ( GL_FALSE == assemble_LIT(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;
+        case OPCODE_LRP: 
+            if ( GL_FALSE == assemble_LRP(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_LOG: 
+            r700_error(TODO_ASM_NEEDIMPINST, "Not yet implemented instruction OPCODE_LOG ");
+            //if ( GL_FALSE == assemble_BAD("LOG") ) 
+                return GL_FALSE;
+            break; // approx of LG2
+
+        case OPCODE_MAD: 
+            if ( GL_FALSE == assemble_MAD(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_MAX: 
+            if ( GL_FALSE == assemble_MAX(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_MIN: 
+            if ( GL_FALSE == assemble_MIN(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_MOV: 
+            if ( GL_FALSE == assemble_MOV(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_MUL: 
+            if ( GL_FALSE == assemble_MUL(pR700AsmCode) ) 
+                return GL_FALSE;
+            break; 
+
+        case OPCODE_POW: 
+            if ( GL_FALSE == assemble_POW(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_RCP: 
+            if ( GL_FALSE == assemble_RCP(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_RSQ: 
+            if ( GL_FALSE == assemble_RSQ(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_SIN: 
+            if ( GL_FALSE == assemble_SIN(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+        case OPCODE_SCS: 
+            if ( GL_FALSE == assemble_SCS(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_SGE: 
+            if ( GL_FALSE == assemble_SGE(pR700AsmCode) ) 
+                return GL_FALSE;
+            break; 
+        case OPCODE_SLT: 
+            if ( GL_FALSE == assemble_SLT(pR700AsmCode) ) 
+                return GL_FALSE;
+            break; 
+
+        //case OP_STP: 
+        //    if ( GL_FALSE == assemble_STP(pR700AsmCode) ) 
+        //        return GL_FALSE;
+        //    break;
+
+        case OPCODE_SWZ: 
+            if ( GL_FALSE == assemble_MOV(pR700AsmCode) ) 
+            {
+                return GL_FALSE; 
+            }
+            else
+            {
+                if( (i+1)<uiNumberInsts )
+                {
+                    if(OPCODE_END != pILInst[i+1].Opcode)
+                    {
+                        if( GL_TRUE == IsTex(pILInst[i+1].Opcode) )
+                        {
+                            pR700AsmCode->pInstDeps[i+1].nDstDep = i+1; //=1?
+                        }
+                    }
+                }
+            }
+            break;
+
+        case OPCODE_TEX: 
+        case OPCODE_TXB:  
+        case OPCODE_TXP: 
+            if ( GL_FALSE == assemble_TEX(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;
+
+        case OPCODE_XPD: 
+            if ( GL_FALSE == assemble_XPD(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;  
+
+        case OPCODE_IF   : 
+            if ( GL_FALSE == assemble_IF(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;
+        case OPCODE_ELSE : 
+            r700_error(TODO_ASM_NEEDIMPINST, "Not yet implemented instruction OPCODE_ELSE ");
+            //if ( GL_FALSE == assemble_BAD("ELSE") ) 
+                return GL_FALSE;
+            break;
+        case OPCODE_ENDIF: 
+            if ( GL_FALSE == assemble_ENDIF(pR700AsmCode) ) 
+                return GL_FALSE;
+            break;
+
+        //case OPCODE_EXPORT: 
+        //    if ( GL_FALSE == assemble_EXPORT() ) 
+        //        return GL_FALSE;
+        //    break;
+
+        case OPCODE_END: 
+			//pR700AsmCode->uiCurInst = i;
+			//This is to remaind that if in later exoort there is depth/stencil
+			//export, we need a mov to re-arrange DST channel, where using a
+			//psuedo inst, we will use this end inst to do it.
+            return GL_TRUE;
+
+        default:
+            r700_error(ERROR_ASM_UNKNOWNILINST, "internal: unknown instruction");
+            return GL_FALSE;
+        }
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean Process_Export(r700_AssemblerBase* pAsm,
+                         GLuint type,
+                         GLuint export_starting_index,
+                         GLuint export_count, 
+                         GLuint starting_register_number,
+                         GLboolean is_depth_export)
+{
+    unsigned char ucWriteMask;
+
+    check_current_clause(pAsm, CF_EMPTY_CLAUSE);
+    check_current_clause(pAsm, CF_EXPORT_CLAUSE); //alloc the cf_current_export_clause_ptr
+
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.type = type;
+
+    switch (type) 
+    {
+        case SQ_EXPORT_PIXEL:
+            if(GL_TRUE == is_depth_export) 
+            {
+                pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base  = SQ_CF_PIXEL_Z;
+            }
+            else 
+            {
+                pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base  = SQ_CF_PIXEL_MRT0 + export_starting_index;
+            }
+            break;
+
+        case SQ_EXPORT_POS:
+            pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base  = SQ_CF_POS_0 + export_starting_index; 
+            break;
+
+        case SQ_EXPORT_PARAM:
+            pAsm->cf_current_export_clause_ptr->m_Word0.f.array_base  = 0x0 + export_starting_index; 
+            break;
+
+        default:
+            r700_error(ERROR_ASM_BADEXPORTTYPE, "Unknown export type: %d", type);
+            return GL_FALSE;
+            break;
+    }
+
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.rw_gpr      = starting_register_number;
+
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.rw_rel      = SQ_ABSOLUTE;
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.index_gpr   = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word0.f.elem_size   = 0x3; 
+
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.burst_count      = (export_count - 1);
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.end_of_program   = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.valid_pixel_mode = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.cf_inst          = SQ_CF_INST_EXPORT;  // _DONE
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.whole_quad_mode  = 0x0;
+    pAsm->cf_current_export_clause_ptr->m_Word1.f.barrier          = 0x1;
+
+    if (export_count == 1) 
+    {
+        ucWriteMask = pAsm->pucOutMask[starting_register_number - pAsm->starting_export_register_number];
+
+        if( (ucWriteMask & 0x1) != 0)
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_x = SQ_SEL_X;
+        }
+        else
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_x = SQ_SEL_MASK;
+        }
+        if( ((ucWriteMask>>1) & 0x1) != 0)
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_y = SQ_SEL_Y;
+        }
+        else
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_y = SQ_SEL_MASK;
+        }
+        if( ((ucWriteMask>>2) & 0x1) != 0)
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_z = SQ_SEL_Z;
+        }
+        else
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_z = SQ_SEL_MASK;
+        }
+        if( ((ucWriteMask>>3) & 0x1) != 0)
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_w = SQ_SEL_W;
+        }
+        else
+        {
+            pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_w = SQ_SEL_MASK;
+        }
+    }
+    else 
+    {
+        // This should only be used if all components for all registers have been written
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_x = SQ_SEL_X;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_y = SQ_SEL_Y;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_z = SQ_SEL_Z;
+        pAsm->cf_current_export_clause_ptr->m_Word1_SWIZ.f.sel_w = SQ_SEL_W;
+    }
+
+    pAsm->cf_last_export_ptr = pAsm->cf_current_export_clause_ptr;
+
+    return GL_TRUE;
+}
+
+GLboolean Move_Depth_Exports_To_Correct_Channels(r700_AssemblerBase *pAsm, BITS depth_channel_select)
+{
+	gl_inst_opcode Opcode_save = pAsm->pILInst[pAsm->uiCurInst].Opcode; //Should be OPCODE_END
+    pAsm->pILInst[pAsm->uiCurInst].Opcode = OPCODE_MOV;
+
+    // MOV depth_export_register.hw_depth_channel, depth_export_register.depth_channel_select
+
+    pAsm->D.dst.opcode = SQ_OP2_INST_MOV;
+
+    setaddrmode_PVSDST(&(pAsm->D.dst), ADDR_ABSOLUTE);
+    pAsm->D.dst.rtype = DST_REG_TEMPORARY;
+    pAsm->D.dst.reg   = pAsm->depth_export_register_number;
+
+    pAsm->D.dst.writex = 1;   // depth          goes in R channel for HW                       
+
+    setaddrmode_PVSSRC(&(pAsm->S[0].src), ADDR_ABSOLUTE);
+    pAsm->S[0].src.rtype = DST_REG_TEMPORARY;
+    pAsm->S[0].src.reg   = pAsm->depth_export_register_number;
+
+    setswizzle_PVSSRC(&(pAsm->S[0].src), depth_channel_select);
+
+    noneg_PVSSRC(&(pAsm->S[0].src));
+
+    if( GL_FALSE == next_ins(pAsm) )
+    {
+        return GL_FALSE;
+    }
+
+    pAsm->pILInst[pAsm->uiCurInst].Opcode = Opcode_save;
+
+    return GL_TRUE;
+}
+ 
+GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode,
+                                   GLbitfield          OutputsWritten)  
+{ 
+    unsigned int unBit;
+
+    if(pR700AsmCode->depth_export_register_number >= 0) 
+    {
+        if( GL_FALSE == Move_Depth_Exports_To_Correct_Channels(pR700AsmCode, SQ_SEL_Z) )  // depth
+		{
+			return GL_FALSE;
+		}
+    }
+
+    unBit = 1 << FRAG_RESULT_COLOR;
+	if(OutputsWritten & unBit)
+	{
+		if( GL_FALSE == Process_Export(pR700AsmCode,
+                                       SQ_EXPORT_PIXEL, 
+                                       0, 
+                                       1, 
+                                       pR700AsmCode->uiFP_OutputMap[FRAG_RESULT_COLOR], 
+                                       GL_FALSE) ) 
+        {
+            return GL_FALSE;
+        }
+	}
+	unBit = 1 << FRAG_RESULT_DEPTH;
+	if(OutputsWritten & unBit)
+	{
+        if( GL_FALSE == Process_Export(pR700AsmCode,
+                                       SQ_EXPORT_PIXEL, 
+                                       0, 
+                                       1, 
+                                       pR700AsmCode->uiFP_OutputMap[FRAG_RESULT_DEPTH], 
+                                       GL_TRUE)) 
+        {
+            return GL_FALSE;
+        }
+	}
+
+    if(pR700AsmCode->cf_last_export_ptr != NULL) 
+    {
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst        = SQ_CF_INST_EXPORT_DONE;
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.end_of_program = 0x1;
+    }
+
+    return GL_TRUE;
+}
+
+GLboolean Process_Vertex_Exports(r700_AssemblerBase *pR700AsmCode,
+                                 GLbitfield          OutputsWritten)  
+{
+    unsigned int unBit;
+    unsigned int i;
+
+    GLuint export_starting_index  = 0;
+    GLuint export_count           = pR700AsmCode->number_of_exports;
+
+    unBit = 1 << VERT_RESULT_HPOS;
+	if(OutputsWritten & unBit)
+	{
+        if( GL_FALSE == Process_Export(pR700AsmCode, 
+                                       SQ_EXPORT_POS, 
+                                       export_starting_index, 
+                                       1, 
+                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_HPOS],
+                                       GL_FALSE) )
+        {
+            return GL_FALSE;
+        }
+
+        export_count--;
+
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;
+	}
+
+    pR700AsmCode->number_of_exports = export_count;
+
+	unBit = 1 << VERT_RESULT_COL0;
+	if(OutputsWritten & unBit)
+	{
+        if( GL_FALSE == Process_Export(pR700AsmCode, 
+                                       SQ_EXPORT_PARAM, 
+                                       export_starting_index, 
+                                       1, 
+                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_COL0],
+                                       GL_FALSE) )
+        {
+            return GL_FALSE;
+        }
+
+        export_starting_index++;
+	}
+
+	unBit = 1 << VERT_RESULT_COL1;
+	if(OutputsWritten & unBit)
+	{
+        if( GL_FALSE == Process_Export(pR700AsmCode, 
+                                       SQ_EXPORT_PARAM, 
+                                       export_starting_index, 
+                                       1, 
+                                       pR700AsmCode->ucVP_OutputMap[VERT_RESULT_COL1],
+                                       GL_FALSE) )
+        {
+            return GL_FALSE;
+        }
+
+        export_starting_index++;
+	}
+
+	for(i=0; i<8; i++)
+	{
+		unBit = 1 << (VERT_RESULT_TEX0 + i);
+		if(OutputsWritten & unBit)
+		{
+            if( GL_FALSE == Process_Export(pR700AsmCode,
+                                          SQ_EXPORT_PARAM, 
+                                          export_starting_index, 
+                                          1, 
+                                          pR700AsmCode->ucVP_OutputMap[VERT_RESULT_TEX0 + i],
+                                          GL_FALSE) )
+            {
+                return GL_FALSE;
+            }
+
+            export_starting_index++;
+		}
+	}
+
+    // At least one param should be exported
+    if (export_count) 
+    {
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;    
+    }
+    else
+    {
+        if( GL_FALSE == Process_Export(pR700AsmCode,
+                                       SQ_EXPORT_PARAM, 
+                                       0, 
+                                       1, 
+                                       pR700AsmCode->starting_export_register_number,
+                                       GL_FALSE) )
+        {
+            return GL_FALSE;
+        }
+      
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_x = SQ_SEL_0;
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_y = SQ_SEL_0;
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_z = SQ_SEL_0;
+        pR700AsmCode->cf_last_export_ptr->m_Word1_SWIZ.f.sel_w = SQ_SEL_1;
+        pR700AsmCode->cf_last_export_ptr->m_Word1.f.cf_inst = SQ_CF_INST_EXPORT_DONE;
+    }
+
+    pR700AsmCode->cf_last_export_ptr->m_Word1.f.end_of_program = 0x1;
+
+    return GL_TRUE;
+}
+
+GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode)
+{
+    FREE(pR700AsmCode->pucOutMask);
+    FREE(pR700AsmCode->pInstDeps);
+    return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_assembler.h b/src/mesa/drivers/dri/r600/r700_assembler.h
new file mode 100644
index 00000000000..e9b21b802ec
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_assembler.h
@@ -0,0 +1,512 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_ASSEMBLER_H_
+#define _R700_ASSEMBLER_H_
+
+#include "main/mtypes.h"
+#include "shader/prog_instruction.h"
+
+#include "r700_chip.h"
+#include "r700_shaderinst.h"
+#include "r700_shader.h"
+
+typedef enum SHADER_PIPE_TYPE 
+{
+    SPT_VP = 0,
+    SPT_FP = 1
+} SHADER_PIPE_TYPE;
+
+typedef enum ConstantCycles 
+{
+    NUMBER_OF_CYCLES     = 3,
+    NUMBER_OF_COMPONENTS = 4
+} ConstantCycles;
+
+typedef enum  HARDWARE_LIMIT_VALUES  
+{
+   TEMPORARY_REGISTER_OFFSET = SQ_ALU_SRC_GPR_BASE,
+   MAX_TEMPORARY_REGISTERS   = SQ_ALU_SRC_GPR_SIZE,
+   MAX_CONSTANT_REGISTERS    = SQ_ALU_SRC_CFILE_SIZE,
+   CFILE_REGISTER_OFFSET     = SQ_ALU_SRC_CFILE_BASE,
+   NUMBER_OF_INPUT_COLORS    = 2,
+   NUMBER_OF_OUTPUT_COLORS   = 8,
+   NUMBER_OF_TEXTURE_UNITS   = 16,
+   MEGA_FETCH_BYTES          = 32
+} HARDWARE_LIMIT_VALUES;
+
+typedef enum AddressMode 
+{
+    ADDR_ABSOLUTE          = 0,
+    ADDR_RELATIVE_A0       = 1,
+    ADDR_RELATIVE_FLI_0    = 2,
+    NUMBER_OF_ADDR_MOD     = 3
+} AddressMode;
+
+typedef enum SrcRegisterType 
+{
+    SRC_REG_TEMPORARY      = 0,
+    SRC_REG_INPUT          = 1,
+    SRC_REG_CONSTANT       = 2,
+    SRC_REG_ALT_TEMPORARY  = 3,
+    NUMBER_OF_SRC_REG_TYPE = 4
+} SrcRegisterType;
+
+typedef enum DstRegisterType 
+{
+    DST_REG_TEMPORARY      = 0,
+    DST_REG_A0             = 1,
+    DST_REG_OUT            = 2,
+    DST_REG_OUT_X_REPL     = 3,
+    DST_REG_ALT_TEMPORARY  = 4,
+    DST_REG_INPUT          = 5,
+    NUMBER_OF_DST_REG_TYPE = 6
+} DstRegisterType;
+
+typedef unsigned int BITS;
+
+typedef struct PVSDSTtag 
+{
+	BITS opcode:8;     //(:6)  //@@@ really should be 10 bits for OP2
+	BITS math:1;
+	BITS predicated:1; //10   //8
+	BITS pred_inv  :1; //11   //8
+
+	BITS rtype:3;
+	BITS reg:10;       //24   //20
+
+	BITS writex:1;
+	BITS writey:1;
+	BITS writez:1;
+	BITS writew:1;     //28
+
+	BITS op3:1;       // 29  Represents *_OP3_* ALU opcode
+
+	BITS dualop:1;    // 30  //26
+
+	BITS addrmode0:1; //31   //29
+	BITS addrmode1:1; //32
+} PVSDST;
+
+typedef struct PVSSRCtag 
+{
+	BITS rtype:4;            
+	BITS addrmode0:1;        
+	BITS reg:10;      //15     (8)
+	BITS swizzlex:3;
+	BITS swizzley:3;
+	BITS swizzlez:3;
+	BITS swizzlew:3;  //27        
+
+	BITS negx:1;
+	BITS negy:1;
+	BITS negz:1;
+	BITS negw:1;      //31
+	//BITS addrsel:2;
+	BITS addrmode1:1; //32
+} PVSSRC;
+
+typedef struct PVSMATHtag 
+{
+	BITS rtype:4;
+	BITS spare:1;
+	BITS reg:8;
+	BITS swizzlex:3;
+	BITS swizzley:3;
+	BITS dstoff:2; // 2 bits of dest offset into alt ram
+	BITS opcode:4;
+	BITS negx:1;
+	BITS negy:1;
+	BITS dstcomp:2; // select dest component
+	BITS spare2:3;
+} PVSMATH;
+
+typedef union PVSDWORDtag 
+{
+	BITS    bits;
+	PVSDST  dst;
+	PVSSRC  src;
+	PVSMATH math;
+	float   f;
+} PVSDWORD;
+
+typedef struct VAP_OUT_VTX_FMT_0tag 
+{
+	BITS pos:1;      // 0
+	BITS misc:1;
+	BITS clip_dist0:1;
+	BITS clip_dist1:1;
+	BITS pos_param:1; // 4
+
+	BITS color0:1;    // 5
+	BITS color1:1;
+	BITS color2:1;
+	BITS color3:1;
+	BITS color4:1;
+	BITS color5:1;
+	BITS color6:1;
+	BITS color7:1;
+
+	BITS normal:1;    
+
+	BITS depth:1;          // 14
+
+	BITS point_size:1;     // 15   
+	BITS edge_flag:1;      
+	BITS rta_index:1;      //     shares same channel as kill_flag
+	BITS kill_flag:1;
+	BITS viewport_index:1; // 19   
+
+	BITS resvd1:12;        // 20
+} VAP_OUT_VTX_FMT_0;
+
+typedef struct VAP_OUT_VTX_FMT_1tag 
+{
+	BITS tex0comp:3;
+	BITS tex1comp:3;
+	BITS tex2comp:3;
+	BITS tex3comp:3;
+	BITS tex4comp:3;
+	BITS tex5comp:3;
+	BITS tex6comp:3;
+	BITS tex7comp:3;
+
+	BITS resvd:8;
+} VAP_OUT_VTX_FMT_1;
+
+typedef struct VAP_OUT_VTX_FMT_2tag 
+{
+	BITS tex8comp :3;
+	BITS tex9comp :3;
+	BITS tex10comp:3;
+	BITS tex11comp:3;
+	BITS tex12comp:3;
+	BITS tex13comp:3;
+	BITS tex14comp:3;
+	BITS tex15comp:3;
+
+	BITS resvd:8;
+} VAP_OUT_VTX_FMT_2;
+
+typedef struct OUT_FRAGMENT_FMT_0tag 
+{
+	BITS color0:1;
+	BITS color1:1;
+	BITS color2:1;
+	BITS color3:1;
+	BITS color4:1;
+	BITS color5:1;
+	BITS color6:1;
+	BITS color7:1;
+
+	BITS depth:1;
+	BITS stencil_ref:1;
+	BITS coverage_to_mask:1;
+	BITS mask:1;
+
+	BITS resvd1:20;
+} OUT_FRAGMENT_FMT_0;
+
+typedef enum  CF_CLAUSE_TYPE 
+{
+   CF_EXPORT_CLAUSE,
+   CF_ALU_CLAUSE,
+   CF_TEX_CLAUSE,
+   CF_VTX_CLAUSE,
+   CF_OTHER_CLAUSE,
+   CF_EMPTY_CLAUSE,
+   NUMBER_CF_CLAUSE_TYPES
+} CF_CLAUSE_TYPE;
+
+enum 
+{
+    MAX_BOOL_CONSTANTS   = 32,
+    MAX_INT_CONSTANTS    = 32,
+    MAX_FLOAT_CONSTANTS  = 256,
+
+    FC_NONE = 0,
+    FC_IF = 1,
+    FC_LOOP = 2,
+    FC_REP = 3,
+
+    COND_NONE = 0,
+    COND_BOOL = 1,
+    COND_PRED = 2,
+    COND_ALU = 3,
+
+    SAFEDIST_TEX = 6, ///< safe distance for using result of texture lookup in alu or another tex lookup
+    SAFEDIST_ALU = 6 ///< the same for alu->fc
+};
+
+typedef struct FC_LEVEL 
+{
+	unsigned int           first; ///< first fc instruction on level (if, rep, loop)
+	unsigned int*          mid; ///< middle instructions - else or all breaks on this level
+	unsigned int           midLen;
+	unsigned int           type;
+	unsigned int           cond;
+	unsigned int           inv;
+	unsigned int           bpush; ///< 1 if first instruction does branch stack push
+			 int           id; ///< id of bool or int variable
+} FC_LEVEL;
+
+typedef struct VTX_FETCH_METHOD 
+{
+	GLboolean bEnableMini;
+	GLuint mega_fetch_remainder;
+} VTX_FETCH_METHOD;
+
+typedef struct r700_AssemblerBase 
+{
+	R700ControlFlowSXClause*      cf_last_export_ptr;
+	R700ControlFlowSXClause*      cf_current_export_clause_ptr;
+	R700ControlFlowALUClause*     cf_current_alu_clause_ptr;
+	R700ControlFlowGenericClause* cf_current_tex_clause_ptr;
+	R700ControlFlowGenericClause* cf_current_vtx_clause_ptr;
+	R700ControlFlowGenericClause* cf_current_cf_clause_ptr;
+
+    //Result shader
+    R700_Shader * pR700Shader;
+
+	// No clause has been created yet
+	CF_CLAUSE_TYPE cf_current_clause_type;
+
+	GLuint number_of_exports;
+	GLuint number_of_colorandz_exports;
+	GLuint number_of_export_opcodes;
+
+	PVSDWORD D;
+	PVSDWORD S[3];
+
+	unsigned int uLastPosUpdate;
+
+	OUT_FRAGMENT_FMT_0     fp_stOutFmt0;
+
+	unsigned int uIIns;
+	unsigned int uOIns;
+	unsigned int number_used_registers;
+	unsigned int uUsedConsts; 
+
+	// Fragment programs
+	unsigned int uiFP_AttributeMap[FRAG_ATTRIB_MAX];
+	unsigned int uiFP_OutputMap[FRAG_RESULT_MAX];
+	unsigned int uBoolConsts;
+	unsigned int uIntConsts;
+	unsigned int uInsts;
+	unsigned int uConsts;
+
+	// Vertex programs
+	unsigned char ucVP_AttributeMap[VERT_ATTRIB_MAX];
+	unsigned char ucVP_OutputMap[VERT_RESULT_MAX];
+
+    unsigned char * pucOutMask;
+
+	//-----------------------------------------------------------------------------------
+	// flow control members
+	//-----------------------------------------------------------------------------------
+	unsigned int FCSP;
+	FC_LEVEL fc_stack[32];
+
+	unsigned int branch_depth;
+	unsigned int max_branch_depth;
+
+	//-----------------------------------------------------------------------------------
+	// ArgSubst used in Assemble_Source() function
+	//-----------------------------------------------------------------------------------
+	int aArgSubst[4];
+
+    GLint hw_gpr[ NUMBER_OF_CYCLES ][ NUMBER_OF_COMPONENTS ];
+    GLint hw_cfile_addr[ NUMBER_OF_COMPONENTS ];
+    GLint hw_cfile_chan[ NUMBER_OF_COMPONENTS ];
+
+    GLuint uOutputs;
+  
+    GLint color_export_register_number[NUMBER_OF_OUTPUT_COLORS];
+	GLint depth_export_register_number;
+
+	GLint stencil_export_register_number;
+	GLint coverage_to_mask_export_register_number;
+	GLint mask_export_register_number;
+
+	GLuint starting_export_register_number;
+	GLuint starting_vfetch_register_number;
+	GLuint starting_temp_register_number;
+	GLuint uHelpReg;
+	GLuint uFirstHelpReg;
+
+	GLboolean input_position_is_used;
+	GLboolean input_normal_is_used;
+
+	GLboolean input_color_is_used[NUMBER_OF_INPUT_COLORS];
+  
+	GLboolean input_texture_unit_is_used[NUMBER_OF_TEXTURE_UNITS];
+  
+    R700VertexGenericFetch* vfetch_instruction_ptr_array[VERT_ATTRIB_MAX];
+  
+	GLuint number_of_inputs;
+
+    InstDeps *pInstDeps;
+
+    SHADER_PIPE_TYPE currentShaderType;
+    struct prog_instruction * pILInst;
+    GLuint             uiCurInst;
+    GLboolean   bR6xx;
+} r700_AssemblerBase;
+
+//Internal use
+BITS addrmode_PVSDST(PVSDST * pPVSDST);
+void setaddrmode_PVSDST(PVSDST * pPVSDST, BITS addrmode);
+void nomask_PVSDST(PVSDST * pPVSDST);
+BITS addrmode_PVSSRC(PVSSRC* pPVSSRC);
+void setaddrmode_PVSSRC(PVSSRC* pPVSSRC, BITS addrmode);
+void setswizzle_PVSSRC(PVSSRC* pPVSSRC, BITS swz);
+void noswizzle_PVSSRC(PVSSRC* pPVSSRC);
+void swizzleagain_PVSSRC(PVSSRC * pPVSSRC, BITS x, BITS y, BITS z, BITS w);
+void neg_PVSSRC(PVSSRC* pPVSSRC);
+void noneg_PVSSRC(PVSSRC* pPVSSRC);
+void flipneg_PVSSRC(PVSSRC* pPVSSRC);
+void zerocomp_PVSSRC(PVSSRC* pPVSSRC, int c);
+void onecomp_PVSSRC(PVSSRC* pPVSSRC, int c);
+BITS is_misc_component_exported(VAP_OUT_VTX_FMT_0* pOutVTXFmt0);
+BITS is_depth_component_exported(OUT_FRAGMENT_FMT_0* pFPOutFmt) ;
+GLboolean is_reduction_opcode(PVSDWORD * dest);
+GLuint GetSurfaceFormat(GLenum eType, GLuint nChannels, GLuint * pClient_size);
+
+unsigned int r700GetNumOperands(r700_AssemblerBase* pAsm);
+
+GLboolean IsTex(gl_inst_opcode Opcode);
+GLboolean IsAlu(gl_inst_opcode Opcode);
+int check_current_clause(r700_AssemblerBase* pAsm,
+					     CF_CLAUSE_TYPE      new_clause_type);
+GLboolean add_vfetch_instruction(r700_AssemblerBase*     pAsm,
+								 R700VertexInstruction*  vertex_instruction_ptr);
+GLboolean add_tex_instruction(r700_AssemblerBase*     pAsm,
+                              R700TextureInstruction* tex_instruction_ptr);
+GLboolean assemble_vfetch_instruction(r700_AssemblerBase* pAsm,
+								GLuint gl_client_id,
+                                GLuint destination_register,
+								GLuint number_of_elements,
+                                GLenum dataElementType,
+								VTX_FETCH_METHOD* pFetchMethod);
+GLuint gethelpr(r700_AssemblerBase* pAsm);
+void resethelpr(r700_AssemblerBase* pAsm);
+void checkop_init(r700_AssemblerBase* pAsm);
+GLboolean mov_temp(r700_AssemblerBase* pAsm, int src);
+GLboolean checkop1(r700_AssemblerBase* pAsm);
+GLboolean checkop2(r700_AssemblerBase* pAsm);
+GLboolean checkop3(r700_AssemblerBase* pAsm);
+GLboolean assemble_src(r700_AssemblerBase *pAsm,
+                       int src, 
+                       int fld);
+GLboolean assemble_dst(r700_AssemblerBase *pAsm);
+GLboolean tex_dst(r700_AssemblerBase *pAsm);
+GLboolean tex_src(r700_AssemblerBase *pAsm);
+GLboolean assemble_tex_instruction(r700_AssemblerBase *pAsm);
+void initialize(r700_AssemblerBase *pAsm);
+GLboolean assemble_alu_src(R700ALUInstruction*  alu_instruction_ptr,
+                           int                  source_index,
+                           PVSSRC*              pSource,
+                           BITS                 scalar_channel_index);
+GLboolean add_alu_instruction(r700_AssemblerBase* pAsm,
+                              R700ALUInstruction* alu_instruction_ptr,
+                              GLuint              contiguous_slots_needed);
+void get_src_properties(R700ALUInstruction*  alu_instruction_ptr,
+                        int                  source_index,
+                        BITS*                psrc_sel,
+                        BITS*                psrc_rel,
+                        BITS*                psrc_chan,
+                        BITS*                psrc_neg);
+int is_cfile(BITS sel);
+int is_const(BITS sel);
+int is_gpr(BITS sel);
+GLboolean reserve_cfile(r700_AssemblerBase* pAsm, 
+                        GLuint sel, 
+                        GLuint chan);
+GLboolean reserve_gpr(r700_AssemblerBase* pAsm, GLuint sel, GLuint chan, GLuint cycle);
+GLboolean cycle_for_scalar_bank_swizzle(const int swiz, const int sel, GLuint* pCycle);
+GLboolean cycle_for_vector_bank_swizzle(const int swiz, const int sel, GLuint* pCycle);
+GLboolean check_scalar(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr);
+GLboolean check_vector(r700_AssemblerBase* pAsm,
+                       R700ALUInstruction* alu_instruction_ptr);
+GLboolean assemble_alu_instruction(r700_AssemblerBase *pAsm);
+GLboolean next_ins(r700_AssemblerBase *pAsm);
+GLboolean assemble_math_function(r700_AssemblerBase* pAsm, BITS opcode);
+GLboolean assemble_ABS(r700_AssemblerBase *pAsm);
+GLboolean assemble_ADD(r700_AssemblerBase *pAsm);
+GLboolean assemble_BAD(char *opcode_str);
+GLboolean assemble_CMP(r700_AssemblerBase *pAsm);
+GLboolean assemble_COS(r700_AssemblerBase *pAsm);
+GLboolean assemble_DOT(r700_AssemblerBase *pAsm);
+GLboolean assemble_DST(r700_AssemblerBase *pAsm);
+GLboolean assemble_EX2(r700_AssemblerBase *pAsm);
+GLboolean assemble_FLR(r700_AssemblerBase *pAsm);
+GLboolean assemble_FLR_INT(r700_AssemblerBase *pAsm);
+GLboolean assemble_FRC(r700_AssemblerBase *pAsm);
+GLboolean assemble_KIL(r700_AssemblerBase *pAsm);
+GLboolean assemble_LG2(r700_AssemblerBase *pAsm);
+GLboolean assemble_LRP(r700_AssemblerBase *pAsm);
+GLboolean assemble_MAD(r700_AssemblerBase *pAsm);
+GLboolean assemble_LIT(r700_AssemblerBase *pAsm);
+GLboolean assemble_MAX(r700_AssemblerBase *pAsm);
+GLboolean assemble_MIN(r700_AssemblerBase *pAsm);
+GLboolean assemble_MOV(r700_AssemblerBase *pAsm);
+GLboolean assemble_MUL(r700_AssemblerBase *pAsm);
+GLboolean assemble_POW(r700_AssemblerBase *pAsm);
+GLboolean assemble_RCP(r700_AssemblerBase *pAsm);
+GLboolean assemble_RSQ(r700_AssemblerBase *pAsm);
+GLboolean assemble_SIN(r700_AssemblerBase *pAsm);
+GLboolean assemble_SCS(r700_AssemblerBase *pAsm);
+GLboolean assemble_SGE(r700_AssemblerBase *pAsm);
+GLboolean assemble_SLT(r700_AssemblerBase *pAsm);
+GLboolean assemble_STP(r700_AssemblerBase *pAsm);
+GLboolean assemble_TEX(r700_AssemblerBase *pAsm);
+GLboolean assemble_XPD(r700_AssemblerBase *pAsm);
+GLboolean assemble_EXPORT(r700_AssemblerBase *pAsm);
+GLboolean assemble_IF(r700_AssemblerBase *pAsm);
+GLboolean assemble_ENDIF(r700_AssemblerBase *pAsm);
+
+GLboolean Process_Export(r700_AssemblerBase* pAsm,
+                         GLuint type, 
+                         GLuint export_starting_index,
+                         GLuint export_count, 
+                         GLuint starting_register_number,
+                         GLboolean is_depth_export);
+GLboolean Move_Depth_Exports_To_Correct_Channels(r700_AssemblerBase *pAsm, 
+                                                 BITS depth_channel_select);
+
+
+//Interface
+GLboolean AssembleInstr(GLuint uiNumberInsts,
+                        struct prog_instruction *pILInst, 
+						r700_AssemblerBase *pR700AsmCode);
+GLboolean Process_Fragment_Exports(r700_AssemblerBase *pR700AsmCode, GLbitfield OutputsWritten);  
+GLboolean Process_Vertex_Exports(r700_AssemblerBase *pR700AsmCode, GLbitfield OutputsWritten);
+
+int       Init_r700_AssemblerBase(SHADER_PIPE_TYPE spt, r700_AssemblerBase* pAsm, R700_Shader* pShader);
+GLboolean Clean_Up_Assembler(r700_AssemblerBase *pR700AsmCode);
+
+#endif //_R700_ASSEMBLER_H_
diff --git a/src/mesa/drivers/dri/r600/r700_chip.c b/src/mesa/drivers/dri/r600/r700_chip.c
new file mode 100644
index 00000000000..087d17312e9
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_chip.c
@@ -0,0 +1,724 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#include "main/imports.h"
+#include "main/glheader.h"
+
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r700_state.h"
+#include "r600_tex.h"
+#include "r700_oglprog.h"
+#include "r700_fragprog.h"
+#include "r700_vertprog.h"
+#include "r700_ioctl.h"
+
+#define LINK_STATES(reg)                                            \
+do                                                                  \
+{                                                                   \
+    pStateListWork->puiValue = (unsigned int*)&(r700->reg);         \
+    pStateListWork->unOffset = mm##reg - ASIC_CONTEXT_BASE_INDEX; \
+    pStateListWork->pNext    = pStateListWork + 1;                  \
+    pStateListWork++;                                               \
+}while(0)
+
+GLboolean r700InitChipObject(context_t *context)
+{
+    ContextState * pStateListWork;
+
+    R700_CHIP_CONTEXT *r700 = &context->hw;
+
+    /* init state list */
+    r700->pStateList = (ContextState*) MALLOC (sizeof(ContextState)*sizeof(R700_CHIP_CONTEXT)/sizeof(unsigned int));
+    pStateListWork = r700->pStateList;
+
+    // misc
+    LINK_STATES(TA_CNTL_AUX);
+    LINK_STATES(VC_ENHANCE);
+    LINK_STATES(SQ_DYN_GPR_CNTL_PS_FLUSH_REQ);
+    LINK_STATES(DB_DEBUG);
+    LINK_STATES(DB_WATERMARKS);
+
+    // SC
+    LINK_STATES(PA_SC_SCREEN_SCISSOR_TL);
+    LINK_STATES(PA_SC_SCREEN_SCISSOR_BR);
+    LINK_STATES(PA_SC_WINDOW_OFFSET);
+    LINK_STATES(PA_SC_WINDOW_SCISSOR_TL);
+    LINK_STATES(PA_SC_WINDOW_SCISSOR_BR);
+    LINK_STATES(PA_SC_CLIPRECT_RULE);
+    LINK_STATES(PA_SC_CLIPRECT_0_TL);
+    LINK_STATES(PA_SC_CLIPRECT_0_BR);
+    LINK_STATES(PA_SC_CLIPRECT_1_TL);
+    LINK_STATES(PA_SC_CLIPRECT_1_BR);
+    LINK_STATES(PA_SC_CLIPRECT_2_TL);
+    LINK_STATES(PA_SC_CLIPRECT_2_BR);
+    LINK_STATES(PA_SC_CLIPRECT_3_TL);
+    LINK_STATES(PA_SC_CLIPRECT_3_BR);
+    LINK_STATES(PA_SC_EDGERULE);
+    LINK_STATES(PA_SC_GENERIC_SCISSOR_TL);
+    LINK_STATES(PA_SC_GENERIC_SCISSOR_BR);
+    LINK_STATES(PA_SC_LINE_STIPPLE);
+    LINK_STATES(PA_SC_MPASS_PS_CNTL);
+    LINK_STATES(PA_SC_MODE_CNTL);
+    LINK_STATES(PA_SC_LINE_CNTL);
+    LINK_STATES(PA_SC_AA_CONFIG);
+    LINK_STATES(PA_SC_AA_SAMPLE_LOCS_MCTX);
+    LINK_STATES(PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX);
+    LINK_STATES(PA_SC_AA_MASK);
+
+    // SU
+    LINK_STATES(PA_SU_POINT_SIZE);
+    LINK_STATES(PA_SU_POINT_MINMAX);
+    LINK_STATES(PA_SU_LINE_CNTL);
+    LINK_STATES(PA_SU_SC_MODE_CNTL);
+    LINK_STATES(PA_SU_VTX_CNTL);
+    LINK_STATES(PA_SU_POLY_OFFSET_DB_FMT_CNTL);
+    LINK_STATES(PA_SU_POLY_OFFSET_CLAMP);
+    LINK_STATES(PA_SU_POLY_OFFSET_FRONT_SCALE);
+    LINK_STATES(PA_SU_POLY_OFFSET_FRONT_OFFSET);
+    LINK_STATES(PA_SU_POLY_OFFSET_BACK_SCALE);
+    LINK_STATES(PA_SU_POLY_OFFSET_BACK_OFFSET);
+
+    // CL
+    LINK_STATES(PA_CL_CLIP_CNTL);
+    LINK_STATES(PA_CL_VTE_CNTL);
+    LINK_STATES(PA_CL_VS_OUT_CNTL);
+    LINK_STATES(PA_CL_NANINF_CNTL);
+    LINK_STATES(PA_CL_GB_VERT_CLIP_ADJ);
+    LINK_STATES(PA_CL_GB_VERT_DISC_ADJ);
+    LINK_STATES(PA_CL_GB_HORZ_CLIP_ADJ);
+    LINK_STATES(PA_CL_GB_HORZ_DISC_ADJ);
+
+    // CB
+    LINK_STATES(CB_CLEAR_RED_R6XX);
+    LINK_STATES(CB_CLEAR_GREEN_R6XX);
+    LINK_STATES(CB_CLEAR_BLUE_R6XX);
+    LINK_STATES(CB_CLEAR_ALPHA_R6XX);
+    LINK_STATES(CB_TARGET_MASK);
+    LINK_STATES(CB_SHADER_MASK);
+    LINK_STATES(CB_BLEND_RED);
+    LINK_STATES(CB_BLEND_GREEN);
+    LINK_STATES(CB_BLEND_BLUE);
+    LINK_STATES(CB_BLEND_ALPHA);
+    LINK_STATES(CB_FOG_RED_R6XX);
+    LINK_STATES(CB_FOG_GREEN_R6XX);
+    LINK_STATES(CB_FOG_BLUE_R6XX);
+    LINK_STATES(CB_SHADER_CONTROL);
+    LINK_STATES(CB_COLOR_CONTROL);
+    LINK_STATES(CB_CLRCMP_CONTROL);
+    LINK_STATES(CB_CLRCMP_SRC);
+    LINK_STATES(CB_CLRCMP_DST);
+    LINK_STATES(CB_CLRCMP_MSK);
+    LINK_STATES(CB_BLEND_CONTROL);
+
+    // SX
+    LINK_STATES(SX_MISC);
+    LINK_STATES(SX_ALPHA_TEST_CONTROL);
+
+    // VGT
+    LINK_STATES(VGT_MAX_VTX_INDX);
+    LINK_STATES(VGT_MIN_VTX_INDX);
+    LINK_STATES(VGT_INDX_OFFSET);
+    LINK_STATES(VGT_MULTI_PRIM_IB_RESET_INDX);
+    LINK_STATES(VGT_OUTPUT_PATH_CNTL);
+    LINK_STATES(VGT_HOS_CNTL);
+    LINK_STATES(VGT_HOS_MAX_TESS_LEVEL);
+    LINK_STATES(VGT_HOS_MIN_TESS_LEVEL);
+    LINK_STATES(VGT_HOS_REUSE_DEPTH);
+    LINK_STATES(VGT_GROUP_PRIM_TYPE);
+    LINK_STATES(VGT_GROUP_FIRST_DECR);
+    LINK_STATES(VGT_GROUP_DECR);
+    LINK_STATES(VGT_GROUP_VECT_0_CNTL);
+    LINK_STATES(VGT_GROUP_VECT_1_CNTL);
+    LINK_STATES(VGT_GROUP_VECT_0_FMT_CNTL);
+    LINK_STATES(VGT_GROUP_VECT_1_FMT_CNTL);
+    LINK_STATES(VGT_GS_MODE);
+    LINK_STATES(VGT_PRIMITIVEID_EN);
+    LINK_STATES(VGT_DMA_NUM_INSTANCES);
+    LINK_STATES(VGT_MULTI_PRIM_IB_RESET_EN);
+    LINK_STATES(VGT_INSTANCE_STEP_RATE_0);
+    LINK_STATES(VGT_INSTANCE_STEP_RATE_1);
+    LINK_STATES(VGT_STRMOUT_EN);
+    LINK_STATES(VGT_REUSE_OFF);
+    LINK_STATES(VGT_VTX_CNT_EN);
+    LINK_STATES(VGT_STRMOUT_BUFFER_EN);
+
+    LINK_STATES(SQ_VTX_SEMANTIC_0);
+    LINK_STATES(SQ_VTX_SEMANTIC_1);
+    LINK_STATES(SQ_VTX_SEMANTIC_2);
+    LINK_STATES(SQ_VTX_SEMANTIC_3);
+    LINK_STATES(SQ_VTX_SEMANTIC_4);
+    LINK_STATES(SQ_VTX_SEMANTIC_5);
+    LINK_STATES(SQ_VTX_SEMANTIC_6);
+    LINK_STATES(SQ_VTX_SEMANTIC_7);
+    LINK_STATES(SQ_VTX_SEMANTIC_8);
+    LINK_STATES(SQ_VTX_SEMANTIC_9);
+    LINK_STATES(SQ_VTX_SEMANTIC_10);
+    LINK_STATES(SQ_VTX_SEMANTIC_11);
+    LINK_STATES(SQ_VTX_SEMANTIC_12);
+    LINK_STATES(SQ_VTX_SEMANTIC_13);
+    LINK_STATES(SQ_VTX_SEMANTIC_14);
+    LINK_STATES(SQ_VTX_SEMANTIC_15);
+    LINK_STATES(SQ_VTX_SEMANTIC_16);
+    LINK_STATES(SQ_VTX_SEMANTIC_17);
+    LINK_STATES(SQ_VTX_SEMANTIC_18);
+    LINK_STATES(SQ_VTX_SEMANTIC_19);
+    LINK_STATES(SQ_VTX_SEMANTIC_20);
+    LINK_STATES(SQ_VTX_SEMANTIC_21);
+    LINK_STATES(SQ_VTX_SEMANTIC_22);
+    LINK_STATES(SQ_VTX_SEMANTIC_23);
+    LINK_STATES(SQ_VTX_SEMANTIC_24);
+    LINK_STATES(SQ_VTX_SEMANTIC_25);
+    LINK_STATES(SQ_VTX_SEMANTIC_26);
+    LINK_STATES(SQ_VTX_SEMANTIC_27);
+    LINK_STATES(SQ_VTX_SEMANTIC_28);
+    LINK_STATES(SQ_VTX_SEMANTIC_29);
+    LINK_STATES(SQ_VTX_SEMANTIC_30);
+    LINK_STATES(SQ_VTX_SEMANTIC_31);
+
+    // SPI
+    LINK_STATES(SPI_VS_OUT_ID_0);
+    LINK_STATES(SPI_VS_OUT_ID_1);
+    LINK_STATES(SPI_VS_OUT_ID_2);
+    LINK_STATES(SPI_VS_OUT_ID_3);
+    LINK_STATES(SPI_VS_OUT_ID_4);
+    LINK_STATES(SPI_VS_OUT_ID_5);
+    LINK_STATES(SPI_VS_OUT_ID_6);
+    LINK_STATES(SPI_VS_OUT_ID_7);
+    LINK_STATES(SPI_VS_OUT_ID_8);
+    LINK_STATES(SPI_VS_OUT_ID_9);
+
+    LINK_STATES(SPI_PS_INPUT_CNTL_0);
+    LINK_STATES(SPI_PS_INPUT_CNTL_1);
+    LINK_STATES(SPI_PS_INPUT_CNTL_2);
+    LINK_STATES(SPI_PS_INPUT_CNTL_3);
+    LINK_STATES(SPI_PS_INPUT_CNTL_4);
+    LINK_STATES(SPI_PS_INPUT_CNTL_5);
+    LINK_STATES(SPI_PS_INPUT_CNTL_6);
+    LINK_STATES(SPI_PS_INPUT_CNTL_7);
+    LINK_STATES(SPI_PS_INPUT_CNTL_8);
+    LINK_STATES(SPI_PS_INPUT_CNTL_9);
+    LINK_STATES(SPI_PS_INPUT_CNTL_10);
+    LINK_STATES(SPI_PS_INPUT_CNTL_11);
+    LINK_STATES(SPI_PS_INPUT_CNTL_12);
+    LINK_STATES(SPI_PS_INPUT_CNTL_13);
+    LINK_STATES(SPI_PS_INPUT_CNTL_14);
+    LINK_STATES(SPI_PS_INPUT_CNTL_15);
+    LINK_STATES(SPI_PS_INPUT_CNTL_16);
+    LINK_STATES(SPI_PS_INPUT_CNTL_17);
+    LINK_STATES(SPI_PS_INPUT_CNTL_18);
+    LINK_STATES(SPI_PS_INPUT_CNTL_19);
+    LINK_STATES(SPI_PS_INPUT_CNTL_20);
+    LINK_STATES(SPI_PS_INPUT_CNTL_21);
+    LINK_STATES(SPI_PS_INPUT_CNTL_22);
+    LINK_STATES(SPI_PS_INPUT_CNTL_23);
+    LINK_STATES(SPI_PS_INPUT_CNTL_24);
+    LINK_STATES(SPI_PS_INPUT_CNTL_25);
+    LINK_STATES(SPI_PS_INPUT_CNTL_26);
+    LINK_STATES(SPI_PS_INPUT_CNTL_27);
+    LINK_STATES(SPI_PS_INPUT_CNTL_28);
+    LINK_STATES(SPI_PS_INPUT_CNTL_29);
+    LINK_STATES(SPI_PS_INPUT_CNTL_30);
+    LINK_STATES(SPI_PS_INPUT_CNTL_31);
+
+    LINK_STATES(SPI_VS_OUT_CONFIG);
+    LINK_STATES(SPI_THREAD_GROUPING);
+    LINK_STATES(SPI_PS_IN_CONTROL_0);
+    LINK_STATES(SPI_PS_IN_CONTROL_1);
+    LINK_STATES(SPI_INTERP_CONTROL_0);
+    LINK_STATES(SPI_INPUT_Z);
+    LINK_STATES(SPI_FOG_CNTL);
+    LINK_STATES(SPI_FOG_FUNC_SCALE);
+    LINK_STATES(SPI_FOG_FUNC_BIAS);
+
+    // SQ
+    LINK_STATES(SQ_ESGS_RING_ITEMSIZE);
+    LINK_STATES(SQ_GSVS_RING_ITEMSIZE);
+    LINK_STATES(SQ_ESTMP_RING_ITEMSIZE);
+    LINK_STATES(SQ_GSTMP_RING_ITEMSIZE);
+    LINK_STATES(SQ_VSTMP_RING_ITEMSIZE);
+    LINK_STATES(SQ_PSTMP_RING_ITEMSIZE);
+    LINK_STATES(SQ_FBUF_RING_ITEMSIZE);
+    LINK_STATES(SQ_REDUC_RING_ITEMSIZE);
+    //LINK_STATES(SQ_GS_VERT_ITEMSIZE);
+
+    pStateListWork->puiValue = (unsigned int*)&(r700->SQ_GS_VERT_ITEMSIZE);
+    pStateListWork->unOffset = mmSQ_GS_VERT_ITEMSIZE - ASIC_CONTEXT_BASE_INDEX;
+    pStateListWork->pNext    = NULL;  /* END OF STATE LIST */
+
+    return GL_TRUE;
+}
+
+void r700SetupVTXConstants(GLcontext  * ctx,
+			   unsigned int nStreamID,
+			   void *       pAos,
+			   unsigned int size,      /* number of elements in vector */
+			   unsigned int stride,
+			   unsigned int count)     /* number of vectors in stream */
+{
+    context_t *context = R700_CONTEXT(ctx);
+    uint32_t *dest;
+    struct radeon_aos * paos = (struct radeon_aos *)pAos;
+    offset_modifiers offset_mod = {NO_SHIFT, 0, 0xFFFFFFFF};
+
+    BATCH_LOCALS(&context->radeon);
+
+    unsigned int uSQ_VTX_CONSTANT_WORD0_0;
+    unsigned int uSQ_VTX_CONSTANT_WORD1_0;
+    unsigned int uSQ_VTX_CONSTANT_WORD2_0 = 0;
+    unsigned int uSQ_VTX_CONSTANT_WORD3_0 = 0;
+    unsigned int uSQ_VTX_CONSTANT_WORD6_0 = 0;
+
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV610) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV620) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS780) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV710))
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, TC_ACTION_ENA_bit);
+    else
+	    r700SyncSurf(context, paos->bo, RADEON_GEM_DOMAIN_GTT, 0, VC_ACTION_ENA_bit);
+
+    uSQ_VTX_CONSTANT_WORD0_0 = paos->offset;
+    uSQ_VTX_CONSTANT_WORD1_0 = count * (size * 4) - 1;
+
+    uSQ_VTX_CONSTANT_WORD2_0 |= 0 << BASE_ADDRESS_HI_shift /* TODO */
+	    |stride << SQ_VTX_CONSTANT_WORD2_0__STRIDE_shift
+	    |GetSurfaceFormat(GL_FLOAT, size, NULL) << SQ_VTX_CONSTANT_WORD2_0__DATA_FORMAT_shift /* TODO : trace back api for initial data type, not only GL_FLOAT */
+	    |SQ_NUM_FORMAT_SCALED << SQ_VTX_CONSTANT_WORD2_0__NUM_FORMAT_ALL_shift
+	    |SQ_VTX_CONSTANT_WORD2_0__FORMAT_COMP_ALL_bit;
+
+    uSQ_VTX_CONSTANT_WORD3_0 |= 1 << MEM_REQUEST_SIZE_shift;
+
+    uSQ_VTX_CONSTANT_WORD6_0 |= SQ_TEX_VTX_VALID_BUFFER << SQ_TEX_RESOURCE_WORD6_0__TYPE_shift;
+
+    BEGIN_BATCH_NO_AUTOSTATE(9);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
+    R600_OUT_BATCH((nStreamID + SQ_FETCH_RESOURCE_VS_OFFSET) * FETCH_RESOURCE_STRIDE);
+
+    R600_OUT_BATCH_RELOC(uSQ_VTX_CONSTANT_WORD0_0,
+                         paos->bo,
+                         uSQ_VTX_CONSTANT_WORD0_0,
+                         RADEON_GEM_DOMAIN_GTT, 0, 0, &offset_mod);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD1_0);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD2_0);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD3_0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(0);
+    R600_OUT_BATCH(uSQ_VTX_CONSTANT_WORD6_0);
+
+    END_BATCH();
+    COMMIT_BATCH();
+
+}
+
+int r700SetupStreams(GLcontext * ctx)
+{
+    context_t         *context = R700_CONTEXT(ctx);
+
+    BATCH_LOCALS(&context->radeon);
+
+    struct r700_vertex_program *vpc
+             = (struct r700_vertex_program *)ctx->VertexProgram._Current;
+
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+	struct vertex_buffer *vb = &tnl->vb;
+
+    unsigned int unBit;
+    unsigned int i, j = 0;
+
+    BEGIN_BATCH_NO_AUTOSTATE(6);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
+	R600_OUT_BATCH(mmSQ_VTX_BASE_VTX_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CTL_CONST, 1));
+    R600_OUT_BATCH(mmSQ_VTX_START_INST_LOC - ASIC_CTL_CONST_BASE_INDEX);
+    R600_OUT_BATCH(0);
+    END_BATCH();
+    COMMIT_BATCH();
+
+    context->radeon.tcl.aos_count = 0;
+	for(i=0; i<VERT_ATTRIB_MAX; i++)
+	{
+		unBit = 1 << i;
+		if(vpc->mesa_program.Base.InputsRead & unBit)
+		{
+			rcommon_emit_vector(ctx,
+					    &context->radeon.tcl.aos[j],
+					    vb->AttribPtr[i]->data,
+					    vb->AttribPtr[i]->size,
+					    vb->AttribPtr[i]->stride,
+					    vb->Count);
+
+			/* currently aos are packed */
+			r700SetupVTXConstants(ctx,
+					      i,
+					      (void*)(&context->radeon.tcl.aos[j]),
+					      (unsigned int)context->radeon.tcl.aos[j].components,
+					      (unsigned int)context->radeon.tcl.aos[j].stride * 4,
+					      (unsigned int)context->radeon.tcl.aos[j].count);
+			j++;
+			context->radeon.tcl.aos_count++;
+		}
+	}
+
+    return R600_FALLBACK_NONE;
+}
+
+GLboolean r700SendContextStates(context_t *context)
+{
+    BATCH_LOCALS(&context->radeon);
+
+    R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+
+    ContextState * pState = r700->pStateList;
+    ContextState * pInit;
+    unsigned int   toSend;
+    unsigned int   ui;
+
+    while(NULL != pState)
+    {
+        toSend = 1;
+
+        pInit = pState;
+
+	while(NULL != pState->pNext)
+	{
+                if ((pState->pNext->unOffset - pState->unOffset) > 1)
+                {
+			break;
+                }
+                else
+                {
+			pState = pState->pNext;
+			toSend++;
+                }
+	}
+
+        pState = pState->pNext;
+
+        BEGIN_BATCH_NO_AUTOSTATE(toSend + 2);
+        R600_OUT_BATCH_REGSEQ(((pInit->unOffset + ASIC_CONTEXT_BASE_INDEX)<<2), toSend);
+        for(ui=0; ui<toSend; ui++)
+        {
+                R600_OUT_BATCH(*(pInit->puiValue));
+		pInit = pInit->pNext;
+        };
+        END_BATCH();
+    };
+    COMMIT_BATCH();
+
+    return GL_TRUE;
+}
+
+
+GLboolean r700SendDepthTargetState(context_t *context, int id)
+{
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	offset_modifiers offset_mod;
+	BATCH_LOCALS(&context->radeon);
+
+	rrb = radeon_get_depthbuffer(&context->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return GL_FALSE;
+	}
+
+	offset_mod.shift     = NO_SHIFT;
+	offset_mod.shiftbits = 0;
+	offset_mod.mask      = 0xFFFFFFFF;
+
+        BEGIN_BATCH_NO_AUTOSTATE(9);
+	R600_OUT_BATCH_REGSEQ(DB_DEPTH_SIZE, 2);
+	R600_OUT_BATCH(r700->DB_DEPTH_SIZE.u32All);
+	R600_OUT_BATCH(r700->DB_DEPTH_VIEW.u32All);
+	R600_OUT_BATCH_REGSEQ(DB_DEPTH_BASE, 3);
+	R600_OUT_BATCH_RELOC(r700->DB_DEPTH_BASE.u32All,
+			     rrb->bo,
+			     r700->DB_DEPTH_BASE.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0, &offset_mod);
+	R600_OUT_BATCH(r700->DB_DEPTH_INFO.u32All);
+	R600_OUT_BATCH(r700->DB_HTILE_DATA_BASE.u32All);
+        END_BATCH();
+
+        BEGIN_BATCH_NO_AUTOSTATE(24);
+	R600_OUT_BATCH_REGSEQ(DB_STENCIL_CLEAR, 2);
+	R600_OUT_BATCH(r700->DB_STENCIL_CLEAR.u32All);
+	R600_OUT_BATCH(r700->DB_DEPTH_CLEAR.u32All);
+
+	R600_OUT_BATCH_REGSEQ(DB_STENCILREFMASK, 2);
+	R600_OUT_BATCH(r700->DB_STENCILREFMASK.u32All);
+	R600_OUT_BATCH(r700->DB_STENCILREFMASK_BF.u32All);
+
+	R600_OUT_BATCH_REGVAL(DB_DEPTH_CONTROL, r700->DB_DEPTH_CONTROL.u32All);
+	R600_OUT_BATCH_REGVAL(DB_SHADER_CONTROL, r700->DB_SHADER_CONTROL.u32All);
+
+	R600_OUT_BATCH_REGSEQ(DB_RENDER_CONTROL, 2);
+	R600_OUT_BATCH(r700->DB_RENDER_CONTROL.u32All);
+	R600_OUT_BATCH(r700->DB_RENDER_OVERRIDE.u32All);
+
+	R600_OUT_BATCH_REGVAL(DB_HTILE_SURFACE, r700->DB_HTILE_SURFACE.u32All);
+	R600_OUT_BATCH_REGVAL(DB_ALPHA_TO_MASK, r700->DB_ALPHA_TO_MASK.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+	r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
+		     DB_ACTION_ENA_bit | DB_DEST_BASE_ENA_bit);
+
+	return GL_TRUE;
+}
+
+GLboolean r700SendRenderTargetState(context_t *context, int id)
+{
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	offset_modifiers offset_mod;
+	BATCH_LOCALS(&context->radeon);
+
+	rrb = radeon_get_colorbuffer(&context->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return GL_FALSE;
+	}
+
+	if (id > R700_MAX_RENDER_TARGETS)
+		return GL_FALSE;
+
+	if (!r700->render_target[id].enabled)
+		return GL_FALSE;
+
+	offset_mod.shift     = NO_SHIFT;
+	offset_mod.shiftbits = 0;
+	offset_mod.mask      = 0xFFFFFFFF;
+
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+	R600_OUT_BATCH_REGSEQ(CB_COLOR0_BASE + (4 * id), 1);
+	R600_OUT_BATCH_RELOC(r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     rrb->bo,
+			     r700->render_target[id].CB_COLOR0_BASE.u32All,
+			     0, RADEON_GEM_DOMAIN_VRAM, 0, &offset_mod);
+        END_BATCH();
+
+	if ((context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) &&
+	    (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)) {
+		BEGIN_BATCH_NO_AUTOSTATE(2);
+		R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_BASE_UPDATE, 0));
+		R600_OUT_BATCH((2 << id));
+		END_BATCH();
+	}
+
+        BEGIN_BATCH_NO_AUTOSTATE(18);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_SIZE + (4 * id), r700->render_target[id].CB_COLOR0_SIZE.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_VIEW + (4 * id), r700->render_target[id].CB_COLOR0_VIEW.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_INFO + (4 * id), r700->render_target[id].CB_COLOR0_INFO.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_TILE + (4 * id), r700->render_target[id].CB_COLOR0_TILE.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_FRAG + (4 * id), r700->render_target[id].CB_COLOR0_FRAG.u32All);
+	R600_OUT_BATCH_REGVAL(CB_COLOR0_MASK + (4 * id), r700->render_target[id].CB_COLOR0_MASK.u32All);
+        END_BATCH();
+
+	if (context->radeon.radeonScreen->chip_family > CHIP_FAMILY_R600) {
+		BEGIN_BATCH_NO_AUTOSTATE(3);
+		R600_OUT_BATCH_REGVAL(CB_BLEND0_CONTROL + (4 * id), r700->render_target[id].CB_BLEND0_CONTROL.u32All);
+		END_BATCH();
+	}
+
+	COMMIT_BATCH();
+
+	r700SyncSurf(context, rrb->bo, 0, RADEON_GEM_DOMAIN_VRAM,
+		     CB_ACTION_ENA_bit | (1 << (id + 6)));
+
+	return GL_TRUE;
+}
+
+GLboolean r700SendPSState(context_t *context)
+{
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	struct radeon_bo * pbo;
+	offset_modifiers offset_mod;
+	BATCH_LOCALS(&context->radeon);
+
+	pbo = (struct radeon_bo *)r700GetActiveFpShaderBo(GL_CONTEXT(context));
+
+	offset_mod.shift     = NO_SHIFT;
+	offset_mod.shiftbits = 0;
+	offset_mod.mask      = 0xFFFFFFFF;
+
+	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
+
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_PS, 1);
+	R600_OUT_BATCH_RELOC(r700->ps.SQ_PGM_START_PS.u32All,
+			     pbo,
+			     r700->ps.SQ_PGM_START_PS.u32All,
+			     RADEON_GEM_DOMAIN_GTT, 0, 0, &offset_mod);
+	END_BATCH();
+
+        BEGIN_BATCH_NO_AUTOSTATE(9);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_PS, r700->ps.SQ_PGM_RESOURCES_PS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_EXPORTS_PS, r700->ps.SQ_PGM_EXPORTS_PS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_PS, r700->ps.SQ_PGM_CF_OFFSET_PS.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+	return GL_TRUE;
+}
+
+GLboolean r700SendVSState(context_t *context)
+{
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	struct radeon_bo * pbo;
+	offset_modifiers offset_mod;
+	BATCH_LOCALS(&context->radeon);
+
+	pbo = (struct radeon_bo *)r700GetActiveVpShaderBo(GL_CONTEXT(context));
+
+	offset_mod.shift     = NO_SHIFT;
+	offset_mod.shiftbits = 0;
+	offset_mod.mask      = 0xFFFFFFFF;
+
+	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
+
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_VS, 1);
+	R600_OUT_BATCH_RELOC(r700->vs.SQ_PGM_START_VS.u32All,
+			     pbo,
+			     r700->vs.SQ_PGM_START_VS.u32All,
+			     RADEON_GEM_DOMAIN_GTT, 0, 0, &offset_mod);
+	END_BATCH();
+
+        BEGIN_BATCH_NO_AUTOSTATE(6);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_VS, r700->vs.SQ_PGM_RESOURCES_VS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_VS, r700->vs.SQ_PGM_CF_OFFSET_VS.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+	return GL_TRUE;
+}
+
+GLboolean r700SendFSState(context_t *context)
+{
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	struct radeon_bo * pbo;
+	offset_modifiers offset_mod;
+	BATCH_LOCALS(&context->radeon);
+
+	/* XXX fixme
+	 * R6xx chips require a FS be emitted, even if it's not used.
+	 * since we aren't using FS yet, just send the VS address to make
+	 * the kernel command checker happy
+	 */
+	pbo = (struct radeon_bo *)r700GetActiveVpShaderBo(GL_CONTEXT(context));
+	r700->fs.SQ_PGM_START_FS.u32All = r700->vs.SQ_PGM_START_VS.u32All;
+	r700->fs.SQ_PGM_RESOURCES_FS.u32All = 0;
+	r700->fs.SQ_PGM_CF_OFFSET_FS.u32All = 0;
+	/* XXX */
+
+	offset_mod.shift     = NO_SHIFT;
+	offset_mod.shiftbits = 0;
+	offset_mod.mask      = 0xFFFFFFFF;
+
+	r700SyncSurf(context, pbo, RADEON_GEM_DOMAIN_GTT, 0, SH_ACTION_ENA_bit);
+
+        BEGIN_BATCH_NO_AUTOSTATE(3);
+	R600_OUT_BATCH_REGSEQ(SQ_PGM_START_FS, 1);
+	R600_OUT_BATCH_RELOC(r700->fs.SQ_PGM_START_FS.u32All,
+			     pbo,
+			     r700->fs.SQ_PGM_START_FS.u32All,
+			     RADEON_GEM_DOMAIN_GTT, 0, 0, &offset_mod);
+	END_BATCH();
+
+        BEGIN_BATCH_NO_AUTOSTATE(6);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_RESOURCES_FS, r700->fs.SQ_PGM_RESOURCES_FS.u32All);
+	R600_OUT_BATCH_REGVAL(SQ_PGM_CF_OFFSET_FS, r700->fs.SQ_PGM_CF_OFFSET_FS.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+	return GL_TRUE;
+}
+
+GLboolean r700SendViewportState(context_t *context, int id)
+{
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	struct radeon_renderbuffer *rrb;
+	offset_modifiers offset_mod;
+	BATCH_LOCALS(&context->radeon);
+
+	if (id > R700_MAX_VIEWPORTS)
+		return GL_FALSE;
+
+	if (!r700->viewport[id].enabled)
+		return GL_FALSE;
+
+        BEGIN_BATCH_NO_AUTOSTATE(16);
+	R600_OUT_BATCH_REGSEQ(PA_SC_VPORT_SCISSOR_0_TL + (8 * id), 2);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_BR.u32All);
+	R600_OUT_BATCH_REGSEQ(PA_SC_VPORT_ZMIN_0 + (8 * id), 2);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_ZMIN_0.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_SC_VPORT_ZMAX_0.u32All);
+	R600_OUT_BATCH_REGSEQ(PA_CL_VPORT_XSCALE_0 + (24 * id), 6);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_XSCALE.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_XOFFSET.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_YSCALE.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_YOFFSET.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_ZSCALE.u32All);
+	R600_OUT_BATCH(r700->viewport[id].PA_CL_VPORT_ZOFFSET.u32All);
+        END_BATCH();
+
+	COMMIT_BATCH();
+
+	return GL_TRUE;
+}
+
+GLboolean r700SendSQConfig(context_t *context)
+{
+	R700_CHIP_CONTEXT *r700 = R700_CONTEXT_STATES(context);
+	BATCH_LOCALS(&context->radeon);
+
+        BEGIN_BATCH_NO_AUTOSTATE(8);
+	R600_OUT_BATCH_REGSEQ(SQ_CONFIG, 6);
+	R600_OUT_BATCH(r700->sq_config.SQ_CONFIG.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All);
+	R600_OUT_BATCH(r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All);
+        END_BATCH();
+	COMMIT_BATCH();
+
+	return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_chip.h b/src/mesa/drivers/dri/r600/r700_chip.h
new file mode 100644
index 00000000000..04af4bc3b98
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_chip.h
@@ -0,0 +1,521 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_CHIP_H_
+#define _R700_CHIP_H_
+
+#include "r600_context.h"
+
+#include "r600_reg.h"
+#include "r600_reg_auto_r6xx.h"
+#include "r600_reg_r6xx.h"
+#include "r600_reg_r7xx.h"
+
+#include "r700_chipoffset.h"
+
+#define SETfield(x, val, shift, mask)  ( (x) = ((x) & ~(mask)) | ((val) << (shift)) ) /* u32All */
+#define CLEARfield(x, mask)            ( (x) &= ~(mask) )
+#define SETbit(x, bit)                 ( (x) |= (bit) )
+#define CLEARbit(x, bit)               ( (x) &= ~(bit) )
+
+#define R700_TEXTURE_NUMBERUNITS 16
+#define R700_MAX_RENDER_TARGETS  8
+#define R700_MAX_VIEWPORTS       16
+#define R700_MAX_SHADER_EXPORTS  32
+#define R700_MAX_UCP             6
+
+/* Enum not show in r600_*.h */
+
+#define FETCH_RESOURCE_STRIDE 7
+
+#define ASIC_CONFIG_BASE_INDEX    0x2000
+#define ASIC_CONTEXT_BASE_INDEX   0xA000
+#define ASIC_CTL_CONST_BASE_INDEX 0xF3FC
+
+
+enum 
+{
+    SQ_ABSOLUTE                              = 0x00000000,
+    SQ_RELATIVE                              = 0x00000001,
+};
+
+enum 
+{
+    SQ_ALU_SCL_210                           = 0x00000000,
+    SQ_ALU_SCL_122                           = 0x00000001,
+    SQ_ALU_SCL_212                           = 0x00000002,
+    SQ_ALU_SCL_221                           = 0x00000003,
+};
+
+enum 
+{
+    SQ_TEX_UNNORMALIZED                      = 0x00000000,
+    SQ_TEX_NORMALIZED                        = 0x00000001,
+};
+
+enum 
+{
+    SQ_CF_PIXEL_MRT0                         = 0x00000000,
+    SQ_CF_PIXEL_MRT1                         = 0x00000001,
+    SQ_CF_PIXEL_MRT2                         = 0x00000002,
+    SQ_CF_PIXEL_MRT3                         = 0x00000003,
+    SQ_CF_PIXEL_MRT4                         = 0x00000004,
+    SQ_CF_PIXEL_MRT5                         = 0x00000005,
+    SQ_CF_PIXEL_MRT6                         = 0x00000006,
+    SQ_CF_PIXEL_MRT7                         = 0x00000007,
+    SQ_CF_PIXEL_Z                            = 0x0000003d,
+};
+
+typedef enum ENUM_SQ_CF_ARRAY_BASE_POS {
+SQ_CF_POS_0                              = 0x0000003c,
+SQ_CF_POS_1                              = 0x0000003d,
+SQ_CF_POS_2                              = 0x0000003e,
+SQ_CF_POS_3                              = 0x0000003f,
+} ENUM_SQ_CF_ARRAY_BASE_POS;
+
+enum
+{
+    PGM_RESOURCES__PRIME_CACHE_ON_DRAW_bit = 23,
+};
+
+enum 
+{
+    TEX_XYFilter_Point                       = 0x00000000,
+    TEX_XYFilter_Linear                      = 0x00000001,
+    TEX_XYFilter_Cubic                       = 0x00000002,
+    TEX_XYFilter_Cleartype                   = 0x00000003,
+
+    TEX_MipFilter_None                       = 0x00000000,
+    TEX_MipFilter_Point                      = 0x00000001,
+    TEX_MipFilter_Linear                     = 0x00000002,
+};
+
+enum 
+{
+    SQ_EXPORT_WRITE                          = 0x00000000,
+    SQ_EXPORT_WRITE_IND                      = 0x00000001,
+    SQ_EXPORT_WRITE_ACK                      = 0x00000002,
+    SQ_EXPORT_WRITE_IND_ACK                  = 0x00000003,
+};
+
+/* --------------------------------- */
+
+enum
+{
+    R700_PM4_PACKET0_NOP = 0x00000000,
+    R700_PM4_PACKET1_NOP = 0x40000000,
+    R700_PM4_PACKET2_NOP = 0x80000000,
+    R700_PM4_PACKET3_NOP = 0xC0000000,
+};
+
+#define  PM4_OPCODE_SET_INDEX_TYPE      (R700_PM4_PACKET3_NOP | (IT_INDEX_TYPE << 8))
+
+#define  PM4_OPCODE_DRAW_INDEX_AUTO     (R700_PM4_PACKET3_NOP | (IT_DRAW_INDEX_AUTO << 8))
+#define  PM4_OPCODE_DRAW_INDEX_IMMD     (R700_PM4_PACKET3_NOP | (IT_DRAW_INDEX_IMMD << 8))
+#define  PM4_OPCODE_WAIT_REG_MEM        (R700_PM4_PACKET3_NOP | (IT_WAIT_REG_MEM << 8))
+#define  PM4_OPCODE_SET_CONTEXT_REG     (R700_PM4_PACKET3_NOP | (IT_SET_CONTEXT_REG << 8))
+#define  PM4_OPCODE_SET_CONFIG_REG      (R700_PM4_PACKET3_NOP | (IT_SET_CONFIG_REG << 8))
+#define  PM4_OPCODE_SET_ALU_CONST       (R700_PM4_PACKET3_NOP | (IT_SET_ALU_CONST << 8))
+#define  PM4_OPCODE_SET_RESOURCE        (R700_PM4_PACKET3_NOP | (IT_SET_RESOURCE << 8))
+#define  PM4_OPCODE_SET_SAMPLER         (R700_PM4_PACKET3_NOP | (IT_SET_SAMPLER << 8))
+#define  PM4_OPCODE_CONTEXT_CONTROL     (R700_PM4_PACKET3_NOP | (IT_CONTEXT_CONTROL << 8))
+
+union UINT_FLOAT 
+{
+    unsigned int u32All;
+    float	f32All;
+};
+
+#if 0
+typedef struct _TEXTURE_STATE_STRUCT
+{
+    union UINT_FLOAT     SQ_TEX_RESOURCE0;
+    union UINT_FLOAT     SQ_TEX_RESOURCE1;
+    union UINT_FLOAT     SQ_TEX_RESOURCE2;
+    union UINT_FLOAT     SQ_TEX_RESOURCE3;
+    union UINT_FLOAT     SQ_TEX_RESOURCE4;
+    union UINT_FLOAT     SQ_TEX_RESOURCE5;
+    union UINT_FLOAT     SQ_TEX_RESOURCE6;
+    GLboolean                         enabled;
+} TEXTURE_STATE_STRUCT;
+
+typedef struct _SAMPLER_STATE_STRUCT
+{
+    union UINT_FLOAT      SQ_TEX_SAMPLER0;
+    union UINT_FLOAT      SQ_TEX_SAMPLER1;
+    union UINT_FLOAT      SQ_TEX_SAMPLER2;
+    GLboolean                         enabled;
+} SAMPLER_STATE_STRUCT;
+
+typedef struct _R700_TEXTURE_STATES
+{
+    TEXTURE_STATE_STRUCT *textures[R700_TEXTURE_NUMBERUNITS];
+    SAMPLER_STATE_STRUCT *samplers[R700_TEXTURE_NUMBERUNITS];
+} R700_TEXTURE_STATES;
+#endif
+
+typedef struct _RENDER_TARGET_STATE_STRUCT
+{
+	union UINT_FLOAT            	CB_COLOR0_BASE;  /* 0xA010 */
+	union UINT_FLOAT            	CB_COLOR0_SIZE;  /* 0xA018 */
+	union UINT_FLOAT            	CB_COLOR0_VIEW;  /* 0xA020 */
+	union UINT_FLOAT            	CB_COLOR0_INFO;  /* 0xA028 */
+	union UINT_FLOAT            	CB_COLOR0_TILE;  /* 0xA030 */
+	union UINT_FLOAT            	CB_COLOR0_FRAG;  /* 0xA038 */
+	union UINT_FLOAT            	CB_COLOR0_MASK;  /* 0xA040 */
+	union UINT_FLOAT         	CB_BLEND0_CONTROL;  /* 0xA1E0 */
+	GLboolean                         enabled;
+} RENDER_TARGET_STATE_STRUCT;
+
+typedef struct _VIEWPORT_STATE_STRUCT
+{
+	union UINT_FLOAT  	PA_SC_VPORT_SCISSOR_0_TL;  /* 0xA094 */
+	union UINT_FLOAT  	PA_SC_VPORT_SCISSOR_0_BR;  /* 0xA095 */
+	union UINT_FLOAT        PA_SC_VPORT_ZMIN_0;        /* 0xA0B4 */
+	union UINT_FLOAT        PA_SC_VPORT_ZMAX_0;        /* 0xA0B5 */
+	union UINT_FLOAT        PA_CL_VPORT_XSCALE;        /* 0xA10F */
+	union UINT_FLOAT       	PA_CL_VPORT_XOFFSET;       /* 0xA110 */
+	union UINT_FLOAT        PA_CL_VPORT_YSCALE;        /* 0xA111 */
+	union UINT_FLOAT       	PA_CL_VPORT_YOFFSET;       /* 0xA112 */
+	union UINT_FLOAT        PA_CL_VPORT_ZSCALE;        /* 0xA113 */
+	union UINT_FLOAT       	PA_CL_VPORT_ZOFFSET;       /* 0xA114 */
+	GLboolean                         enabled;
+} VIEWPORT_STATE_STRUCT;
+
+typedef struct _UCP_STATE_STRUCT
+{
+	union UINT_FLOAT        PA_CL_UCP_0_X;
+	union UINT_FLOAT        PA_CL_UCP_0_Y;
+	union UINT_FLOAT        PA_CL_UCP_0_Z;
+	union UINT_FLOAT        PA_CL_UCP_0_W;
+	GLboolean                         enabled;
+} UCP_STATE_STRUCT;
+
+typedef struct _PS_STATE_STRUCT
+{
+	union UINT_FLOAT           	SQ_PGM_START_PS           ;  /* 0xA210 */
+	union UINT_FLOAT       	        SQ_PGM_RESOURCES_PS       ;  /* 0xA214 */
+	union UINT_FLOAT         	SQ_PGM_EXPORTS_PS         ;  /* 0xA215 */
+	union UINT_FLOAT       	        SQ_PGM_CF_OFFSET_PS       ;  /* 0xA233 */
+} PS_STATE_STRUCT;
+
+typedef struct _VS_STATE_STRUCT
+{
+ 	union UINT_FLOAT           	SQ_PGM_START_VS           ;  /* 0xA216 */
+	union UINT_FLOAT  		SQ_PGM_RESOURCES_VS       ;  /* 0xA21A */
+	union UINT_FLOAT       	        SQ_PGM_CF_OFFSET_VS       ;  /* 0xA234 */
+} VS_STATE_STRUCT;
+
+typedef struct _GS_STATE_STRUCT
+{
+	union UINT_FLOAT           	SQ_PGM_START_GS           ;  /* 0xA21B */
+	union UINT_FLOAT       	        SQ_PGM_RESOURCES_GS       ;  /* 0xA21F */
+	union UINT_FLOAT       	        SQ_PGM_CF_OFFSET_GS       ;  /* 0xA235 */
+} GS_STATE_STRUCT;
+
+typedef struct _ES_STATE_STRUCT
+{
+	union UINT_FLOAT           	SQ_PGM_START_ES           ;  /* 0xA220 */
+	union UINT_FLOAT       	        SQ_PGM_RESOURCES_ES       ;  /* 0xA224 */
+	union UINT_FLOAT       	        SQ_PGM_CF_OFFSET_ES       ;  /* 0xA236 */
+} ES_STATE_STRUCT;
+
+typedef struct _FS_STATE_STRUCT
+{
+	union UINT_FLOAT           	SQ_PGM_START_FS           ;  /* 0xA225 */
+	union UINT_FLOAT       	        SQ_PGM_RESOURCES_FS       ;  /* 0xA229 */
+	union UINT_FLOAT       	        SQ_PGM_CF_OFFSET_FS       ;  /* 0xA237 */
+} FS_STATE_STRUCT;
+
+typedef struct _SQ_CONFIG_STRUCT
+{
+	union UINT_FLOAT     	        SQ_CONFIG                 ;  /* 0x2300 */
+	union UINT_FLOAT     	        SQ_GPR_RESOURCE_MGMT_1    ;  /* 0x2301 */
+	union UINT_FLOAT     	        SQ_GPR_RESOURCE_MGMT_2    ;  /* 0x2302 */
+	union UINT_FLOAT     	        SQ_THREAD_RESOURCE_MGMT   ;  /* 0x2303 */
+	union UINT_FLOAT     	        SQ_STACK_RESOURCE_MGMT_1  ;  /* 0x2304 */
+	union UINT_FLOAT     	        SQ_STACK_RESOURCE_MGMT_2  ;  /* 0x2305 */
+} SQ_CONFIG_STRUCT;
+
+typedef struct ContextState
+{
+    unsigned int * puiValue;
+    unsigned int   unOffset;
+    struct ContextState * pNext;
+} ContextState;
+
+typedef struct _R700_CHIP_CONTEXT
+{
+	// misc
+	union UINT_FLOAT             	TA_CNTL_AUX               ;  /* 0x2542 */
+	union UINT_FLOAT             	VC_ENHANCE                ;  /* 0x25C5 */
+	union UINT_FLOAT             	SQ_DYN_GPR_CNTL_PS_FLUSH_REQ;  /* 0x2363 */
+	union UINT_FLOAT             	DB_DEBUG                  ;  /* 0x260C */
+	union UINT_FLOAT             	DB_WATERMARKS             ;  /* 0x260E */
+
+	// DB
+	union UINT_FLOAT             	DB_DEPTH_SIZE             ;  /* 0xA000 */
+	union UINT_FLOAT             	DB_DEPTH_VIEW             ;  /* 0xA001 */
+	union UINT_FLOAT             	DB_DEPTH_BASE             ;  /* 0xA003 */
+	union UINT_FLOAT             	DB_DEPTH_INFO             ;  /* 0xA004 */
+	union UINT_FLOAT                DB_HTILE_DATA_BASE        ;  /* 0xA005 */
+	union UINT_FLOAT          	DB_STENCIL_CLEAR          ;  /* 0xA00A */
+	union UINT_FLOAT            	DB_DEPTH_CLEAR            ;  /* 0xA00B */
+	union UINT_FLOAT            	DB_STENCILREFMASK         ;  /* 0xA10C */
+	union UINT_FLOAT            	DB_STENCILREFMASK_BF      ;  /* 0xA10D */
+	union UINT_FLOAT         	DB_RENDER_CONTROL         ;  /* 0xA343 */
+	union UINT_FLOAT        	DB_RENDER_OVERRIDE        ;  /* 0xA344 */
+	union UINT_FLOAT          	DB_HTILE_SURFACE          ;  /* 0xA349 */
+	union UINT_FLOAT          	DB_ALPHA_TO_MASK          ;  /* 0xA351 */
+	union UINT_FLOAT          	DB_DEPTH_CONTROL          ;  /* 0xA200 */
+	union UINT_FLOAT         	DB_SHADER_CONTROL         ;  /* 0xA203 */
+
+	// SC
+	union UINT_FLOAT   	        PA_SC_SCREEN_SCISSOR_TL   ;  /* 0xA00C */
+	union UINT_FLOAT   	        PA_SC_SCREEN_SCISSOR_BR   ;  /* 0xA00D */
+	union UINT_FLOAT       	        PA_SC_WINDOW_OFFSET       ;  /* 0xA080 */
+	union UINT_FLOAT   	        PA_SC_WINDOW_SCISSOR_TL   ;  /* 0xA081 */
+	union UINT_FLOAT   	        PA_SC_WINDOW_SCISSOR_BR   ;  /* 0xA082 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_RULE       ;  /* 0xA083 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_0_TL       ;  /* 0xA084 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_0_BR       ;  /* 0xA085 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_1_TL       ;  /* 0xA086 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_1_BR       ;  /* 0xA087 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_2_TL       ;  /* 0xA088 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_2_BR       ;  /* 0xA089 */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_3_TL       ;  /* 0xA08A */
+	union UINT_FLOAT       	        PA_SC_CLIPRECT_3_BR       ;  /* 0xA08B */
+	union UINT_FLOAT            	PA_SC_EDGERULE            ;  /* 0xA08C */
+	union UINT_FLOAT  	        PA_SC_GENERIC_SCISSOR_TL  ;  /* 0xA090 */
+	union UINT_FLOAT  	        PA_SC_GENERIC_SCISSOR_BR  ;  /* 0xA091 */
+	union UINT_FLOAT        	PA_SC_LINE_STIPPLE        ;  /* 0xA283 */
+	union UINT_FLOAT           	PA_SC_LINE_CNTL           ;  /* 0xA300 */
+	union UINT_FLOAT           	PA_SC_AA_CONFIG           ;  /* 0xA301 */
+	union UINT_FLOAT       	        PA_SC_MPASS_PS_CNTL       ;  /* 0xA292 */
+	union UINT_FLOAT           	PA_SC_MODE_CNTL           ;  /* 0xA293 */
+	union UINT_FLOAT 	        PA_SC_AA_SAMPLE_LOCS_MCTX ;  /* 0xA307 */
+	union UINT_FLOAT                PA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX; /* 0xA308 */
+	union UINT_FLOAT             	PA_SC_AA_MASK             ;  /* 0xA312 */
+
+	// CL
+	union UINT_FLOAT           	PA_CL_CLIP_CNTL           ;  /* 0xA204 */
+	union UINT_FLOAT            	PA_CL_VTE_CNTL            ;  /* 0xA206 */
+	union UINT_FLOAT         	PA_CL_VS_OUT_CNTL         ;  /* 0xA207 */
+	union UINT_FLOAT         	PA_CL_NANINF_CNTL         ;  /* 0xA208 */
+	union UINT_FLOAT    	        PA_CL_GB_VERT_CLIP_ADJ    ;  /* 0xA303 */
+	union UINT_FLOAT    	        PA_CL_GB_VERT_DISC_ADJ    ;  /* 0xA304 */
+	union UINT_FLOAT    	        PA_CL_GB_HORZ_CLIP_ADJ    ;  /* 0xA305 */
+	union UINT_FLOAT    	        PA_CL_GB_HORZ_DISC_ADJ    ;  /* 0xA306 */
+
+	// SU
+	union UINT_FLOAT        	PA_SU_SC_MODE_CNTL        ;  /* 0xA205 */
+	union UINT_FLOAT          	PA_SU_POINT_SIZE          ;  /* 0xA280 */
+	union UINT_FLOAT        	PA_SU_POINT_MINMAX        ;  /* 0xA281 */
+	union UINT_FLOAT           	PA_SU_LINE_CNTL           ;  /* 0xA282 */
+	union UINT_FLOAT            	PA_SU_VTX_CNTL            ;  /* 0xA302 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_DB_FMT_CNTL;   /* 0xA37E */
+	union UINT_FLOAT   	        PA_SU_POLY_OFFSET_CLAMP   ;      /* 0xA37F */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_FRONT_SCALE;   /* 0xA380 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_FRONT_OFFSET; /* 0xA381 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_BACK_SCALE;    /* 0xA382 */
+	union UINT_FLOAT                PA_SU_POLY_OFFSET_BACK_OFFSET;   /* 0xA383 */
+
+	VIEWPORT_STATE_STRUCT           viewport[R700_MAX_VIEWPORTS];
+	UCP_STATE_STRUCT                ucp[R700_MAX_UCP];
+
+	// CB
+	union UINT_FLOAT              	CB_CLEAR_RED_R6XX         ;  /* 0xA048 */
+	union UINT_FLOAT            	CB_CLEAR_GREEN_R6XX       ;  /* 0xA049 */
+	union UINT_FLOAT             	CB_CLEAR_BLUE_R6XX        ;  /* 0xA04A */
+	union UINT_FLOAT            	CB_CLEAR_ALPHA_R6XX       ;  /* 0xA04B */
+	union UINT_FLOAT            	CB_TARGET_MASK            ;  /* 0xA08E */
+	union UINT_FLOAT            	CB_SHADER_MASK            ;  /* 0xA08F */
+	union UINT_FLOAT              	CB_BLEND_RED              ;  /* 0xA105 */
+	union UINT_FLOAT            	CB_BLEND_GREEN            ;  /* 0xA106 */
+	union UINT_FLOAT             	CB_BLEND_BLUE             ;  /* 0xA107 */
+	union UINT_FLOAT            	CB_BLEND_ALPHA            ;  /* 0xA108 */
+	union UINT_FLOAT              	CB_FOG_RED_R6XX           ;  /* 0xA109 */
+	union UINT_FLOAT            	CB_FOG_GREEN_R6XX         ;  /* 0xA10A */
+	union UINT_FLOAT             	CB_FOG_BLUE_R6XX          ;  /* 0xA10B */
+	union UINT_FLOAT         	CB_SHADER_CONTROL         ;  /* 0xA1E8 */
+	union UINT_FLOAT          	CB_COLOR_CONTROL          ;  /* 0xA202 */
+	union UINT_FLOAT         	CB_CLRCMP_CONTROL         ;  /* 0xA30C */
+	union UINT_FLOAT             	CB_CLRCMP_SRC             ;  /* 0xA30D */
+	union UINT_FLOAT             	CB_CLRCMP_DST             ;  /* 0xA30E */
+	union UINT_FLOAT             	CB_CLRCMP_MSK             ;  /* 0xA30F */
+	union UINT_FLOAT             	CB_BLEND_CONTROL          ;  /* 0xABD0 */
+	RENDER_TARGET_STATE_STRUCT      render_target[R700_MAX_RENDER_TARGETS];
+
+	// SX
+	union UINT_FLOAT                SX_MISC                   ;  /* 0xA0D4 */
+	union UINT_FLOAT     	        SX_ALPHA_TEST_CONTROL     ;  /* 0xA104 */
+
+	// VGT
+	union UINT_FLOAT          	VGT_MAX_VTX_INDX          ;  /* 0xA100 */
+	union UINT_FLOAT          	VGT_MIN_VTX_INDX          ;  /* 0xA101 */
+	union UINT_FLOAT           	VGT_INDX_OFFSET           ;  /* 0xA102 */
+	union UINT_FLOAT                VGT_MULTI_PRIM_IB_RESET_INDX;  /* 0xA103 */
+	union UINT_FLOAT      	        VGT_OUTPUT_PATH_CNTL      ;  /* 0xA284 */
+	union UINT_FLOAT      	        VGT_HOS_CNTL              ;  /* 0xA285 */
+	union UINT_FLOAT      	        VGT_HOS_MAX_TESS_LEVEL    ;  /* 0xA286 */
+	union UINT_FLOAT      	        VGT_HOS_MIN_TESS_LEVEL    ;  /* 0xA287 */
+	union UINT_FLOAT      	        VGT_HOS_REUSE_DEPTH       ;  /* 0xA288 */
+	union UINT_FLOAT      	        VGT_GROUP_PRIM_TYPE       ;  /* 0xA289 */
+	union UINT_FLOAT      	        VGT_GROUP_FIRST_DECR      ;  /* 0xA28A */
+	union UINT_FLOAT      	        VGT_GROUP_DECR            ;  /* 0xA28B */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_0_CNTL     ;  /* 0xA28C */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_1_CNTL     ;  /* 0xA28D */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_0_FMT_CNTL ;  /* 0xA28E */
+	union UINT_FLOAT      	        VGT_GROUP_VECT_1_FMT_CNTL ;  /* 0xA28F */
+	union UINT_FLOAT               	VGT_GS_MODE               ;  /* 0xA290 */
+	union UINT_FLOAT        	VGT_PRIMITIVEID_EN        ;  /* 0xA2A1 */
+	union UINT_FLOAT     	        VGT_DMA_NUM_INSTANCES     ;  /* 0xA2A2 */
+	union UINT_FLOAT	        VGT_MULTI_PRIM_IB_RESET_EN;  /* 0xA2A5 */
+	union UINT_FLOAT  	        VGT_INSTANCE_STEP_RATE_0  ;  /* 0xA2A8 */
+	union UINT_FLOAT  	        VGT_INSTANCE_STEP_RATE_1  ;  /* 0xA2A9 */
+	union UINT_FLOAT            	VGT_STRMOUT_EN            ;  /* 0xA2AC */
+	union UINT_FLOAT             	VGT_REUSE_OFF             ;  /* 0xA2AD */
+	union UINT_FLOAT             	VGT_VTX_CNT_EN            ;  /* 0xA2AE */
+	union UINT_FLOAT            	VGT_STRMOUT_BUFFER_EN     ;  /* 0xA2C8 */
+
+	// SPI
+	union UINT_FLOAT           	SPI_VS_OUT_ID_0           ;  /* 0xA185 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_1           ;  /* 0xA186 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_2           ;  /* 0xA187 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_3           ;  /* 0xA188 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_4           ;  /* 0xA189 */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_5           ;  /* 0xA18A */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_6           ;  /* 0xA18B */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_7           ;  /* 0xA18C */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_8           ;  /* 0xA18D */
+	union UINT_FLOAT           	SPI_VS_OUT_ID_9           ;  /* 0xA18E */
+	union UINT_FLOAT                SPI_VS_OUT_CONFIG         ;  /* 0xA1B1 */
+	union UINT_FLOAT       	        SPI_THREAD_GROUPING       ;  /* 0xA1B2 */
+	union UINT_FLOAT       	        SPI_PS_IN_CONTROL_0       ;  /* 0xA1B3 */
+	union UINT_FLOAT       	        SPI_PS_IN_CONTROL_1       ;  /* 0xA1B4 */
+	union UINT_FLOAT       	        SPI_INTERP_CONTROL_0      ;  /* 0xA1B5 */
+ 	union UINT_FLOAT               	SPI_INPUT_Z               ;  /* 0xA1B6 */
+	union UINT_FLOAT              	SPI_FOG_CNTL              ;  /* 0xA1B7 */
+	union UINT_FLOAT              	SPI_FOG_FUNC_SCALE        ;  /* 0xA1B8 */
+	union UINT_FLOAT              	SPI_FOG_FUNC_BIAS         ;  /* 0xA1B9 */
+
+    union UINT_FLOAT         	SQ_VTX_SEMANTIC_0         ;  /* 0xA0E0 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_1         ;  /* 0xA0E1 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_2         ;  /* 0xA0E2 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_3         ;  /* 0xA0E3 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_4         ;  /* 0xA0E4 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_5         ;  /* 0xA0E5 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_6         ;  /* 0xA0E6 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_7         ;  /* 0xA0E7 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_8         ;  /* 0xA0E8 */
+	union UINT_FLOAT         	SQ_VTX_SEMANTIC_9         ;  /* 0xA0E9 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_10        ;  /* 0xA0EA */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_11        ;  /* 0xA0EB */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_12        ;  /* 0xA0EC */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_13        ;  /* 0xA0ED */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_14        ;  /* 0xA0EE */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_15        ;  /* 0xA0EF */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_16        ;  /* 0xA0F0 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_17        ;  /* 0xA0F1 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_18        ;  /* 0xA0F2 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_19        ;  /* 0xA0F3 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_20        ;  /* 0xA0F4 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_21        ;  /* 0xA0F5 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_22        ;  /* 0xA0F6 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_23        ;  /* 0xA0F7 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_24        ;  /* 0xA0F8 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_25        ;  /* 0xA0F9 */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_26        ;  /* 0xA0FA */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_27        ;  /* 0xA0FB */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_28        ;  /* 0xA0FC */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_29        ;  /* 0xA0FD */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_30        ;  /* 0xA0FE */
+	union UINT_FLOAT        	SQ_VTX_SEMANTIC_31        ;  /* 0xA0FF */
+
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_0       ;  /* 0xA191 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_1       ;  /* 0xA192 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_2       ;  /* 0xA193 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_3       ;  /* 0xA194 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_4       ;  /* 0xA195 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_5       ;  /* 0xA196 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_6       ;  /* 0xA197 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_7       ;  /* 0xA198 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_8       ;  /* 0xA199 */
+	union UINT_FLOAT       	SPI_PS_INPUT_CNTL_9       ;  /* 0xA19A */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_10      ;  /* 0xA19B */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_11      ;  /* 0xA19C */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_12      ;  /* 0xA19D */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_13      ;  /* 0xA19E */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_14      ;  /* 0xA19F */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_15      ;  /* 0xA1A0 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_16      ;  /* 0xA1A1 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_17      ;  /* 0xA1A2 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_18      ;  /* 0xA1A3 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_19      ;  /* 0xA1A4 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_20      ;  /* 0xA1A5 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_21      ;  /* 0xA1A6 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_22      ;  /* 0xA1A7 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_23      ;  /* 0xA1A8 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_24      ;  /* 0xA1A9 */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_25      ;  /* 0xA1AA */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_26      ;  /* 0xA1AB */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_27      ;  /* 0xA1AC */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_28      ;  /* 0xA1AD */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_29      ;  /* 0xA1AE */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_30      ;  /* 0xA1AF */
+	union UINT_FLOAT      	SPI_PS_INPUT_CNTL_31      ;  /* 0xA1B0 */
+
+	// shaders
+	PS_STATE_STRUCT                 ps;
+	VS_STATE_STRUCT                 vs;
+	GS_STATE_STRUCT                 gs;
+	ES_STATE_STRUCT                 es;
+	FS_STATE_STRUCT                 fs;
+
+	// SQ CONFIG
+	SQ_CONFIG_STRUCT                sq_config;
+
+	// SQ
+	union UINT_FLOAT     	        SQ_ESGS_RING_ITEMSIZE     ;  /* 0xA22A */
+	union UINT_FLOAT     	        SQ_GSVS_RING_ITEMSIZE     ;  /* 0xA22B */
+	union UINT_FLOAT    	        SQ_ESTMP_RING_ITEMSIZE    ;  /* 0xA22C */
+	union UINT_FLOAT    	        SQ_GSTMP_RING_ITEMSIZE    ;  /* 0xA22D */
+	union UINT_FLOAT    	        SQ_VSTMP_RING_ITEMSIZE    ;  /* 0xA22E */
+	union UINT_FLOAT    	        SQ_PSTMP_RING_ITEMSIZE    ;  /* 0xA22F */
+	union UINT_FLOAT     	        SQ_FBUF_RING_ITEMSIZE     ;  /* 0xA230 */
+	union UINT_FLOAT    	        SQ_REDUC_RING_ITEMSIZE    ;  /* 0xA231 */
+	union UINT_FLOAT       	        SQ_GS_VERT_ITEMSIZE       ;  /* 0xA232 */
+
+	ContextState*                   pStateList;
+
+	radeonTexObj*                   textures[R700_TEXTURE_NUMBERUNITS];
+
+	GLboolean                       bEnablePerspective;
+
+} R700_CHIP_CONTEXT;
+
+#endif /* _R700_CHIP_H_ */
+
diff --git a/src/mesa/drivers/dri/r600/r700_chipoffset.h b/src/mesa/drivers/dri/r600/r700_chipoffset.h
new file mode 100644
index 00000000000..4d73fb99a7d
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_chipoffset.h
@@ -0,0 +1,693 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_CHIPOFFSET_H_
+#define _R700_CHIPOFFSET_H_
+
+#define mmWAIT_UNTIL                                    0x2010
+#define mmSCRATCH_REG0                                  0x2140
+#define mmGUI_SCRATCH_REG0                              0x2140
+#define mmSCRATCH_REG1                                  0x2141
+#define mmGUI_SCRATCH_REG1                              0x2141
+#define mmSCRATCH_REG2                                  0x2142
+#define mmGUI_SCRATCH_REG2                              0x2142
+#define mmSCRATCH_REG3                                  0x2143
+#define mmGUI_SCRATCH_REG3                              0x2143
+#define mmSCRATCH_REG4                                  0x2144
+#define mmGUI_SCRATCH_REG4                              0x2144
+#define mmSCRATCH_REG5                                  0x2145
+#define mmGUI_SCRATCH_REG5                              0x2145
+#define mmSCRATCH_REG6                                  0x2146
+#define mmGUI_SCRATCH_REG6                              0x2146
+#define mmSCRATCH_REG7                                  0x2147
+#define mmGUI_SCRATCH_REG7                              0x2147
+
+#define mmCP_COHER_CNTL                                 0x217C
+#define mmCP_COHER_SIZE                                 0x217D
+#define mmCP_COHER_BASE                                 0x217E
+#define mmCP_COHER_STATUS                               0x217F
+
+#define mmTA_CNTL_AUX                                   0x2542
+#define mmVC_ENHANCE                                    0x25C5
+#define mmSQ_DYN_GPR_CNTL_PS_FLUSH_REQ                  0x2363
+#define mmDB_DEBUG                                      0x260C
+#define mmDB_WATERMARKS                                 0x260E
+
+#define mmPA_CL_VPORT_XSCALE                            0xA10F
+#define mmPA_CL_VPORT_XOFFSET                           0xA110
+#define mmPA_CL_VPORT_YSCALE                            0xA111
+#define mmPA_CL_VPORT_YOFFSET                           0xA112
+#define mmPA_CL_VPORT_ZSCALE                            0xA113
+#define mmPA_CL_VPORT_ZOFFSET                           0xA114
+#define mmPA_CL_VPORT_XSCALE_1                          0xA115
+#define mmPA_CL_VPORT_XSCALE_2                          0xA11B
+#define mmPA_CL_VPORT_XSCALE_3                          0xA121
+#define mmPA_CL_VPORT_XSCALE_4                          0xA127
+#define mmPA_CL_VPORT_XSCALE_5                          0xA12D
+#define mmPA_CL_VPORT_XSCALE_6                          0xA133
+#define mmPA_CL_VPORT_XSCALE_7                          0xA139
+#define mmPA_CL_VPORT_XSCALE_8                          0xA13F
+#define mmPA_CL_VPORT_XSCALE_9                          0xA145
+#define mmPA_CL_VPORT_XSCALE_10                         0xA14B
+#define mmPA_CL_VPORT_XSCALE_11                         0xA151
+#define mmPA_CL_VPORT_XSCALE_12                         0xA157
+#define mmPA_CL_VPORT_XSCALE_13                         0xA15D
+#define mmPA_CL_VPORT_XSCALE_14                         0xA163
+#define mmPA_CL_VPORT_XSCALE_15                         0xA169
+#define mmPA_CL_VPORT_XOFFSET_1                         0xA116
+#define mmPA_CL_VPORT_XOFFSET_2                         0xA11C
+#define mmPA_CL_VPORT_XOFFSET_3                         0xA122
+#define mmPA_CL_VPORT_XOFFSET_4                         0xA128
+#define mmPA_CL_VPORT_XOFFSET_5                         0xA12E
+#define mmPA_CL_VPORT_XOFFSET_6                         0xA134
+#define mmPA_CL_VPORT_XOFFSET_7                         0xA13A
+#define mmPA_CL_VPORT_XOFFSET_8                         0xA140
+#define mmPA_CL_VPORT_XOFFSET_9                         0xA146
+#define mmPA_CL_VPORT_XOFFSET_10                        0xA14C
+#define mmPA_CL_VPORT_XOFFSET_11                        0xA152
+#define mmPA_CL_VPORT_XOFFSET_12                        0xA158
+#define mmPA_CL_VPORT_XOFFSET_13                        0xA15E
+#define mmPA_CL_VPORT_XOFFSET_14                        0xA164
+#define mmPA_CL_VPORT_XOFFSET_15                        0xA16A
+#define mmPA_CL_VPORT_YSCALE_1                          0xA117
+#define mmPA_CL_VPORT_YSCALE_2                          0xA11D
+#define mmPA_CL_VPORT_YSCALE_3                          0xA123
+#define mmPA_CL_VPORT_YSCALE_4                          0xA129
+#define mmPA_CL_VPORT_YSCALE_5                          0xA12F
+#define mmPA_CL_VPORT_YSCALE_6                          0xA135
+#define mmPA_CL_VPORT_YSCALE_7                          0xA13B
+#define mmPA_CL_VPORT_YSCALE_8                          0xA141
+#define mmPA_CL_VPORT_YSCALE_9                          0xA147
+#define mmPA_CL_VPORT_YSCALE_10                         0xA14D
+#define mmPA_CL_VPORT_YSCALE_11                         0xA153
+#define mmPA_CL_VPORT_YSCALE_12                         0xA159
+#define mmPA_CL_VPORT_YSCALE_13                         0xA15F
+#define mmPA_CL_VPORT_YSCALE_14                         0xA165
+#define mmPA_CL_VPORT_YSCALE_15                         0xA16B
+#define mmPA_CL_VPORT_YOFFSET_1                         0xA118
+#define mmPA_CL_VPORT_YOFFSET_2                         0xA11E
+#define mmPA_CL_VPORT_YOFFSET_3                         0xA124
+#define mmPA_CL_VPORT_YOFFSET_4                         0xA12A
+#define mmPA_CL_VPORT_YOFFSET_5                         0xA130
+#define mmPA_CL_VPORT_YOFFSET_6                         0xA136
+#define mmPA_CL_VPORT_YOFFSET_7                         0xA13C
+#define mmPA_CL_VPORT_YOFFSET_8                         0xA142
+#define mmPA_CL_VPORT_YOFFSET_9                         0xA148
+#define mmPA_CL_VPORT_YOFFSET_10                        0xA14E
+#define mmPA_CL_VPORT_YOFFSET_11                        0xA154
+#define mmPA_CL_VPORT_YOFFSET_12                        0xA15A
+#define mmPA_CL_VPORT_YOFFSET_13                        0xA160
+#define mmPA_CL_VPORT_YOFFSET_14                        0xA166
+#define mmPA_CL_VPORT_YOFFSET_15                        0xA16C
+#define mmPA_CL_VPORT_ZSCALE_1                          0xA119
+#define mmPA_CL_VPORT_ZSCALE_2                          0xA11F
+#define mmPA_CL_VPORT_ZSCALE_3                          0xA125
+#define mmPA_CL_VPORT_ZSCALE_4                          0xA12B
+#define mmPA_CL_VPORT_ZSCALE_5                          0xA131
+#define mmPA_CL_VPORT_ZSCALE_6                          0xA137
+#define mmPA_CL_VPORT_ZSCALE_7                          0xA13D
+#define mmPA_CL_VPORT_ZSCALE_8                          0xA143
+#define mmPA_CL_VPORT_ZSCALE_9                          0xA149
+#define mmPA_CL_VPORT_ZSCALE_10                         0xA14F
+#define mmPA_CL_VPORT_ZSCALE_11                         0xA155
+#define mmPA_CL_VPORT_ZSCALE_12                         0xA15B
+#define mmPA_CL_VPORT_ZSCALE_13                         0xA161
+#define mmPA_CL_VPORT_ZSCALE_14                         0xA167
+#define mmPA_CL_VPORT_ZSCALE_15                         0xA16D
+#define mmPA_CL_VPORT_ZOFFSET_1                         0xA11A
+#define mmPA_CL_VPORT_ZOFFSET_2                         0xA120
+#define mmPA_CL_VPORT_ZOFFSET_3                         0xA126
+#define mmPA_CL_VPORT_ZOFFSET_4                         0xA12C
+#define mmPA_CL_VPORT_ZOFFSET_5                         0xA132
+#define mmPA_CL_VPORT_ZOFFSET_6                         0xA138
+#define mmPA_CL_VPORT_ZOFFSET_7                         0xA13E
+#define mmPA_CL_VPORT_ZOFFSET_8                         0xA144
+#define mmPA_CL_VPORT_ZOFFSET_9                         0xA14A
+#define mmPA_CL_VPORT_ZOFFSET_10                        0xA150
+#define mmPA_CL_VPORT_ZOFFSET_11                        0xA156
+#define mmPA_CL_VPORT_ZOFFSET_12                        0xA15C
+#define mmPA_CL_VPORT_ZOFFSET_13                        0xA162
+#define mmPA_CL_VPORT_ZOFFSET_14                        0xA168
+#define mmPA_CL_VPORT_ZOFFSET_15                        0xA16E
+#define mmPA_CL_VTE_CNTL                                0xA206
+#define mmPA_CL_VS_OUT_CNTL                             0xA207
+#define mmPA_CL_NANINF_CNTL                             0xA208
+#define mmPA_CL_CLIP_CNTL                               0xA204
+#define mmPA_CL_GB_VERT_CLIP_ADJ                        0xA303
+#define mmPA_CL_GB_VERT_DISC_ADJ                        0xA304
+#define mmPA_CL_GB_HORZ_CLIP_ADJ                        0xA305
+#define mmPA_CL_GB_HORZ_DISC_ADJ                        0xA306
+#define mmPA_CL_UCP_0_X                                 0xA388
+#define mmPA_CL_UCP_0_Y                                 0xA389
+#define mmPA_CL_UCP_0_Z                                 0xA38A
+#define mmPA_CL_UCP_0_W                                 0xA38B
+#define mmPA_CL_UCP_1_X                                 0xA38C
+#define mmPA_CL_UCP_1_Y                                 0xA38D
+#define mmPA_CL_UCP_1_Z                                 0xA38E
+#define mmPA_CL_UCP_1_W                                 0xA38F
+#define mmPA_CL_UCP_2_X                                 0xA390
+#define mmPA_CL_UCP_2_Y                                 0xA391
+#define mmPA_CL_UCP_2_Z                                 0xA392
+#define mmPA_CL_UCP_2_W                                 0xA393
+#define mmPA_CL_UCP_3_X                                 0xA394
+#define mmPA_CL_UCP_3_Y                                 0xA395
+#define mmPA_CL_UCP_3_Z                                 0xA396
+#define mmPA_CL_UCP_3_W                                 0xA397
+#define mmPA_CL_UCP_4_X                                 0xA398
+#define mmPA_CL_UCP_4_Y                                 0xA399
+#define mmPA_CL_UCP_4_Z                                 0xA39A
+#define mmPA_CL_UCP_4_W                                 0xA39B
+#define mmPA_CL_UCP_5_X                                 0xA39C
+#define mmPA_CL_UCP_5_Y                                 0xA39D
+#define mmPA_CL_UCP_5_Z                                 0xA39E
+#define mmPA_CL_UCP_5_W                                 0xA39F
+#define mmPA_CL_POINT_X_RAD                             0xA384
+#define mmPA_CL_POINT_Y_RAD                             0xA385
+#define mmPA_CL_POINT_SIZE                              0xA386
+#define mmPA_CL_POINT_CULL_RAD                          0xA387
+
+#define mmPA_SU_VTX_CNTL                                0xA302
+#define mmPA_SU_POINT_SIZE                              0xA280
+#define mmPA_SU_POINT_MINMAX                            0xA281
+#define mmPA_SU_LINE_CNTL                               0xA282
+#define mmPA_SU_SC_MODE_CNTL                            0xA205
+#define mmPA_SU_POLY_OFFSET_DB_FMT_CNTL                 0xA37E
+#define mmPA_SU_POLY_OFFSET_CLAMP                       0xA37F
+#define mmPA_SU_POLY_OFFSET_FRONT_SCALE                 0xA380
+#define mmPA_SU_POLY_OFFSET_FRONT_OFFSET                0xA381
+#define mmPA_SU_POLY_OFFSET_BACK_SCALE                  0xA382
+#define mmPA_SU_POLY_OFFSET_BACK_OFFSET                 0xA383
+
+#define mmPA_SC_WINDOW_OFFSET                           0xA080
+#define mmPA_SC_AA_CONFIG                               0xA301
+#define mmPA_SC_AA_MASK                                 0xA312
+#define mmPA_SC_AA_SAMPLE_LOCS_MCTX                     0xA307
+#define mmPA_SC_AA_SAMPLE_LOCS_8S_WD1_MCTX              0xA308
+#define mmPA_SC_LINE_STIPPLE                            0xA283
+#define mmPA_SC_LINE_CNTL                               0xA300
+#define mmPA_SC_SCREEN_SCISSOR_TL                       0xA00C
+#define mmPA_SC_SCREEN_SCISSOR_BR                       0xA00D
+#define mmPA_SC_WINDOW_SCISSOR_TL                       0xA081
+#define mmPA_SC_WINDOW_SCISSOR_BR                       0xA082
+#define mmPA_SC_CLIPRECT_RULE                           0xA083
+#define mmPA_SC_CLIPRECT_0_TL                           0xA084
+#define mmPA_SC_CLIPRECT_0_BR                           0xA085
+#define mmPA_SC_CLIPRECT_1_TL                           0xA086
+#define mmPA_SC_CLIPRECT_1_BR                           0xA087
+#define mmPA_SC_CLIPRECT_2_TL                           0xA088
+#define mmPA_SC_CLIPRECT_2_BR                           0xA089
+#define mmPA_SC_CLIPRECT_3_TL                           0xA08A
+#define mmPA_SC_CLIPRECT_3_BR                           0xA08B
+#define mmPA_SC_EDGERULE                                0xA08C
+#define mmPA_SC_GENERIC_SCISSOR_TL                      0xA090
+#define mmPA_SC_GENERIC_SCISSOR_BR                      0xA091
+#define mmPA_SC_VPORT_SCISSOR_0_TL                      0xA094
+#define mmPA_SC_VPORT_SCISSOR_1_TL                      0xA096
+#define mmPA_SC_VPORT_SCISSOR_2_TL                      0xA098
+#define mmPA_SC_VPORT_SCISSOR_3_TL                      0xA09A
+#define mmPA_SC_VPORT_SCISSOR_4_TL                      0xA09C
+#define mmPA_SC_VPORT_SCISSOR_5_TL                      0xA09E
+#define mmPA_SC_VPORT_SCISSOR_6_TL                      0xA0A0
+#define mmPA_SC_VPORT_SCISSOR_7_TL                      0xA0A2
+#define mmPA_SC_VPORT_SCISSOR_8_TL                      0xA0A4
+#define mmPA_SC_VPORT_SCISSOR_9_TL                      0xA0A6
+#define mmPA_SC_VPORT_SCISSOR_10_TL                     0xA0A8
+#define mmPA_SC_VPORT_SCISSOR_11_TL                     0xA0AA
+#define mmPA_SC_VPORT_SCISSOR_12_TL                     0xA0AC
+#define mmPA_SC_VPORT_SCISSOR_13_TL                     0xA0AE
+#define mmPA_SC_VPORT_SCISSOR_14_TL                     0xA0B0
+#define mmPA_SC_VPORT_SCISSOR_15_TL                     0xA0B2
+#define mmPA_SC_VPORT_SCISSOR_0_BR                      0xA095
+#define mmPA_SC_VPORT_SCISSOR_1_BR                      0xA097
+#define mmPA_SC_VPORT_SCISSOR_2_BR                      0xA099
+#define mmPA_SC_VPORT_SCISSOR_3_BR                      0xA09B
+#define mmPA_SC_VPORT_SCISSOR_4_BR                      0xA09D
+#define mmPA_SC_VPORT_SCISSOR_5_BR                      0xA09F
+#define mmPA_SC_VPORT_SCISSOR_6_BR                      0xA0A1
+#define mmPA_SC_VPORT_SCISSOR_7_BR                      0xA0A3
+#define mmPA_SC_VPORT_SCISSOR_8_BR                      0xA0A5
+#define mmPA_SC_VPORT_SCISSOR_9_BR                      0xA0A7
+#define mmPA_SC_VPORT_SCISSOR_10_BR                     0xA0A9
+#define mmPA_SC_VPORT_SCISSOR_11_BR                     0xA0AB
+#define mmPA_SC_VPORT_SCISSOR_12_BR                     0xA0AD
+#define mmPA_SC_VPORT_SCISSOR_13_BR                     0xA0AF
+#define mmPA_SC_VPORT_SCISSOR_14_BR                     0xA0B1
+#define mmPA_SC_VPORT_SCISSOR_15_BR                     0xA0B3
+#define mmPA_SC_VPORT_ZMIN_0                            0xA0B4
+#define mmPA_SC_VPORT_ZMIN_1                            0xA0B6
+#define mmPA_SC_VPORT_ZMIN_2                            0xA0B8
+#define mmPA_SC_VPORT_ZMIN_3                            0xA0BA
+#define mmPA_SC_VPORT_ZMIN_4                            0xA0BC
+#define mmPA_SC_VPORT_ZMIN_5                            0xA0BE
+#define mmPA_SC_VPORT_ZMIN_6                            0xA0C0
+#define mmPA_SC_VPORT_ZMIN_7                            0xA0C2
+#define mmPA_SC_VPORT_ZMIN_8                            0xA0C4
+#define mmPA_SC_VPORT_ZMIN_9                            0xA0C6
+#define mmPA_SC_VPORT_ZMIN_10                           0xA0C8
+#define mmPA_SC_VPORT_ZMIN_11                           0xA0CA
+#define mmPA_SC_VPORT_ZMIN_12                           0xA0CC
+#define mmPA_SC_VPORT_ZMIN_13                           0xA0CE
+#define mmPA_SC_VPORT_ZMIN_14                           0xA0D0
+#define mmPA_SC_VPORT_ZMIN_15                           0xA0D2
+#define mmPA_SC_VPORT_ZMAX_0                            0xA0B5
+#define mmPA_SC_VPORT_ZMAX_1                            0xA0B7
+#define mmPA_SC_VPORT_ZMAX_2                            0xA0B9
+#define mmPA_SC_VPORT_ZMAX_3                            0xA0BB
+#define mmPA_SC_VPORT_ZMAX_4                            0xA0BD
+#define mmPA_SC_VPORT_ZMAX_5                            0xA0BF
+#define mmPA_SC_VPORT_ZMAX_6                            0xA0C1
+#define mmPA_SC_VPORT_ZMAX_7                            0xA0C3
+#define mmPA_SC_VPORT_ZMAX_8                            0xA0C5
+#define mmPA_SC_VPORT_ZMAX_9                            0xA0C7
+#define mmPA_SC_VPORT_ZMAX_10                           0xA0C9
+#define mmPA_SC_VPORT_ZMAX_11                           0xA0CB
+#define mmPA_SC_VPORT_ZMAX_12                           0xA0CD
+#define mmPA_SC_VPORT_ZMAX_13                           0xA0CF
+#define mmPA_SC_VPORT_ZMAX_14                           0xA0D1
+#define mmPA_SC_VPORT_ZMAX_15                           0xA0D3
+#define mmPA_SC_MODE_CNTL                               0xA293
+#define mmPA_SC_MPASS_PS_CNTL                           0xA292
+
+#define mmVGT_DRAW_INITIATOR                            0xA1FC
+#define mmVGT_EVENT_INITIATOR                           0xA2A4
+#define mmVGT_EVENT_ADDRESS_REG                         0xA1FE
+#define mmVGT_DMA_BASE_HI                               0xA1F9
+#define mmVGT_DMA_BASE                                  0xA1FA
+#define mmVGT_DMA_INDEX_TYPE                            0xA29F
+#define mmVGT_DMA_NUM_INSTANCES                         0xA2A2
+#define mmVGT_DMA_SIZE                                  0xA29D
+
+#define mmVGT_IMMED_DATA                                0xA1FD
+#define mmVGT_INDEX_TYPE                                0x2257
+#define mmVGT_NUM_INDICES                               0x225C
+#define mmVGT_NUM_INSTANCES                             0x225D
+#define mmVGT_PRIMITIVE_TYPE                            0x2256
+#define mmVGT_PRIMITIVEID_EN                            0xA2A1
+#define mmVGT_VTX_CNT_EN                                0xA2AE
+#define mmVGT_REUSE_OFF                                 0xA2AD
+#define mmVGT_INSTANCE_STEP_RATE_0                      0xA2A8
+#define mmVGT_INSTANCE_STEP_RATE_1                      0xA2A9
+#define mmVGT_MAX_VTX_INDX                              0xA100
+#define mmVGT_MIN_VTX_INDX                              0xA101
+#define mmVGT_INDX_OFFSET                               0xA102
+#define mmVGT_VERTEX_REUSE_BLOCK_CNTL                   0xA316
+#define mmVGT_OUT_DEALLOC_CNTL                          0xA317
+#define mmVGT_MULTI_PRIM_IB_RESET_INDX                  0xA103
+#define mmVGT_MULTI_PRIM_IB_RESET_EN                    0xA2A5
+#define mmVGT_ENHANCE                                   0xA294
+#define mmVGT_OUTPUT_PATH_CNTL                          0xA284
+#define mmVGT_HOS_CNTL                                  0xA285
+#define mmVGT_HOS_MAX_TESS_LEVEL                        0xA286
+#define mmVGT_HOS_MIN_TESS_LEVEL                        0xA287
+#define mmVGT_HOS_REUSE_DEPTH                           0xA288
+#define mmVGT_GROUP_PRIM_TYPE                           0xA289
+#define mmVGT_GROUP_FIRST_DECR                          0xA28A
+#define mmVGT_GROUP_DECR                                0xA28B
+#define mmVGT_GROUP_VECT_0_CNTL                         0xA28C
+#define mmVGT_GROUP_VECT_1_CNTL                         0xA28D
+#define mmVGT_GROUP_VECT_0_FMT_CNTL                     0xA28E
+#define mmVGT_GROUP_VECT_1_FMT_CNTL                     0xA28F
+#define mmVGT_GS_MODE                                   0xA290
+#define mmVGT_GS_OUT_PRIM_TYPE                          0xA29B
+
+#define mmVGT_STRMOUT_EN                                0xA2AC
+#define mmVGT_STRMOUT_BUFFER_SIZE_0                     0xA2B4
+#define mmVGT_STRMOUT_BUFFER_SIZE_1                     0xA2B8
+#define mmVGT_STRMOUT_BUFFER_SIZE_2                     0xA2BC
+#define mmVGT_STRMOUT_BUFFER_SIZE_3                     0xA2C0
+#define mmVGT_STRMOUT_BUFFER_OFFSET_0                   0xA2B7
+#define mmVGT_STRMOUT_BUFFER_OFFSET_1                   0xA2BB
+#define mmVGT_STRMOUT_BUFFER_OFFSET_2                   0xA2BF
+#define mmVGT_STRMOUT_BUFFER_OFFSET_3                   0xA2C3
+#define mmVGT_STRMOUT_VTX_STRIDE_0                      0xA2B5
+#define mmVGT_STRMOUT_VTX_STRIDE_1                      0xA2B9
+#define mmVGT_STRMOUT_VTX_STRIDE_2                      0xA2BD
+#define mmVGT_STRMOUT_VTX_STRIDE_3                      0xA2C1
+#define mmVGT_STRMOUT_BUFFER_BASE_0                     0xA2B6
+#define mmVGT_STRMOUT_BUFFER_BASE_1                     0xA2BA
+#define mmVGT_STRMOUT_BUFFER_BASE_2                     0xA2BE
+#define mmVGT_STRMOUT_BUFFER_BASE_3                     0xA2C2
+#define mmVGT_STRMOUT_BUFFER_EN                         0xA2C8
+#define mmVGT_STRMOUT_BASE_OFFSET_0                     0xA2C4
+#define mmVGT_STRMOUT_BASE_OFFSET_1                     0xA2C5
+#define mmVGT_STRMOUT_BASE_OFFSET_2                     0xA2C6
+#define mmVGT_STRMOUT_BASE_OFFSET_3                     0xA2C7
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_0                  0xA2D1
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_1                  0xA2D2
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_2                  0xA2D3
+#define mmVGT_STRMOUT_BASE_OFFSET_HI_3                  0xA2D4
+#define mmVGT_STRMOUT_DRAW_OPAQUE_OFFSET                0xA2CA
+#define mmVGT_STRMOUT_DRAW_OPAQUE_BUFFER_FILLED_SIZE    0xA2CB
+#define mmVGT_STRMOUT_DRAW_OPAQUE_VERTEX_STRIDE         0xA2CC
+
+#define mmSQ_PGM_START_PS                               0xA210
+#define mmSQ_PGM_CF_OFFSET_PS                           0xA233
+#define mmSQ_PGM_RESOURCES_PS                           0xA214
+#define mmSQ_PGM_EXPORTS_PS                             0xA215
+#define mmSQ_PGM_START_VS                               0xA216
+#define mmSQ_PGM_CF_OFFSET_VS                           0xA234
+#define mmSQ_PGM_RESOURCES_VS                           0xA21A
+#define mmSQ_PGM_START_GS                               0xA21B
+#define mmSQ_PGM_CF_OFFSET_GS                           0xA235
+#define mmSQ_PGM_RESOURCES_GS                           0xA21F
+#define mmSQ_PGM_START_ES                               0xA220
+#define mmSQ_PGM_CF_OFFSET_ES                           0xA236
+#define mmSQ_PGM_RESOURCES_ES                           0xA224
+#define mmSQ_PGM_START_FS                               0xA225
+#define mmSQ_PGM_CF_OFFSET_FS                           0xA237
+#define mmSQ_PGM_RESOURCES_FS                           0xA229
+#define mmSQ_ESGS_RING_ITEMSIZE                         0xA22A
+#define mmSQ_GSVS_RING_ITEMSIZE                         0xA22B
+#define mmSQ_ESTMP_RING_ITEMSIZE                        0xA22C
+#define mmSQ_GSTMP_RING_ITEMSIZE                        0xA22D
+#define mmSQ_VSTMP_RING_ITEMSIZE                        0xA22E
+#define mmSQ_PSTMP_RING_ITEMSIZE                        0xA22F
+#define mmSQ_FBUF_RING_ITEMSIZE                         0xA230
+#define mmSQ_REDUC_RING_ITEMSIZE                        0xA231
+#define mmSQ_GS_VERT_ITEMSIZE                           0xA232
+#define mmSQ_VTX_SEMANTIC_CLEAR                         0xA238
+
+#define mmSQ_VTX_SEMANTIC_0                             0xA0E0
+#define mmSQ_VTX_SEMANTIC_1                             0xA0E1
+#define mmSQ_VTX_SEMANTIC_2                             0xA0E2
+#define mmSQ_VTX_SEMANTIC_3                             0xA0E3
+#define mmSQ_VTX_SEMANTIC_4                             0xA0E4
+#define mmSQ_VTX_SEMANTIC_5                             0xA0E5
+#define mmSQ_VTX_SEMANTIC_6                             0xA0E6
+#define mmSQ_VTX_SEMANTIC_7                             0xA0E7
+#define mmSQ_VTX_SEMANTIC_8                             0xA0E8
+#define mmSQ_VTX_SEMANTIC_9                             0xA0E9
+#define mmSQ_VTX_SEMANTIC_10                            0xA0EA
+#define mmSQ_VTX_SEMANTIC_11                            0xA0EB
+#define mmSQ_VTX_SEMANTIC_12                            0xA0EC
+#define mmSQ_VTX_SEMANTIC_13                            0xA0ED
+#define mmSQ_VTX_SEMANTIC_14                            0xA0EE
+#define mmSQ_VTX_SEMANTIC_15                            0xA0EF
+#define mmSQ_VTX_SEMANTIC_16                            0xA0F0
+#define mmSQ_VTX_SEMANTIC_17                            0xA0F1
+#define mmSQ_VTX_SEMANTIC_18                            0xA0F2
+#define mmSQ_VTX_SEMANTIC_19                            0xA0F3
+#define mmSQ_VTX_SEMANTIC_20                            0xA0F4
+#define mmSQ_VTX_SEMANTIC_21                            0xA0F5
+#define mmSQ_VTX_SEMANTIC_22                            0xA0F6
+#define mmSQ_VTX_SEMANTIC_23                            0xA0F7
+#define mmSQ_VTX_SEMANTIC_24                            0xA0F8
+#define mmSQ_VTX_SEMANTIC_25                            0xA0F9
+#define mmSQ_VTX_SEMANTIC_26                            0xA0FA
+#define mmSQ_VTX_SEMANTIC_27                            0xA0FB
+#define mmSQ_VTX_SEMANTIC_28                            0xA0FC
+#define mmSQ_VTX_SEMANTIC_29                            0xA0FD
+#define mmSQ_VTX_SEMANTIC_30                            0xA0FE
+#define mmSQ_VTX_SEMANTIC_31                            0xA0FF
+
+#define mmSQ_ALU_CONST_CACHE_PS_0                       0xA250
+#define mmSQ_ALU_CONST_CACHE_PS_1                       0xA251
+#define mmSQ_ALU_CONST_CACHE_PS_2                       0xA252
+#define mmSQ_ALU_CONST_CACHE_PS_3                       0xA253
+#define mmSQ_ALU_CONST_CACHE_PS_4                       0xA254
+#define mmSQ_ALU_CONST_CACHE_PS_5                       0xA255
+#define mmSQ_ALU_CONST_CACHE_PS_6                       0xA256
+#define mmSQ_ALU_CONST_CACHE_PS_7                       0xA257
+#define mmSQ_ALU_CONST_CACHE_PS_8                       0xA258
+#define mmSQ_ALU_CONST_CACHE_PS_9                       0xA259
+#define mmSQ_ALU_CONST_CACHE_PS_10                      0xA25A
+#define mmSQ_ALU_CONST_CACHE_PS_11                      0xA25B
+#define mmSQ_ALU_CONST_CACHE_PS_12                      0xA25C
+#define mmSQ_ALU_CONST_CACHE_PS_13                      0xA25D
+#define mmSQ_ALU_CONST_CACHE_PS_14                      0xA25E
+#define mmSQ_ALU_CONST_CACHE_PS_15                      0xA25F
+#define mmSQ_ALU_CONST_CACHE_VS_0                       0xA260
+#define mmSQ_ALU_CONST_CACHE_VS_1                       0xA261
+#define mmSQ_ALU_CONST_CACHE_VS_2                       0xA262
+#define mmSQ_ALU_CONST_CACHE_VS_3                       0xA263
+#define mmSQ_ALU_CONST_CACHE_VS_4                       0xA264
+#define mmSQ_ALU_CONST_CACHE_VS_5                       0xA265
+#define mmSQ_ALU_CONST_CACHE_VS_6                       0xA266
+#define mmSQ_ALU_CONST_CACHE_VS_7                       0xA267
+#define mmSQ_ALU_CONST_CACHE_VS_8                       0xA268
+#define mmSQ_ALU_CONST_CACHE_VS_9                       0xA269
+#define mmSQ_ALU_CONST_CACHE_VS_10                      0xA26A
+#define mmSQ_ALU_CONST_CACHE_VS_11                      0xA26B
+#define mmSQ_ALU_CONST_CACHE_VS_12                      0xA26C
+#define mmSQ_ALU_CONST_CACHE_VS_13                      0xA26D
+#define mmSQ_ALU_CONST_CACHE_VS_14                      0xA26E
+#define mmSQ_ALU_CONST_CACHE_VS_15                      0xA26F
+#define mmSQ_ALU_CONST_CACHE_GS_0                       0xA270
+#define mmSQ_ALU_CONST_CACHE_GS_1                       0xA271
+#define mmSQ_ALU_CONST_CACHE_GS_2                       0xA272
+#define mmSQ_ALU_CONST_CACHE_GS_3                       0xA273
+#define mmSQ_ALU_CONST_CACHE_GS_4                       0xA274
+#define mmSQ_ALU_CONST_CACHE_GS_5                       0xA275
+#define mmSQ_ALU_CONST_CACHE_GS_6                       0xA276
+#define mmSQ_ALU_CONST_CACHE_GS_7                       0xA277
+#define mmSQ_ALU_CONST_CACHE_GS_8                       0xA278
+#define mmSQ_ALU_CONST_CACHE_GS_9                       0xA279
+#define mmSQ_ALU_CONST_CACHE_GS_10                      0xA27A
+#define mmSQ_ALU_CONST_CACHE_GS_11                      0xA27B
+#define mmSQ_ALU_CONST_CACHE_GS_12                      0xA27C
+#define mmSQ_ALU_CONST_CACHE_GS_13                      0xA27D
+#define mmSQ_ALU_CONST_CACHE_GS_14                      0xA27E
+#define mmSQ_ALU_CONST_CACHE_GS_15                      0xA27F
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_0                 0xA050
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_1                 0xA051
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_2                 0xA052
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_3                 0xA053
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_4                 0xA054
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_5                 0xA055
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_6                 0xA056
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_7                 0xA057
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_8                 0xA058
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_9                 0xA059
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_10                0xA05A
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_11                0xA05B
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_12                0xA05C
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_13                0xA05D
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_14                0xA05E
+#define mmSQ_ALU_CONST_BUFFER_SIZE_PS_15                0xA05F
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_0                 0xA060
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_1                 0xA061
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_2                 0xA062
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_3                 0xA063
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_4                 0xA064
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_5                 0xA065
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_6                 0xA066
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_7                 0xA067
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_8                 0xA068
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_9                 0xA069
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_10                0xA06A
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_11                0xA06B
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_12                0xA06C
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_13                0xA06D
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_14                0xA06E
+#define mmSQ_ALU_CONST_BUFFER_SIZE_VS_15                0xA06F
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_0                 0xA070
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_1                 0xA071
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_2                 0xA072
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_3                 0xA073
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_4                 0xA074
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_5                 0xA075
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_6                 0xA076
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_7                 0xA077
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_8                 0xA078
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_9                 0xA079
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_10                0xA07A
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_11                0xA07B
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_12                0xA07C
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_13                0xA07D
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_14                0xA07E
+#define mmSQ_ALU_CONST_BUFFER_SIZE_GS_15                0xA07F
+
+#define mmSPI_VS_OUT_ID_0                               0xA185
+#define mmSPI_VS_OUT_ID_1                               0xA186
+#define mmSPI_VS_OUT_ID_2                               0xA187
+#define mmSPI_VS_OUT_ID_3                               0xA188
+#define mmSPI_VS_OUT_ID_4                               0xA189
+#define mmSPI_VS_OUT_ID_5                               0xA18A
+#define mmSPI_VS_OUT_ID_6                               0xA18B
+#define mmSPI_VS_OUT_ID_7                               0xA18C
+#define mmSPI_VS_OUT_ID_8                               0xA18D
+#define mmSPI_VS_OUT_ID_9                               0xA18E
+#define mmSPI_PS_INPUT_CNTL_0                           0xA191
+#define mmSPI_PS_INPUT_CNTL_1                           0xA192
+#define mmSPI_PS_INPUT_CNTL_2                           0xA193
+#define mmSPI_PS_INPUT_CNTL_3                           0xA194
+#define mmSPI_PS_INPUT_CNTL_4                           0xA195
+#define mmSPI_PS_INPUT_CNTL_5                           0xA196
+#define mmSPI_PS_INPUT_CNTL_6                           0xA197
+#define mmSPI_PS_INPUT_CNTL_7                           0xA198
+#define mmSPI_PS_INPUT_CNTL_8                           0xA199
+#define mmSPI_PS_INPUT_CNTL_9                           0xA19A
+#define mmSPI_PS_INPUT_CNTL_10                          0xA19B
+#define mmSPI_PS_INPUT_CNTL_11                          0xA19C
+#define mmSPI_PS_INPUT_CNTL_12                          0xA19D
+#define mmSPI_PS_INPUT_CNTL_13                          0xA19E
+#define mmSPI_PS_INPUT_CNTL_14                          0xA19F
+#define mmSPI_PS_INPUT_CNTL_15                          0xA1A0
+#define mmSPI_PS_INPUT_CNTL_16                          0xA1A1
+#define mmSPI_PS_INPUT_CNTL_17                          0xA1A2
+#define mmSPI_PS_INPUT_CNTL_18                          0xA1A3
+#define mmSPI_PS_INPUT_CNTL_19                          0xA1A4
+#define mmSPI_PS_INPUT_CNTL_20                          0xA1A5
+#define mmSPI_PS_INPUT_CNTL_21                          0xA1A6
+#define mmSPI_PS_INPUT_CNTL_22                          0xA1A7
+#define mmSPI_PS_INPUT_CNTL_23                          0xA1A8
+#define mmSPI_PS_INPUT_CNTL_24                          0xA1A9
+#define mmSPI_PS_INPUT_CNTL_25                          0xA1AA
+#define mmSPI_PS_INPUT_CNTL_26                          0xA1AB
+#define mmSPI_PS_INPUT_CNTL_27                          0xA1AC
+#define mmSPI_PS_INPUT_CNTL_28                          0xA1AD
+#define mmSPI_PS_INPUT_CNTL_29                          0xA1AE
+#define mmSPI_PS_INPUT_CNTL_30                          0xA1AF
+#define mmSPI_PS_INPUT_CNTL_31                          0xA1B0
+#define mmSPI_VS_OUT_CONFIG                             0xA1B1
+#define mmSPI_THREAD_GROUPING                           0xA1B2
+#define mmSPI_PS_IN_CONTROL_0                           0xA1B3
+#define mmSPI_PS_IN_CONTROL_1                           0xA1B4
+#define mmSPI_INTERP_CONTROL_0                          0xA1B5
+#define mmSPI_INPUT_Z                                   0xA1B6
+#define mmSPI_FOG_CNTL                                  0xA1B7
+#define mmSPI_FOG_FUNC_SCALE                            0xA1B8
+#define mmSPI_FOG_FUNC_BIAS                             0xA1B9
+
+#define mmSX_MISC                                       0xA0D4
+#define mmSX_ALPHA_TEST_CONTROL                         0xA104
+#define mmSX_ALPHA_REF                                  0xA10E
+
+#define mmDB_DEPTH_BASE                                 0xA003
+#define mmDB_DEPTH_INFO                                 0xA004
+#define mmDB_HTILE_DATA_BASE                            0xA005
+#define mmDB_DEPTH_SIZE                                 0xA000
+#define mmDB_DEPTH_VIEW                                 0xA001
+#define mmDB_RENDER_CONTROL                             0xA343
+#define mmDB_RENDER_OVERRIDE                            0xA344
+#define mmDB_SHADER_CONTROL                             0xA203
+#define mmDB_STENCIL_CLEAR                              0xA00A
+#define mmDB_DEPTH_CLEAR                                0xA00B
+#define mmDB_HTILE_SURFACE                              0xA349
+#define mmDB_PRELOAD_CONTROL                            0xA34C
+#define mmDB_PREFETCH_LIMIT                             0xA34D
+#define mmDB_STENCILREFMASK                             0xA10C
+#define mmDB_STENCILREFMASK_BF                          0xA10D
+#define mmDB_SRESULTS_COMPARE_STATE0                    0xA34A
+#define mmDB_SRESULTS_COMPARE_STATE1                    0xA34B
+#define mmDB_DEPTH_CONTROL                              0xA200
+#define mmDB_ALPHA_TO_MASK                              0xA351
+
+#define mmCB_CLEAR_RED_R6XX                             0xA048
+#define mmCB_CLEAR_GREEN_R6XX                           0xA049
+#define mmCB_CLEAR_BLUE_R6XX                            0xA04A
+#define mmCB_CLEAR_ALPHA_R6XX                           0xA04B
+#define mmCB_BLEND_RED                                  0xA105
+#define mmCB_BLEND_GREEN                                0xA106
+#define mmCB_BLEND_BLUE                                 0xA107
+#define mmCB_BLEND_ALPHA                                0xA108
+#define mmCB_FOG_RED_R6XX                               0xA109
+#define mmCB_FOG_GREEN_R6XX                             0xA10A
+#define mmCB_FOG_BLUE_R6XX                              0xA10B
+#define mmCB_BLEND_CONTROL                              0xA201
+#define mmCB_COLOR_CONTROL                              0xA202
+#define mmCB_BLEND0_CONTROL                             0xA1E0
+#define mmCB_BLEND1_CONTROL                             0xA1E1
+#define mmCB_BLEND2_CONTROL                             0xA1E2
+#define mmCB_BLEND3_CONTROL                             0xA1E3
+#define mmCB_BLEND4_CONTROL                             0xA1E4
+#define mmCB_BLEND5_CONTROL                             0xA1E5
+#define mmCB_BLEND6_CONTROL                             0xA1E6
+#define mmCB_BLEND7_CONTROL                             0xA1E7
+#define mmCB_CLRCMP_CONTROL                             0xA30C
+#define mmCB_CLRCMP_SRC                                 0xA30D
+#define mmCB_CLRCMP_DST                                 0xA30E
+#define mmCB_CLRCMP_MSK                                 0xA30F
+#define mmCB_COLOR0_BASE                                0xA010
+#define mmCB_COLOR1_BASE                                0xA011
+#define mmCB_COLOR2_BASE                                0xA012
+#define mmCB_COLOR3_BASE                                0xA013
+#define mmCB_COLOR4_BASE                                0xA014
+#define mmCB_COLOR5_BASE                                0xA015
+#define mmCB_COLOR6_BASE                                0xA016
+#define mmCB_COLOR7_BASE                                0xA017
+#define mmCB_COLOR0_SIZE                                0xA018
+#define mmCB_COLOR1_SIZE                                0xA019
+#define mmCB_COLOR2_SIZE                                0xA01A
+#define mmCB_COLOR3_SIZE                                0xA01B
+#define mmCB_COLOR4_SIZE                                0xA01C
+#define mmCB_COLOR5_SIZE                                0xA01D
+#define mmCB_COLOR6_SIZE                                0xA01E
+#define mmCB_COLOR7_SIZE                                0xA01F
+#define mmCB_COLOR0_VIEW                                0xA020
+#define mmCB_COLOR1_VIEW                                0xA021
+#define mmCB_COLOR2_VIEW                                0xA022
+#define mmCB_COLOR3_VIEW                                0xA023
+#define mmCB_COLOR4_VIEW                                0xA024
+#define mmCB_COLOR5_VIEW                                0xA025
+#define mmCB_COLOR6_VIEW                                0xA026
+#define mmCB_COLOR7_VIEW                                0xA027
+#define mmCB_COLOR0_INFO                                0xA028
+#define mmCB_COLOR1_INFO                                0xA029
+#define mmCB_COLOR2_INFO                                0xA02A
+#define mmCB_COLOR3_INFO                                0xA02B
+#define mmCB_COLOR4_INFO                                0xA02C
+#define mmCB_COLOR5_INFO                                0xA02D
+#define mmCB_COLOR6_INFO                                0xA02E
+#define mmCB_COLOR7_INFO                                0xA02F
+#define mmCB_COLOR0_TILE                                0xA030
+#define mmCB_COLOR1_TILE                                0xA031
+#define mmCB_COLOR2_TILE                                0xA032
+#define mmCB_COLOR3_TILE                                0xA033
+#define mmCB_COLOR4_TILE                                0xA034
+#define mmCB_COLOR5_TILE                                0xA035
+#define mmCB_COLOR6_TILE                                0xA036
+#define mmCB_COLOR7_TILE                                0xA037
+#define mmCB_COLOR0_FRAG                                0xA038
+#define mmCB_COLOR1_FRAG                                0xA039
+#define mmCB_COLOR2_FRAG                                0xA03A
+#define mmCB_COLOR3_FRAG                                0xA03B
+#define mmCB_COLOR4_FRAG                                0xA03C
+#define mmCB_COLOR5_FRAG                                0xA03D
+#define mmCB_COLOR6_FRAG                                0xA03E
+#define mmCB_COLOR7_FRAG                                0xA03F
+#define mmCB_COLOR0_MASK                                0xA040
+#define mmCB_COLOR1_MASK                                0xA041
+#define mmCB_COLOR2_MASK                                0xA042
+#define mmCB_COLOR3_MASK                                0xA043
+#define mmCB_COLOR4_MASK                                0xA044
+#define mmCB_COLOR5_MASK                                0xA045
+#define mmCB_COLOR6_MASK                                0xA046
+#define mmCB_COLOR7_MASK                                0xA047
+#define mmCB_CLEAR_RED_R6XX                             0xA048
+#define mmCB_CLEAR_GREEN_R6XX                           0xA049
+#define mmCB_CLEAR_BLUE_R6XX                            0xA04A
+#define mmCB_CLEAR_ALPHA_R6XX                           0xA04B
+#define mmCB_TARGET_MASK                                0xA08E
+#define mmCB_SHADER_MASK                                0xA08F
+#define mmCB_SHADER_CONTROL                             0xA1E8
+
+#define mmSQ_VTX_BASE_VTX_LOC                           0xF3FC
+#define mmSQ_VTX_START_INST_LOC                         0xF3FD
+
+#endif /* _R700_CHIPOFFSET_H_ */
+
diff --git a/src/mesa/drivers/dri/r600/r700_clear.c b/src/mesa/drivers/dri/r600/r700_clear.c
new file mode 100644
index 00000000000..e84be386221
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_clear.c
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+ 
+#include "main/glheader.h"
+#include "main/context.h"
+#include "main/macros.h"
+#include "main/imports.h"
+#include "main/mtypes.h"
+#include "main/enums.h"
+
+#include "radeon_lock.h"
+#include "r600_context.h"
+
+#include "r700_shaderinst.h"
+#include "r600_emit.h"
+#include "r700_clear.h"
+
+static GLboolean r700ClearFast(context_t *context, GLbitfield mask)
+{
+    /* TODO, fast clear need implementation */
+    return GL_FALSE;
+}
+
+#define R600_NEWPRIM( rmesa )			\
+  do {						\
+  if ( rmesa->radeon.dma.flush )			\
+    rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
+  } while (0)
+
+void r700Clear(GLcontext * ctx, GLbitfield mask)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    __DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
+    const GLuint colorMask = *((GLuint *) & ctx->Color.ColorMask);
+    GLbitfield swrast_mask = 0, tri_mask = 0;
+    int i;
+    struct gl_framebuffer *fb = ctx->DrawBuffer;
+
+    if( GL_TRUE == r700ClearFast(context, mask) )
+    {
+        return;
+    }
+	if (!context->radeon.radeonScreen->driScreen->dri2.enabled) {
+		LOCK_HARDWARE(&context->radeon);
+		UNLOCK_HARDWARE(&context->radeon);
+		if (dPriv->numClipRects == 0)
+			return;
+	}
+
+	R600_NEWPRIM(context);
+
+	if (colorMask == ~0)
+	  tri_mask |= (mask & BUFFER_BITS_COLOR);
+	else
+	  tri_mask |= (mask & (BUFFER_BIT_FRONT_LEFT | BUFFER_BIT_BACK_LEFT));
+
+
+	/* HW stencil */
+	if (mask & BUFFER_BIT_STENCIL) {
+		tri_mask |= BUFFER_BIT_STENCIL;
+	}
+
+	/* HW depth */
+	if (mask & BUFFER_BIT_DEPTH) {
+    	        tri_mask |= BUFFER_BIT_DEPTH;
+	}
+
+	/* If we're doing a tri pass for depth/stencil, include a likely color
+	 * buffer with it.
+	 */
+
+	for (i = 0; i < BUFFER_COUNT; i++) {
+	  GLuint bufBit = 1 << i;
+	  if ((tri_mask) & bufBit) {
+	    if (!fb->Attachment[i].Renderbuffer->ClassID) {
+	      tri_mask &= ~bufBit;
+	      swrast_mask |= bufBit;
+	    }
+	  }
+	}
+
+	/* SW fallback clearing */
+	swrast_mask = mask & ~tri_mask;
+
+	if (tri_mask) {
+		radeonUserClear(ctx, tri_mask);
+	}
+
+	if (swrast_mask) {
+		if (RADEON_DEBUG & DEBUG_FALLBACKS)
+			fprintf(stderr, "%s: swrast clear, mask: %x\n",
+				__FUNCTION__, swrast_mask);
+		_swrast_Clear(ctx, swrast_mask);
+	}
+
+}
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_clear.h b/src/mesa/drivers/dri/r600/r700_clear.h
new file mode 100644
index 00000000000..bed1d3a90e5
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_clear.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#ifndef __r700_CLEAR_H__
+#define __r700_CLEAR_H__
+
+extern void r700Clear(GLcontext * ctx, GLbitfield mask);
+
+#endif /* __r700_CLEAR_H__ */
diff --git a/src/mesa/drivers/dri/r600/r700_debug.c b/src/mesa/drivers/dri/r600/r700_debug.c
new file mode 100644
index 00000000000..ecdb75ad482
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_debug.c
@@ -0,0 +1,83 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include "main/glheader.h"
+
+#include "r700_debug.h"
+#include "r600_context.h"
+
+void NormalizeLogErrorCode(int nError)
+{
+    //TODO
+}
+
+void r700_error(int nLocalError, char* fmt, ...)
+{
+    va_list args;
+
+    NormalizeLogErrorCode(nLocalError);
+
+	va_start(args, fmt);
+    fprintf(stderr, fmt, args);
+    va_end(args);
+}
+
+void DumpHwBinary(int type, void *addr, int size)
+{
+    int i;
+    unsigned int *pHw = (unsigned int *)addr;
+
+    return;
+
+    switch (type)
+    {
+        case DUMP_PIXEL_SHADER:
+            DEBUGF("Pixel Shader\n");
+        break;
+        case DUMP_VERTEX_SHADER:
+            DEBUGF("Vertex Shader\n");
+        break;
+        case DUMP_FETCH_SHADER:
+            DEBUGF("Fetch Shader\n");
+        break;
+    }
+
+    for (i = 0; i < size; i++)
+    {
+        DEBUGP("0x%08x,\t", *pHw);
+        if (i%4 == 3)
+            DEBUGP("\n", *pHw);
+        pHw++;
+
+    }
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_debug.h b/src/mesa/drivers/dri/r600/r700_debug.h
new file mode 100644
index 00000000000..e810e6da083
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_debug.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_DEBUG_H_
+#define _R700_DEBUG_H_
+
+enum R700_ERROR
+{
+    ERROR_ASM_VTX_CLAUSE     = 0x1000,
+    ERROR_ASM_UNKOWNCLAUSE   = 0x1001,
+    ERROR_ASM_ALLOCEXPORTCF  = 0x1002,
+    ERROR_ASM_ALLOCVTXCF     = 0x1003,
+    ERROR_ASM_ALLOCTEXCF     = 0x1004,
+    ERROR_ASM_ALLOCALUCF     = 0x1005,
+    ERROR_ASM_UNKNOWNILINST  = 0x1006,
+    ERROR_ASM_SRCARGUMENT    = 0x1007,
+    ERROR_ASM_DSTARGUMENT    = 0x1008,
+    ERROR_ASM_TEXINSTRUCTION = 0x1009,
+    ERROR_ASM_ALUINSTRUCTION = 0x100A,
+    ERROR_ASM_INSTDSTTRACK   = 0x100B,
+    ERROR_ASM_TEXDSTBADTYPE  = 0x100C,
+    ERROR_ASM_ALUSRCBADTYPE  = 0x100D,
+    ERROR_ASM_ALUSRCSELECT   = 0x100E,
+    ERROR_ASM_ALUSRCNUMBER   = 0x100F,
+    ERROR_ASM_ALUDSTBADTYPE  = 0x1010,
+    ERROR_ASM_CONSTCHANNEL   = 0x1011,
+    ERROR_ASM_BADSCALARBZ    = 0x1012,
+    ERROR_ASM_BADGPRRESERVE  = 0x1013,
+    ERROR_ASM_BADVECTORBZ    = 0x1014,
+    ERROR_ASM_BADTEXINST     = 0x1015,
+    ERROR_ASM_BADTEXSRC      = 0x1016,
+    ERROR_ASM_BADEXPORTTYPE  = 0x1017,
+
+
+    TODO_ASM_CONSTTEXADDR   = 0x8000,
+    TODO_ASM_NEEDIMPINST    = 0x8001,
+    TODO_ASM_TXB            = 0x8002,
+    TODO_ASM_TXP            = 0x8003
+};
+
+enum R700_DUMP_TYPE
+{
+    DUMP_VERTEX_SHADER      = 0x1,
+    DUMP_PIXEL_SHADER       = 0x2,
+    DUMP_FETCH_SHADER       = 0x4,
+};
+
+#define DEBUGF printf
+#define DEBUGP printf
+
+void NormalizeLogErrorCode(int nError);
+/*NormalizeLogErrorCode(nLocalError); */
+void r700_error(int nLocalError, char *fmt, ...);      
+extern void DumpHwBinary(int, void *, int);
+
+#ifdef STANDALONE_COMPILER
+#ifdef __cplusplus
+extern "C"
+{
+#endif //__cplusplus
+
+void LogString(char* szStr);
+
+#ifdef __cplusplus
+}
+#endif //__cplusplus
+#endif /*STANDALONE_COMPILER*/
+
+#endif /*_R700_DEBUG_H_*/
diff --git a/src/mesa/drivers/dri/r600/r700_driconf.h b/src/mesa/drivers/dri/r600/r700_driconf.h
new file mode 100644
index 00000000000..a9e21523449
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_driconf.h
@@ -0,0 +1,33 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_DRICONF_H_
+#define _R700_DRICONF_H_
+
+#define DRI_CONF_FP_OPTIMIZATION_SPEED   0
+#define DRI_CONF_FP_OPTIMIZATION_QUALITY 1
+
+#endif /* _R700_DRICONF_H_ */
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.c b/src/mesa/drivers/dri/r600/r700_fragprog.c
new file mode 100644
index 00000000000..44de2aebee6
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.c
@@ -0,0 +1,369 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/imports.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r700_fragprog.h"
+
+#include "r700_debug.h"
+
+//TODO : Validate FP input with VP output.
+void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
+						  struct gl_fragment_program *mesa_fp)
+{
+	unsigned int unBit;
+    unsigned int i;
+    GLuint       ui;
+
+	pAsm->number_used_registers = 0;
+
+//Input mapping : mesa_fp->Base.InputsRead set the flag, set in 
+	//The flags parsed in parse_attrib_binding. FRAG_ATTRIB_COLx, FRAG_ATTRIB_TEXx, ...
+	//MUST match order in Map_Vertex_Output
+	unBit = 1 << FRAG_ATTRIB_COL0;
+	if(mesa_fp->Base.InputsRead & unBit)
+	{
+		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL0] = pAsm->number_used_registers++;
+	}
+
+	unBit = 1 << FRAG_ATTRIB_COL1;
+	if(mesa_fp->Base.InputsRead & unBit)
+	{
+		pAsm->uiFP_AttributeMap[FRAG_ATTRIB_COL1] = pAsm->number_used_registers++;
+	}
+
+	for(i=0; i<8; i++)
+	{
+		unBit = 1 << (FRAG_ATTRIB_TEX0 + i);
+		if(mesa_fp->Base.InputsRead & unBit)
+		{
+			pAsm->uiFP_AttributeMap[FRAG_ATTRIB_TEX0 + i] = pAsm->number_used_registers++;
+		}
+	}
+
+/* Map temporary registers (GPRs) */
+    pAsm->starting_temp_register_number = pAsm->number_used_registers;
+
+    if(mesa_fp->Base.NumNativeTemporaries >= mesa_fp->Base.NumTemporaries)
+    {
+	    pAsm->number_used_registers += mesa_fp->Base.NumNativeTemporaries;
+    }
+    else
+    {
+        pAsm->number_used_registers += mesa_fp->Base.NumTemporaries;
+    }
+
+/* Output mapping */
+	pAsm->number_of_exports = 0;
+	pAsm->number_of_colorandz_exports = 0; /* don't include stencil and mask out. */
+	pAsm->starting_export_register_number = pAsm->number_used_registers;
+	unBit = 1 << FRAG_RESULT_COLOR;
+	if(mesa_fp->Base.OutputsWritten & unBit)
+	{
+		pAsm->uiFP_OutputMap[FRAG_RESULT_COLOR] = pAsm->number_used_registers++;
+		pAsm->number_of_exports++;
+		pAsm->number_of_colorandz_exports++;
+	}
+	unBit = 1 << FRAG_RESULT_DEPTH;
+	if(mesa_fp->Base.OutputsWritten & unBit)
+	{
+        pAsm->depth_export_register_number = pAsm->number_used_registers;
+		pAsm->uiFP_OutputMap[FRAG_RESULT_DEPTH] = pAsm->number_used_registers++;
+		pAsm->number_of_exports++;
+		pAsm->number_of_colorandz_exports++;
+	}
+
+    pAsm->pucOutMask = (unsigned char*) MALLOC(pAsm->number_of_exports);    
+    for(ui=0; ui<pAsm->number_of_exports; ui++)
+    {
+        pAsm->pucOutMask[ui] = 0x0;
+    }
+	
+	pAsm->uFirstHelpReg = pAsm->number_used_registers;
+}
+
+GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
+					                	struct gl_fragment_program   *mesa_fp)
+{
+    GLuint i, j;
+    GLint * puiTEMPwrites;
+    struct prog_instruction * pILInst;
+    InstDeps         *pInstDeps;
+    struct prog_instruction * texcoord_DepInst;
+    GLint              nDepInstID;
+
+    puiTEMPwrites = (GLint*) MALLOC(sizeof(GLuint)*mesa_fp->Base.NumTemporaries);
+    for(i=0; i<mesa_fp->Base.NumTemporaries; i++)
+    {
+        puiTEMPwrites[i] = -1;
+    }
+
+    pInstDeps = (InstDeps*)MALLOC(sizeof(InstDeps)*mesa_fp->Base.NumInstructions);
+
+    for(i=0; i<mesa_fp->Base.NumInstructions; i++)
+    {
+        pInstDeps[i].nDstDep = -1;
+        pILInst = &(mesa_fp->Base.Instructions[i]);
+
+        //Dst
+        if(pILInst->DstReg.File == PROGRAM_TEMPORARY)
+        {
+            //Set lastwrite for the temp
+            puiTEMPwrites[pILInst->DstReg.Index] = i;
+        }
+
+        //Src
+        for(j=0; j<3; j++)
+        {
+            if(pILInst->SrcReg[j].File == PROGRAM_TEMPORARY)
+            {
+                //Set dep.
+                pInstDeps[i].nSrcDeps[j] = puiTEMPwrites[pILInst->SrcReg[j].Index];
+            }
+            else
+            {
+                pInstDeps[i].nSrcDeps[j] = -1;
+            }
+        }
+    }
+
+    fp->r700AsmCode.pInstDeps = pInstDeps;
+
+    FREE(puiTEMPwrites);
+
+    //Find dep for tex inst    
+    for(i=0; i<mesa_fp->Base.NumInstructions; i++)
+    {
+        pILInst = &(mesa_fp->Base.Instructions[i]);
+
+        if(GL_TRUE == IsTex(pILInst->Opcode))
+        {   //src0 is the tex coord register, src1 is texunit, src2 is textype
+            nDepInstID = pInstDeps[i].nSrcDeps[0];
+            if(nDepInstID >= 0)
+            {
+                texcoord_DepInst = &(mesa_fp->Base.Instructions[nDepInstID]);
+                if(GL_TRUE == IsAlu(texcoord_DepInst->Opcode) )
+                {
+                    pInstDeps[nDepInstID].nDstDep = i;
+                    pInstDeps[i].nDstDep = i;
+                }
+                else if(GL_TRUE == IsTex(texcoord_DepInst->Opcode) )
+                {
+                    pInstDeps[i].nDstDep = i;
+                }
+                else
+                {   //... other deps?
+                }
+            }
+        }
+	}
+
+    return GL_TRUE;
+}
+
+GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp,
+							     struct gl_fragment_program   *mesa_fp)
+{
+	GLuint    number_of_colors_exported;
+	GLboolean z_enabled = GL_FALSE;
+	GLuint    unBit;
+
+    //Init_Program
+	Init_r700_AssemblerBase( SPT_FP, &(fp->r700AsmCode), &(fp->r700Shader) );
+	Map_Fragment_Program(&(fp->r700AsmCode), mesa_fp);
+
+    if( GL_FALSE == Find_Instruction_Dependencies_fp(fp, mesa_fp) )
+	{
+		return GL_FALSE;
+    }
+	
+	if( GL_FALSE == AssembleInstr(mesa_fp->Base.NumInstructions,
+                                  &(mesa_fp->Base.Instructions[0]), 
+                                  &(fp->r700AsmCode)) )
+	{
+		return GL_FALSE;
+	}
+
+    if(GL_FALSE == Process_Fragment_Exports(&(fp->r700AsmCode), mesa_fp->Base.OutputsWritten) )
+    {
+        return GL_FALSE;
+    }
+
+    fp->r700Shader.nRegs = (fp->r700AsmCode.number_used_registers == 0) ? 0 
+                         : (fp->r700AsmCode.number_used_registers - 1);
+
+	fp->r700Shader.nParamExports = fp->r700AsmCode.number_of_exports;
+
+	number_of_colors_exported = fp->r700AsmCode.number_of_colorandz_exports;
+
+	unBit = 1 << FRAG_RESULT_DEPTH;
+	if(mesa_fp->Base.OutputsWritten & unBit)
+	{
+		z_enabled = GL_TRUE;
+		number_of_colors_exported--;
+	}
+
+	fp->r700Shader.exportMode = number_of_colors_exported << 1 | z_enabled;
+
+    fp->translated = GL_TRUE;
+
+	return GL_TRUE;
+}
+
+void * r700GetActiveFpShaderBo(GLcontext * ctx)
+{
+    struct r700_fragment_program *fp = (struct r700_fragment_program *)
+	                                   (ctx->FragmentProgram._Current);
+
+    return fp->shaderbo;
+}
+
+GLboolean r700SetupFragmentProgram(GLcontext * ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);   
+    BATCH_LOCALS(&context->radeon);
+    
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    struct r700_fragment_program *fp = (struct r700_fragment_program *)
+	                                   (ctx->FragmentProgram._Current);
+
+    struct gl_program_parameter_list *paramList;
+    unsigned int unNumParamData;
+    unsigned int ui;
+
+    unsigned int unNumOfReg;
+    
+    if(GL_FALSE == fp->loaded)
+    {
+        if(fp->r700Shader.bNeedsAssembly == GL_TRUE)
+	    {
+		    Assemble( &(fp->r700Shader) );
+	    }
+
+        /* Load fp to gpu */
+        r600EmitShader(ctx, 
+                       &(fp->shaderbo), 
+                       (GLvoid *)(fp->r700Shader.pProgram),
+                       fp->r700Shader.uShaderBinaryDWORDSize,
+                       "FS");    			                
+
+        fp->loaded = GL_TRUE;
+    }
+
+    DumpHwBinary(DUMP_PIXEL_SHADER, (GLvoid *)(fp->r700Shader.pProgram),
+                 fp->r700Shader.uShaderBinaryDWORDSize);
+
+    /* TODO : enable this after MemUse fixed *=
+    (context->chipobj.MemUse)(context, fp->shadercode.buf->id);
+    */
+
+    r700->ps.SQ_PGM_START_PS.u32All = 0; /* set from buffer obj */
+
+    unNumOfReg = fp->r700Shader.nRegs + 1;
+
+    ui = (r700->SPI_PS_IN_CONTROL_0.u32All & NUM_INTERP_mask) / (1 << NUM_INTERP_shift);
+
+    ui = ui ? ui : unNumOfReg;
+
+    SETfield(r700->ps.SQ_PGM_RESOURCES_PS.u32All, ui, NUM_GPRS_shift, NUM_GPRS_mask); 
+    
+    CLEARbit(r700->ps.SQ_PGM_RESOURCES_PS.u32All, UNCACHED_FIRST_INST_bit);
+
+    if(fp->r700Shader.uStackSize) /* we don't use branch for now, it should be zero. */
+	{
+        SETfield(r700->ps.SQ_PGM_RESOURCES_PS.u32All, fp->r700Shader.uStackSize,
+                 STACK_SIZE_shift, STACK_SIZE_mask);
+    }
+
+    SETfield(r700->ps.SQ_PGM_EXPORTS_PS.u32All, fp->r700Shader.exportMode,
+             EXPORT_MODE_shift, EXPORT_MODE_mask);
+
+    if(fp->r700Shader.killIsUsed)
+    {
+	    SETbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+    }
+    else
+    {
+        CLEARbit(r700->DB_SHADER_CONTROL.u32All, KILL_ENABLE_bit);
+    }
+
+    if(fp->r700Shader.depthIsExported)
+    {
+	    SETbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit); 
+    }
+    else
+    {
+        CLEARbit(r700->DB_SHADER_CONTROL.u32All, Z_EXPORT_ENABLE_bit);
+    }
+
+    /* sent out shader constants. */
+
+    paramList = fp->mesa_program.Base.Parameters;
+
+    if(NULL != paramList)
+    {
+        _mesa_load_state_parameters(ctx, paramList);
+
+        unNumParamData = paramList->NumParameters * 4;
+
+        BEGIN_BATCH_NO_AUTOSTATE(2 + unNumParamData);
+        
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, unNumParamData));
+
+        /* assembler map const from very beginning. */
+        R600_OUT_BATCH(SQ_ALU_CONSTANT_PS_OFFSET * 4);
+
+        unNumParamData = paramList->NumParameters;
+
+        for(ui=0; ui<unNumParamData; ui++)
+        {
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][0])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][1])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][2])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][3])));
+        }
+        END_BATCH();
+        COMMIT_BATCH();
+    }
+
+    return GL_TRUE;
+}
+
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_fragprog.h b/src/mesa/drivers/dri/r600/r700_fragprog.h
new file mode 100644
index 00000000000..9c7813e9084
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_fragprog.h
@@ -0,0 +1,63 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_FRAGPROG_H_
+#define _R700_FRAGPROG_H_
+
+#include "r600_context.h"
+#include "r700_assembler.h"
+
+struct r700_fragment_program
+{
+	struct gl_fragment_program mesa_program;
+
+    r700_AssemblerBase r700AsmCode;
+	R700_Shader        r700Shader;
+
+	GLboolean translated;
+    GLboolean loaded;
+	GLboolean error;
+
+    void * shaderbo;
+
+	GLboolean WritesDepth;
+	GLuint optimization;
+};
+
+/* Internal */
+void Map_Fragment_Program(r700_AssemblerBase         *pAsm,
+						  struct gl_fragment_program *mesa_fp);
+GLboolean Find_Instruction_Dependencies_fp(struct r700_fragment_program *fp,
+					                	   struct gl_fragment_program   *mesa_fp);
+
+/* Interface */
+extern GLboolean r700TranslateFragmentShader(struct r700_fragment_program *fp,
+							                 struct gl_fragment_program   *mesa_vp);
+extern GLboolean r700SetupFragmentProgram(GLcontext * ctx);
+
+extern void *    r700GetActiveFpShaderBo(GLcontext * ctx);
+
+#endif /*_R700_FRAGPROG_H_*/
diff --git a/src/mesa/drivers/dri/r600/r700_ioctl.c b/src/mesa/drivers/dri/r600/r700_ioctl.c
new file mode 100644
index 00000000000..c4795320012
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_ioctl.c
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#include <sched.h>
+#include <errno.h>
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "swrast/swrast.h"
+
+#include "radeon_common.h"
+#include "radeon_lock.h"
+#include "r600_context.h"
+
+#include "r700_ioctl.h"
+#include "r700_clear.h"
+
+static void r700Flush(GLcontext *ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+    context_t *     context = R700_CONTEXT(ctx);
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, radeon->cmdbuf.cs->cdw);
+
+	/* okay if we have no cmds in the buffer &&
+	   we have no DMA flush &&
+	   we have no DMA buffer allocated.
+	   then no point flushing anything at all.
+	*/
+	if (!radeon->dma.flush && !radeon->cmdbuf.cs->cdw && !radeon->dma.current)
+		return;
+
+	if (radeon->dma.flush)
+		radeon->dma.flush( ctx );
+
+	r700SendContextStates(context);
+
+	if (radeon->cmdbuf.cs->cdw)
+		rcommonFlushCmdBuf(radeon, __FUNCTION__);
+}
+
+void r700InitIoctlFuncs(struct dd_function_table *functions)
+{
+	functions->Clear = r700Clear;
+	functions->Finish = radeonFinish;
+	functions->Flush = r700Flush;
+}
diff --git a/src/mesa/drivers/dri/r600/r700_ioctl.h b/src/mesa/drivers/dri/r600/r700_ioctl.h
new file mode 100644
index 00000000000..414dc3e23e3
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_ioctl.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#ifndef __R700_IOCTL_H__
+#define __R700_IOCTL_H__
+
+#include "r600_context.h"
+#include "radeon_drm.h"
+
+extern void r700InitIoctlFuncs(struct dd_function_table *functions);
+
+#endif				/* __R700_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.c b/src/mesa/drivers/dri/r600/r700_oglprog.c
new file mode 100644
index 00000000000..36de143b1a7
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_oglprog.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#include <string.h>
+
+#include "main/glheader.h"
+#include "main/imports.h"
+
+#include "shader/program.h"
+#include "tnl/tnl.h"
+
+#include "r600_context.h"
+
+#include "r700_oglprog.h"
+#include "r700_fragprog.h"
+#include "r700_vertprog.h"
+
+
+static struct gl_program *r700NewProgram(GLcontext * ctx, 
+                                         GLenum target,
+					                     GLuint id)
+{
+	struct gl_program *pProgram = NULL;
+
+    struct r700_vertex_program *vp;
+	struct r700_fragment_program *fp;
+
+    switch (target) 
+    {
+    case GL_VERTEX_STATE_PROGRAM_NV:
+    case GL_VERTEX_PROGRAM_ARB:	    
+        vp       = CALLOC_STRUCT(r700_vertex_program);
+	    pProgram = _mesa_init_vertex_program(ctx, 
+                                             &vp->mesa_program,
+					                         target, 
+                                             id);
+        vp->translated = GL_FALSE;
+        vp->loaded     = GL_FALSE;
+ 
+        vp->shaderbo   = NULL;
+
+	    break;
+    case GL_FRAGMENT_PROGRAM_NV:
+    case GL_FRAGMENT_PROGRAM_ARB:
+		fp       = CALLOC_STRUCT(r700_fragment_program);
+		pProgram = _mesa_init_fragment_program(ctx, 
+                                               &fp->mesa_program,
+						                       target, 
+                                               id);
+        fp->translated = GL_FALSE;
+        fp->loaded     = GL_FALSE;
+
+        fp->shaderbo   = NULL;
+
+	    break;
+    default:
+	    _mesa_problem(ctx, "Bad target in r700NewProgram");
+    }
+
+	return pProgram;
+}
+
+static void r700DeleteProgram(GLcontext * ctx, struct gl_program *prog)
+{
+    struct r700_vertex_program   * vp;
+    struct r700_fragment_program * fp;
+    context_t *context = R700_CONTEXT(ctx);
+
+    switch (prog->Target) 
+    {
+    case GL_VERTEX_STATE_PROGRAM_NV:
+    case GL_VERTEX_PROGRAM_ARB:	    
+        vp = (struct r700_vertex_program*)prog;
+        /* Release DMA region */
+
+        r600DeleteShader(ctx, vp->shaderbo);
+
+        /* Clean up */
+        Clean_Up_Assembler(&(vp->r700AsmCode));
+        Clean_Up_Shader(&(vp->r700Shader));
+	    break;
+    case GL_FRAGMENT_PROGRAM_NV:
+    case GL_FRAGMENT_PROGRAM_ARB:
+		fp = (struct r700_fragment_program*)prog;
+        /* Release DMA region */
+
+        r600DeleteShader(ctx, fp->shaderbo);
+
+        /* Clean up */
+        Clean_Up_Assembler(&(fp->r700AsmCode));
+        Clean_Up_Shader(&(fp->r700Shader));
+	    break;
+    default:
+	    _mesa_problem(ctx, "Bad target in r700NewProgram");
+    }
+
+	_mesa_delete_program(ctx, prog);
+}
+
+static void
+r700ProgramStringNotify(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+
+}
+
+static GLboolean r700IsProgramNative(GLcontext * ctx, GLenum target, struct gl_program *prog)
+{
+
+	return GL_TRUE;
+}
+
+void r700InitShaderFuncs(struct dd_function_table *functions)
+{
+	functions->NewProgram = r700NewProgram;
+	functions->DeleteProgram = r700DeleteProgram;
+	functions->ProgramStringNotify = r700ProgramStringNotify;
+	functions->IsProgramNative = r700IsProgramNative;
+}
diff --git a/src/mesa/drivers/dri/r600/r700_oglprog.h b/src/mesa/drivers/dri/r600/r700_oglprog.h
new file mode 100644
index 00000000000..fe2e9d19749
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_oglprog.h
@@ -0,0 +1,34 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_OGLPROG_H_
+#define _R700_OGLPROG_H_
+#include "r600_context.h"
+
+extern void r700InitShaderFuncs(struct dd_function_table *functions);
+
+#endif /*_R700_OGLPROG_H_*/
diff --git a/src/mesa/drivers/dri/r600/r700_render.c b/src/mesa/drivers/dri/r600/r700_render.c
new file mode 100644
index 00000000000..f1e467a317f
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_render.c
@@ -0,0 +1,458 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ *   CooperYuan <[email protected]>, <[email protected]>
+ */
+
+#include "main/glheader.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/context.h"
+#include "main/dd.h"
+#include "main/simple_list.h"
+#include "main/api_arrayelt.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "vbo/vbo.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_vp_build.h"
+#include "tnl/t_context.h"
+#include "tnl/t_vertex.h"
+#include "tnl/t_pipeline.h"
+
+#include "radeon_mipmap_tree.h"
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r600_tex.h"
+
+#include "r700_vertprog.h"
+#include "r700_fragprog.h"
+#include "r700_state.h"
+
+void r700WaitForIdle(context_t *context);
+void r700WaitForIdleClean(context_t *context);
+void r700Start3D(context_t *context);
+GLboolean r700SendTextureState(context_t *context);
+unsigned int r700PrimitiveType(int prim);
+void r600UpdateTextureState(GLcontext * ctx);
+GLboolean r700SyncSurf(context_t *context,
+		       struct radeon_bo *pbo,
+		       uint32_t read_domain,
+		       uint32_t write_domain,
+		       uint32_t sync_type);
+
+void r700WaitForIdle(context_t *context)
+{
+    BATCH_LOCALS(&context->radeon);
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
+    R600_OUT_BATCH(mmWAIT_UNTIL - ASIC_CONFIG_BASE_INDEX);
+    R600_OUT_BATCH(WAIT_3D_IDLE_bit);
+
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+void r700WaitForIdleClean(context_t *context)
+{
+    BATCH_LOCALS(&context->radeon);
+    BEGIN_BATCH_NO_AUTOSTATE(5);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_EVENT_WRITE, 0));
+    R600_OUT_BATCH(CACHE_FLUSH_AND_INV_EVENT);
+
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
+    R600_OUT_BATCH(mmWAIT_UNTIL - ASIC_CONFIG_BASE_INDEX);
+    R600_OUT_BATCH(WAIT_3D_IDLE_bit | WAIT_3D_IDLECLEAN_bit);
+
+    END_BATCH();
+    COMMIT_BATCH();
+}
+
+void r700Start3D(context_t *context)
+{
+    BATCH_LOCALS(&context->radeon);
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+    {
+        BEGIN_BATCH_NO_AUTOSTATE(2);
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_START_3D_CMDBUF, 0));
+        R600_OUT_BATCH(0);
+        END_BATCH();
+    }
+
+    BEGIN_BATCH_NO_AUTOSTATE(3);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_CONTEXT_CONTROL, 1));
+    R600_OUT_BATCH(0x80000000);
+    R600_OUT_BATCH(0x80000000);
+    END_BATCH();
+
+    COMMIT_BATCH();
+
+    r700WaitForIdleClean(context);
+}
+
+static GLboolean r700SetupShaders(GLcontext * ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    GLuint exportCount;
+
+    r700->ps.SQ_PGM_RESOURCES_PS.u32All = 0;
+    r700->vs.SQ_PGM_RESOURCES_VS.u32All = 0;
+
+    SETbit(r700->ps.SQ_PGM_RESOURCES_PS.u32All, PGM_RESOURCES__PRIME_CACHE_ON_DRAW_bit);
+    SETbit(r700->vs.SQ_PGM_RESOURCES_VS.u32All, PGM_RESOURCES__PRIME_CACHE_ON_DRAW_bit);
+
+    r700SetupVertexProgram(ctx);
+
+    r700SetupFragmentProgram(ctx);
+
+    exportCount = (r700->ps.SQ_PGM_EXPORTS_PS.u32All & EXPORT_MODE_mask) / (1 << EXPORT_MODE_shift);
+    r700->CB_SHADER_CONTROL.u32All = (1 << exportCount) - 1;
+
+    return GL_TRUE;
+}
+
+GLboolean r700SendTextureState(context_t *context)
+{
+    unsigned int i;
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    offset_modifiers offset_mod = {NO_SHIFT, 0, 0xFFFFFFFF};
+    struct radeon_bo *bo = NULL;
+    BATCH_LOCALS(&context->radeon);
+
+    for (i=0; i<R700_TEXTURE_NUMBERUNITS; i++) {
+	    radeonTexObj *t = r700->textures[i];
+	    if (t) {
+		    if (!t->image_override)
+			    bo = t->mt->bo;
+		    else
+			    bo = t->bo;
+		    if (bo) {
+
+			    r700SyncSurf(context, bo,
+					 RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM,
+					 0, TC_ACTION_ENA_bit);
+
+			    BEGIN_BATCH_NO_AUTOSTATE(9);
+			    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_RESOURCE, 7));
+			    R600_OUT_BATCH(i * 7);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE0);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE1);
+			    R600_OUT_BATCH_RELOC(r700->textures[i]->SQ_TEX_RESOURCE2,
+						 bo,
+						 0,
+						 RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0, &offset_mod);
+			    R600_OUT_BATCH_RELOC(r700->textures[i]->SQ_TEX_RESOURCE3,
+						 bo,
+						 0,
+						 RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0, &offset_mod);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE4);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE5);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_RESOURCE6);
+			    END_BATCH();
+
+			    BEGIN_BATCH_NO_AUTOSTATE(5);
+			    R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_SAMPLER, 3));
+			    R600_OUT_BATCH(i * 3);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_SAMPLER0);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_SAMPLER1);
+			    R600_OUT_BATCH(r700->textures[i]->SQ_TEX_SAMPLER2);
+			    END_BATCH();
+			    COMMIT_BATCH();
+		    }
+	    }
+    }
+    return GL_TRUE;
+}
+
+GLboolean r700SyncSurf(context_t *context,
+		       struct radeon_bo *pbo,
+		       uint32_t read_domain,
+		       uint32_t write_domain,
+		       uint32_t sync_type)
+{
+    BATCH_LOCALS(&context->radeon);
+    uint32_t cp_coher_size;
+    offset_modifiers offset_mod;
+
+    if (pbo->size == 0xffffffff)
+	    cp_coher_size = 0xffffffff;
+    else
+	    cp_coher_size = ((pbo->size + 255) >> 8);
+
+    offset_mod.shift     = NO_SHIFT;
+    offset_mod.shiftbits = 0;
+    offset_mod.mask      = 0xFFFFFFFF;
+
+    BEGIN_BATCH_NO_AUTOSTATE(5);
+    R600_OUT_BATCH(CP_PACKET3(R600_IT_SURFACE_SYNC, 3));
+    R600_OUT_BATCH(sync_type);
+    R600_OUT_BATCH(cp_coher_size);
+    R600_OUT_BATCH_RELOC(0,
+			 pbo,
+			 0,
+			 read_domain, write_domain, 0, &offset_mod); // ???
+    R600_OUT_BATCH(10);
+
+    END_BATCH();
+    COMMIT_BATCH();
+
+    return GL_TRUE;
+}
+
+unsigned int r700PrimitiveType(int prim)
+{
+    switch (prim & PRIM_MODE_MASK)
+    {
+    case GL_POINTS:
+        return DI_PT_POINTLIST;
+        break;
+    case GL_LINES:
+        return DI_PT_LINELIST;
+        break;
+    case GL_LINE_STRIP:
+        return DI_PT_LINESTRIP;
+        break;
+    case GL_LINE_LOOP:
+        return DI_PT_LINELOOP;
+        break;
+    case GL_TRIANGLES:
+        return DI_PT_TRILIST;
+        break;
+    case GL_TRIANGLE_STRIP:
+        return DI_PT_TRISTRIP;
+        break;
+    case GL_TRIANGLE_FAN:
+        return DI_PT_TRIFAN;
+        break;
+    case GL_QUADS:
+        return DI_PT_QUADLIST;
+        break;
+    case GL_QUAD_STRIP:
+        return DI_PT_QUADSTRIP;
+        break;
+    case GL_POLYGON:
+        return DI_PT_POLYGON;
+        break;
+    default:
+        assert(0);
+        return -1;
+        break;
+    }
+}
+
+static GLboolean r700RunRender(GLcontext * ctx,
+			                   struct tnl_pipeline_stage *stage)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    int lastIndex = 0;
+#if 1
+    BATCH_LOCALS(&context->radeon);
+
+    unsigned int i, j;
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+    struct vertex_buffer *vb = &tnl->vb;
+
+    struct r700_fragment_program *fp = (struct r700_fragment_program *)
+	                                   (ctx->FragmentProgram._Current);
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+    {
+        fp->r700AsmCode.bR6xx = 1;
+    }
+
+    r700Start3D(context); /* TODO : this is too much. */
+
+    r700SendSQConfig(context);
+
+    r700UpdateShaders(ctx);
+
+    r700SetScissor(context);
+    r700SetRenderTarget(context, 0);
+    r700SetDepthTarget(context);
+
+    if(r700SetupStreams(ctx))
+    {
+        return GL_TRUE;
+    }
+
+    r600UpdateTextureState(ctx);
+    r700SendTextureState(context);
+
+    if(GL_FALSE == fp->translated)
+    {
+        if( GL_FALSE == r700TranslateFragmentShader(fp, &(fp->mesa_program)) )
+        {
+            return GL_TRUE;
+        }
+    }
+
+    r700SetupShaders(ctx);
+
+    r700SendFSState(context); // FIXME just a place holder for now
+    r700SendPSState(context);
+    r700SendVSState(context);
+
+    r700SendContextStates(context);
+    r700SendViewportState(context, 0);
+    r700SendRenderTargetState(context, 0);
+    r700SendDepthTargetState(context);
+
+    /* richard test code */
+    for (i = 0; i < vb->PrimitiveCount; i++) 
+    {
+        GLuint prim = _tnl_translate_prim(&vb->Primitive[i]);
+        GLuint start = vb->Primitive[i].start;
+        GLuint end = vb->Primitive[i].start + vb->Primitive[i].count;
+        GLuint numIndices = vb->Primitive[i].count;
+        GLuint numEntires;
+
+        unsigned int VGT_DRAW_INITIATOR = 0;
+        unsigned int VGT_INDEX_TYPE     = 0;
+        unsigned int VGT_PRIMITIVE_TYPE = 0;
+        unsigned int VGT_NUM_INDICES    = 0;
+        
+        numEntires = 2 /* VGT_INDEX_TYPE */
+                     + 3 /* VGT_PRIMITIVE_TYPE */
+                     + numIndices + 3; /* DRAW_INDEX_IMMD */                  
+                     
+        BEGIN_BATCH_NO_AUTOSTATE(numEntires);  
+
+        VGT_INDEX_TYPE |= DI_INDEX_SIZE_32_BIT << INDEX_TYPE_shift;
+
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_INDEX_TYPE, 0));
+        R600_OUT_BATCH(VGT_INDEX_TYPE);
+
+        VGT_NUM_INDICES = numIndices;
+
+        VGT_PRIMITIVE_TYPE |= r700PrimitiveType(prim) << VGT_PRIMITIVE_TYPE__PRIM_TYPE_shift;
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_CONFIG_REG, 1));
+        R600_OUT_BATCH(mmVGT_PRIMITIVE_TYPE - ASIC_CONFIG_BASE_INDEX);
+        R600_OUT_BATCH(VGT_PRIMITIVE_TYPE);
+
+        VGT_DRAW_INITIATOR |= DI_SRC_SEL_IMMEDIATE << SOURCE_SELECT_shift;
+        VGT_DRAW_INITIATOR |= DI_MAJOR_MODE_0 << MAJOR_MODE_shift;
+
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_DRAW_INDEX_IMMD, (numIndices + 1)));
+        R600_OUT_BATCH(VGT_NUM_INDICES);
+        R600_OUT_BATCH(VGT_DRAW_INITIATOR);
+
+        for (j = lastIndex; j < lastIndex + numIndices; j++)
+        {
+            R600_OUT_BATCH(j);
+        }
+        lastIndex += numIndices;
+
+        END_BATCH();
+        COMMIT_BATCH();
+    }
+
+    /* Flush render op cached for last several quads. */
+    r700WaitForIdleClean(context);
+
+    radeonReleaseArrays(ctx, 0);
+
+#endif //0
+    rcommonFlushCmdBuf( &context->radeon, __FUNCTION__ );
+
+    return GL_FALSE;
+}
+
+static GLboolean r700RunNonTCLRender(GLcontext * ctx,
+				     struct tnl_pipeline_stage *stage) /* -------------------- */
+{
+	GLboolean bRet = GL_TRUE;
+	
+	return bRet;
+}
+
+static GLboolean r700RunTCLRender(GLcontext * ctx,  /*----------------------*/
+				  struct tnl_pipeline_stage *stage)
+{
+	GLboolean bRet = GL_FALSE;
+
+    /* TODO : sw fallback */
+
+    /**
+    * Ensure all enabled and complete textures are uploaded along with any buffers being used.
+    */
+    if(!r600ValidateBuffers(ctx))
+    {
+        return GL_TRUE;
+    }
+
+    context_t *context = R700_CONTEXT(ctx);
+
+    r700UpdateShaders(ctx);
+
+    bRet = r700RunRender(ctx, stage);
+
+    return bRet;
+	//GL_FALSE will stop to do other pipe stage in _tnl_run_pipeline
+    //The render here DOES finish the whole pipe, so GL_FALSE should be returned for success.
+}
+
+const struct tnl_pipeline_stage _r700_render_stage = {
+	"r700 Hardware Rasterization",
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r700RunNonTCLRender
+};
+
+const struct tnl_pipeline_stage _r700_tcl_stage = {
+	"r700 Hardware Transform, Clipping and Lighting",
+	NULL,
+	NULL,
+	NULL,
+	NULL,
+	r700RunTCLRender
+};
+
+const struct tnl_pipeline_stage *r700_pipeline[] = 
+{
+    &_r700_tcl_stage,
+    &_tnl_vertex_transform_stage,
+	&_tnl_normal_transform_stage,
+	&_tnl_lighting_stage,
+	&_tnl_fog_coordinate_stage,
+	&_tnl_texgen_stage,
+	&_tnl_texture_transform_stage,
+	&_tnl_vertex_program_stage,
+
+    &_r700_render_stage,
+    &_tnl_render_stage,
+    0,
+};
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_shader.c b/src/mesa/drivers/dri/r600/r700_shader.c
new file mode 100644
index 00000000000..b4fd51c1370
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shader.c
@@ -0,0 +1,527 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/imports.h"
+
+#include "main/glheader.h"
+
+#include "r600_context.h"
+#include "r700_debug.h"
+
+#include "r700_shader.h"
+
+void r700ShaderInit(GLcontext * ctx)
+{
+}
+
+void AddInstToList(TypedShaderList * plstCFInstructions, R700ShaderInstruction * pInst)
+{
+	if(NULL == plstCFInstructions->pTail)
+	{	//first
+		plstCFInstructions->pHead = pInst;
+		plstCFInstructions->pTail = pInst;
+	}
+	else
+	{
+		plstCFInstructions->pTail->pNextInst = pInst;
+		plstCFInstructions->pTail = pInst;
+	}
+	pInst->pNextInst = NULL;
+
+	plstCFInstructions->uNumOfNode++;
+}
+
+void Init_R700_Shader(R700_Shader * pShader)
+{
+	pShader->Type = R700_SHADER_INVALID;
+	pShader->pProgram = NULL;
+	pShader->bBinaryShader = GL_FALSE;
+	pShader->bFetchShaderRequired = GL_FALSE;
+	pShader->bNeedsAssembly = GL_FALSE;
+	pShader->bLinksDirty = GL_TRUE;
+	pShader->uShaderBinaryDWORDSize = 0;
+	pShader->nRegs = 0;
+	pShader->nParamExports = 0;
+	pShader->nMemExports = 0;
+	pShader->resource = 0;
+
+	pShader->exportMode = 0;
+	pShader->depthIsImported = GL_FALSE;
+
+	pShader->positionVectorIsExported = GL_FALSE;
+	pShader->miscVectorIsExported = GL_FALSE;
+	pShader->renderTargetArrayIndexIsExported = GL_FALSE;
+	pShader->ccDist0VectorIsExported = GL_FALSE;
+	pShader->ccDist1VectorIsExported = GL_FALSE; 
+
+
+	pShader->depthIsExported = GL_FALSE;
+	pShader->stencilRefIsExported = GL_FALSE;
+	pShader->coverageToMaskIsExported = GL_FALSE;
+	pShader->maskIsExported = GL_FALSE;
+	pShader->killIsUsed = GL_FALSE;
+
+	pShader->uCFOffset = 0;
+	pShader->uStackSize = 0;
+	pShader->uMaxCallDepth = 0;
+
+	pShader->bSurfAllocated = GL_FALSE;
+	
+	pShader->lstCFInstructions.pHead=NULL;  
+	pShader->lstCFInstructions.pTail=NULL;  
+	pShader->lstCFInstructions.uNumOfNode=0;
+	pShader->lstALUInstructions.pHead=NULL; 
+	pShader->lstALUInstructions.pTail=NULL; 
+	pShader->lstALUInstructions.uNumOfNode=0;
+	pShader->lstTEXInstructions.pHead=NULL; 
+	pShader->lstTEXInstructions.pTail=NULL; 
+	pShader->lstTEXInstructions.uNumOfNode=0;
+	pShader->lstVTXInstructions.pHead=NULL; 
+	pShader->lstVTXInstructions.pTail=NULL; 
+	pShader->lstVTXInstructions.uNumOfNode=0;
+}
+
+void AddCFInstruction(R700_Shader *pShader, R700ControlFlowInstruction *pCFInst)
+{
+    R700ControlFlowSXClause*  pSXClause; 
+    R700ControlFlowSMXClause* pSMXClause;
+
+    pCFInst->m_uIndex = pShader->lstCFInstructions.uNumOfNode;
+    AddInstToList(&(pShader->lstCFInstructions), 
+                  (R700ShaderInstruction*)pCFInst);
+    pShader->uShaderBinaryDWORDSize += GetInstructionSize(pCFInst->m_ShaderInstType);
+
+    pSXClause = NULL;
+    pSMXClause = NULL; 
+	switch (pCFInst->m_ShaderInstType)
+	{
+	case SIT_CF_ALL_EXP_SX:
+		pSXClause =  (R700ControlFlowSXClause*)pCFInst;
+		break;
+	case SIT_CF_ALL_EXP_SMX:
+		pSMXClause = (R700ControlFlowSMXClause*)pCFInst;
+		break;
+	default:
+		break;
+	};
+
+    if((pSXClause != NULL) && (pSXClause->m_Word0.f.type == SQ_EXPORT_PARAM))
+    {
+        pShader->nParamExports += pSXClause->m_Word1.f.burst_count + 1;
+    }
+    else if ((pSMXClause != NULL) && (pSMXClause->m_Word1.f.cf_inst == SQ_CF_INST_MEM_RING) &&
+            (pSMXClause->m_Word0.f.type == SQ_EXPORT_WRITE || pSMXClause->m_Word0.f.type == SQ_EXPORT_WRITE_IND))
+    {
+        pShader->nMemExports += pSMXClause->m_Word1.f.burst_count + 1;
+    }
+
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+
+    pCFInst->useCount++;
+}
+
+void AddVTXInstruction(R700_Shader *pShader, R700VertexInstruction *pVTXInst)
+{
+    pVTXInst->m_uIndex = pShader->lstVTXInstructions.uNumOfNode;
+	AddInstToList(&(pShader->lstVTXInstructions), 
+                  (R700ShaderInstruction*)pVTXInst);
+	pShader->uShaderBinaryDWORDSize += GetInstructionSize(pVTXInst->m_ShaderInstType);
+
+	if(pVTXInst->m_ShaderInstType == SIT_VTX_GENERIC)
+	{
+		R700VertexGenericFetch* pVTXGenericClause = (R700VertexGenericFetch*)pVTXInst;	
+		pShader->nRegs = (pShader->nRegs < pVTXGenericClause->m_Word1_GPR.f.dst_gpr) ? pVTXGenericClause->m_Word1_GPR.f.dst_gpr : pShader->nRegs;
+	}
+
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+
+    pVTXInst->useCount++;
+}
+
+void AddTEXInstruction(R700_Shader *pShader, R700TextureInstruction *pTEXInst)
+{
+    pTEXInst->m_uIndex = pShader->lstTEXInstructions.uNumOfNode;
+	AddInstToList(&(pShader->lstTEXInstructions), 
+                  (R700ShaderInstruction*)pTEXInst);
+	pShader->uShaderBinaryDWORDSize += GetInstructionSize(pTEXInst->m_ShaderInstType);
+
+    pShader->nRegs = (pShader->nRegs < pTEXInst->m_Word1.f.dst_gpr) ? pTEXInst->m_Word1.f.dst_gpr : pShader->nRegs;
+
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+
+    pTEXInst->useCount++;
+}
+
+void AddALUInstruction(R700_Shader *pShader, R700ALUInstruction *pALUInst)
+{
+    pALUInst->m_uIndex = pShader->lstALUInstructions.uNumOfNode;
+    AddInstToList(&(pShader->lstALUInstructions), 
+                  (R700ShaderInstruction*)pALUInst);
+    pShader->uShaderBinaryDWORDSize += GetInstructionSize(pALUInst->m_ShaderInstType);
+
+    pShader->nRegs = (pShader->nRegs < pALUInst->m_Word1.f.dst_gpr) ? pALUInst->m_Word1.f.dst_gpr : pShader->nRegs;
+
+    pShader->bLinksDirty    = GL_TRUE;
+    pShader->bNeedsAssembly = GL_TRUE;
+
+    pALUInst->useCount++;
+}
+
+void ResolveLinks(R700_Shader *pShader)
+{
+    GLuint uiSize;
+    R700ShaderInstruction  *pInst;
+    R700ALUInstruction     *pALUinst;
+    R700TextureInstruction *pTEXinst;
+    R700VertexInstruction  *pVTXinst; 
+
+    GLuint vtxOffset;
+
+	GLuint cfOffset = 0x0;  
+
+    GLuint aluOffset = cfOffset + pShader->lstCFInstructions.uNumOfNode * GetInstructionSize(SIT_CF);
+
+    GLuint texOffset = aluOffset;  // + m_lstALUInstructions.size() * R700ALUInstruction::SIZE,
+
+    pInst = pShader->lstALUInstructions.pHead;
+    while(NULL != pInst)
+    {
+        texOffset += GetInstructionSize(pInst->m_ShaderInstType);
+
+        pInst = pInst->pNextInst;
+    };
+  
+    vtxOffset = texOffset + pShader->lstTEXInstructions.uNumOfNode * GetInstructionSize(SIT_TEX);
+
+    if ( ((pShader->lstTEXInstructions.uNumOfNode > 0) && (texOffset % 4 != 0)) || 
+         ((pShader->lstVTXInstructions.uNumOfNode > 0) && (vtxOffset % 4 != 0))    )
+    {
+        pALUinst = (R700ALUInstruction*) CALLOC_STRUCT(R700ALUInstruction);
+        Init_R700ALUInstruction(pALUinst);
+        AddALUInstruction(pShader, pALUinst);
+        texOffset += GetInstructionSize(SIT_ALU);
+        vtxOffset += GetInstructionSize(SIT_ALU);
+    }
+
+    pInst  = pShader->lstALUInstructions.pHead;
+    uiSize = 0;
+    while(NULL != pInst)
+    {
+        pALUinst = (R700ALUInstruction*)pInst;
+
+        if(pALUinst->m_pLinkedALUClause != NULL)
+        {
+            // This address is quad-word aligned
+            pALUinst->m_pLinkedALUClause->m_Word0.f.addr = (aluOffset + uiSize) >> 1;
+        }
+
+        uiSize += GetInstructionSize(pALUinst->m_ShaderInstType);
+
+        pInst = pInst->pNextInst;
+    };
+
+    pInst  = pShader->lstTEXInstructions.pHead;
+    uiSize = 0;
+    while(NULL != pInst)
+    {
+        pTEXinst = (R700TextureInstruction*)pInst;
+
+        if (pTEXinst->m_pLinkedGenericClause != NULL)
+        {
+            pTEXinst->m_pLinkedGenericClause->m_Word0.f.addr = (texOffset + uiSize) >> 1;
+        }
+
+        uiSize += GetInstructionSize(pTEXinst->m_ShaderInstType);
+
+        pInst = pInst->pNextInst;
+    };
+
+    pInst  = pShader->lstVTXInstructions.pHead;
+    uiSize = 0;
+    while(NULL != pInst)
+    {
+        pVTXinst = (R700VertexInstruction*)pInst;
+
+        if (pVTXinst->m_pLinkedGenericClause != NULL)
+        {
+            pVTXinst->m_pLinkedGenericClause->m_Word0.f.addr = (vtxOffset + uiSize) >> 1;
+        }
+
+        uiSize += GetInstructionSize(pVTXinst->m_ShaderInstType);
+
+        pInst = pInst->pNextInst;
+    };
+
+    pShader->bLinksDirty = GL_FALSE;
+}
+
+void Assemble(R700_Shader *pShader)
+{
+	GLuint i;
+    GLuint *pShaderBinary;
+    GLuint size_of_program;
+    GLuint *pCurrPos;
+
+    GLuint end_of_cf_instructions;
+    GLuint number_of_alu_dwords;
+
+    R700ShaderInstruction  *pInst;
+
+    if(GL_TRUE == pShader->bBinaryShader)
+    {
+        return;
+    }
+
+    if(pShader->bLinksDirty == GL_TRUE) 
+    {
+        ResolveLinks(pShader);
+    }
+
+    size_of_program = pShader->uShaderBinaryDWORDSize;
+    
+    pShaderBinary = (GLuint*) MALLOC(sizeof(GLuint)*size_of_program);
+ 
+    pCurrPos = pShaderBinary;
+
+    for (i = 0; i < size_of_program; i++)
+    {
+        pShaderBinary[i] = 0;
+    }
+
+    pInst = pShader->lstCFInstructions.pHead;
+    while(NULL != pInst)
+    {
+        switch (pInst->m_ShaderInstType)
+        {
+        case SIT_CF_GENERIC: 
+            {
+                R700ControlFlowGenericClause* pCFgeneric = (R700ControlFlowGenericClause*)pInst;
+                *pCurrPos++ = pCFgeneric->m_Word0.val;
+                *pCurrPos++ = pCFgeneric->m_Word1.val;
+            }
+            break;
+        case SIT_CF_ALU: 
+            {
+                R700ControlFlowALUClause* pCFalu = (R700ControlFlowALUClause*)pInst;
+                *pCurrPos++ = pCFalu->m_Word0.val;
+                *pCurrPos++ = pCFalu->m_Word1.val;
+            }
+            break;
+        case SIT_CF_ALL_EXP_SX: 
+            {
+                R700ControlFlowSXClause* pCFsx = (R700ControlFlowSXClause*)pInst;
+                *pCurrPos++ = pCFsx->m_Word0.val;
+                *pCurrPos++ = (pCFsx->m_Word1.val | pCFsx->m_Word1_SWIZ.val);
+            }
+            break;
+        case SIT_CF_ALL_EXP_SMX: 
+            {
+                R700ControlFlowSMXClause* pCFsmx = (R700ControlFlowSMXClause*)pInst;
+                *pCurrPos++ = pCFsmx->m_Word0.val;
+                *pCurrPos++ = (pCFsmx->m_Word1.val | pCFsmx->m_Word1_BUF.val);
+            }
+            break;
+        default:
+            break;
+        }
+
+        pInst = pInst->pNextInst;
+    };
+    
+    number_of_alu_dwords = 0;
+    pInst = pShader->lstALUInstructions.pHead;
+    while(NULL != pInst)
+    {
+        switch (pInst->m_ShaderInstType)
+        {
+        case SIT_ALU: 
+            {
+                R700ALUInstruction* pALU = (R700ALUInstruction*)pInst;
+
+                *pCurrPos++ = pALU->m_Word0.val;
+                *pCurrPos++ = (pALU->m_Word1.val | pALU->m_Word1_OP2.val | pALU->m_Word1_OP3.val);
+
+                number_of_alu_dwords += 2;
+            }
+            break;
+        case SIT_ALU_HALF_LIT: 
+            {
+                R700ALUInstructionHalfLiteral* pALUhalf = (R700ALUInstructionHalfLiteral*)pInst;
+
+                *pCurrPos++ = pALUhalf->m_Word0.val;
+                *pCurrPos++ = (pALUhalf->m_Word1.val | pALUhalf->m_Word1_OP2.val | pALUhalf->m_Word1_OP3.val);
+                *pCurrPos++ = *((GLuint*)&(pALUhalf->m_fLiteralX));
+                *pCurrPos++ = *((GLuint*)&(pALUhalf->m_fLiteralY));
+
+                number_of_alu_dwords += 4;
+            }
+            break;
+        case SIT_ALU_FALL_LIT: 
+            {
+                R700ALUInstructionFullLiteral* pALUfull = (R700ALUInstructionFullLiteral*)pInst;
+
+                *pCurrPos++ = pALUfull->m_Word0.val;
+                *pCurrPos++ = (pALUfull->m_Word1.val | pALUfull->m_Word1_OP2.val | pALUfull->m_Word1_OP3.val);
+
+                *pCurrPos++ = *((GLuint*)&(pALUfull->m_fLiteralX));
+                *pCurrPos++ = *((GLuint*)&(pALUfull->m_fLiteralY));
+                *pCurrPos++ = *((GLuint*)&(pALUfull->m_fLiteralZ));
+                *pCurrPos++ = *((GLuint*)&(pALUfull->m_fLiteralW));
+
+                number_of_alu_dwords += 6;
+            }
+            break;
+        default:
+            break;
+        }
+
+        pInst = pInst->pNextInst;
+    };
+    
+    pInst = pShader->lstTEXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        R700TextureInstruction* pTEX = (R700TextureInstruction*)pInst;
+
+        *pCurrPos++ = pTEX->m_Word0.val;
+        *pCurrPos++ = pTEX->m_Word1.val;
+        *pCurrPos++ = pTEX->m_Word2.val;
+        *pCurrPos++ = 0x0beadeaf;
+
+        pInst = pInst->pNextInst;
+    };
+    
+    pInst = pShader->lstVTXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        switch (pInst->m_ShaderInstType)
+        {
+        case SIT_VTX_SEM: //
+            {
+                R700VertexSemanticFetch* pVTXsem = (R700VertexSemanticFetch*)pInst;
+
+                *pCurrPos++ = pVTXsem->m_Word0.val;
+                *pCurrPos++ = (pVTXsem->m_Word1.val | pVTXsem->m_Word1_SEM.val);
+                *pCurrPos++ = pVTXsem->m_Word2.val;
+                *pCurrPos++ = 0x0beadeaf;
+            }
+            break;
+        case SIT_VTX_GENERIC: //
+            {
+                R700VertexGenericFetch* pVTXgeneric = (R700VertexGenericFetch*)pInst;
+
+                *pCurrPos++ = pVTXgeneric->m_Word0.val;
+                *pCurrPos++ = (pVTXgeneric->m_Word1.val | pVTXgeneric->m_Word1_GPR.val);
+                *pCurrPos++ = pVTXgeneric->m_Word2.val;
+                *pCurrPos++ = 0x0beadeaf;
+            }
+            break;
+        default:
+            break;
+        }
+
+        pInst = pInst->pNextInst;
+    };
+
+    if(NULL != pShader->pProgram)
+    {
+        FREE(pShader->pProgram);
+    }
+    pShader->pProgram = (GLubyte*)pShaderBinary;
+
+    end_of_cf_instructions = pShader->uCFOffset + pShader->lstCFInstructions.uNumOfNode * GetInstructionSize(SIT_CF);
+    
+    pShader->uEndOfCF = end_of_cf_instructions >> 1;
+
+    pShader->uEndOfALU = (end_of_cf_instructions + number_of_alu_dwords) >> 1;
+
+    pShader->uEndOfFetch = (pShader->uCFOffset + pShader->uShaderBinaryDWORDSize) >> 1;
+
+    pShader->bNeedsAssembly = GL_FALSE;
+}
+
+void LoadProgram(R700_Shader *pShader) //context
+{
+}
+
+void UpdateShaderRegisters(R700_Shader *pShader) //context
+{
+}
+
+void DeleteInstructions(R700_Shader *pShader)
+{
+}
+
+void DebugPrint(void)
+{
+}
+
+void Clean_Up_Shader(R700_Shader *pShader)
+{
+    FREE(pShader->pProgram);
+
+    R700ShaderInstruction  *pInst;
+    R700ShaderInstruction  *pInstToFree;
+
+    pInst = pShader->lstCFInstructions.pHead;
+    while(NULL != pInst)
+    {
+        pInstToFree = pInst;
+        pInst = pInst->pNextInst;
+        FREE(pInstToFree);
+    };
+    pInst = pShader->lstALUInstructions.pHead;
+    while(NULL != pInst)
+    {
+        pInstToFree = pInst;
+        pInst = pInst->pNextInst;
+        FREE(pInstToFree);
+    };
+    pInst = pShader->lstTEXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        pInstToFree = pInst;
+        pInst = pInst->pNextInst;
+        FREE(pInstToFree);
+    };
+    pInst = pShader->lstVTXInstructions.pHead;
+    while(NULL != pInst)
+    {
+        pInstToFree = pInst;
+        pInst = pInst->pNextInst;
+        FREE(pInstToFree);
+    };
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_shader.h b/src/mesa/drivers/dri/r600/r700_shader.h
new file mode 100644
index 00000000000..bfd01e1a93a
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shader.h
@@ -0,0 +1,150 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+
+#ifndef __R700_SHADER_H__
+#define __R700_SHADER_H__
+
+#include "main/mtypes.h"
+
+#include "r700_shaderinst.h"
+
+
+void r700ShaderInit(GLcontext * ctx);
+
+typedef enum R700ShaderType
+{
+    R700_SHADER_FS      = 0x0,
+    R700_SHADER_ES      = 0x1,
+    R700_SHADER_GS      = 0x2,
+    R700_SHADER_VS      = 0x3,
+    R700_SHADER_PS      = 0x4,
+    R700_SHADER_INVALID = 0x5,
+} R700ShaderType;
+
+typedef struct TypedShaderList 
+{
+	R700ShaderInstruction * pHead;
+	R700ShaderInstruction * pTail;
+	GLuint  uNumOfNode;
+} TypedShaderList;
+
+typedef struct RealRegister 
+{
+    GLuint uAddr;
+    GLuint uValue;
+} RealRegister;
+
+typedef struct InstDeps
+{
+    GLint nDstDep;
+    GLint nSrcDeps[3];
+} InstDeps;
+
+typedef struct R700_Shader 
+{
+	R700ShaderType   Type;
+
+    GLubyte*  pProgram;
+
+    GLboolean bBinaryShader;
+    GLboolean bFetchShaderRequired;
+    GLboolean bNeedsAssembly;
+    GLboolean bLinksDirty;
+
+    GLuint  uShaderBinaryDWORDSize; // in DWORDS
+    GLuint  nRegs;      
+    GLuint  nParamExports;   // VS_ EXPORT_COUNT (1 based, the actual register is 0 based!)
+    GLuint  nMemExports; 
+    GLuint  resource;     // VS and PS _RESOURCE
+    GLuint  exportMode;   // VS and PS _EXPORT_MODE
+
+    GLboolean  depthIsImported;             
+
+    // Vertex program exports
+    GLboolean  positionVectorIsExported;          
+
+    GLboolean  miscVectorIsExported;               
+    GLboolean  renderTargetArrayIndexIsExported;  
+
+    GLboolean  ccDist0VectorIsExported;  
+    GLboolean  ccDist1VectorIsExported;  
+
+    // Pixel program exports
+    GLboolean  depthIsExported;             
+    GLboolean  stencilRefIsExported;        
+    GLboolean  coverageToMaskIsExported;    
+    GLboolean  maskIsExported;              
+
+    GLboolean  killIsUsed;                  
+
+    GLuint  uStartAddr;
+    GLuint  uCFOffset;
+    GLuint  uEndOfCF;
+    GLuint  uEndOfALU;
+    GLuint  uEndOfFetch;
+    GLuint  uStackSize;
+    GLuint  uMaxCallDepth;
+
+	TypedShaderList lstCFInstructions;
+	TypedShaderList lstALUInstructions;
+	TypedShaderList lstTEXInstructions;
+	TypedShaderList lstVTXInstructions;
+
+    RealRegister RegStartAddr;
+    RealRegister RegCFOffset;
+    RealRegister RegEndCF;
+    RealRegister RegEndALU;
+    RealRegister egEndFetcg;
+
+	// -------- constants
+	GLfloat   ConstantArray[SQ_ALU_CONSTANT_PS_COUNT * 4];
+	
+	GLboolean bSurfAllocated;
+} R700_Shader;
+
+//Internal
+void AddInstToList(TypedShaderList * plstCFInstructions, R700ShaderInstruction * pInst);
+void ResolveLinks(R700_Shader *pShader);
+void Assemble(R700_Shader *pShader);
+
+
+//Interface
+void Init_R700_Shader(R700_Shader * pShader);
+void AddCFInstruction(R700_Shader *pShader, R700ControlFlowInstruction *pCFInst);
+void AddVTXInstruction(R700_Shader *pShader, R700VertexInstruction *pVTXInst);
+void AddTEXInstruction(R700_Shader *pShader, R700TextureInstruction *pTEXInst);
+void AddALUInstruction(R700_Shader *pShader, R700ALUInstruction *pALUInst);
+
+void LoadProgram(R700_Shader *pShader);
+void UpdateShaderRegisters(R700_Shader *pShader);
+void DeleteInstructions(R700_Shader *pShader);
+void DebugPrint(void);
+
+void Clean_Up_Shader(R700_Shader *pShader);
+
+#endif /*__R700_SHADER_H__*/
+
diff --git a/src/mesa/drivers/dri/r600/r700_shaderinst.c b/src/mesa/drivers/dri/r600/r700_shaderinst.c
new file mode 100644
index 00000000000..c1bffee91f0
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shaderinst.c
@@ -0,0 +1,224 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+
+#include "main/mtypes.h"
+
+#include "r700_debug.h"
+#include "r700_shaderinst.h"
+
+void Init_R700ControlFlowGenericClause(R700ControlFlowGenericClause* pInst)
+{
+    pInst->m_Word0.val = 0x00000000;
+    pInst->m_Word1.val = 0x00000000;
+
+    pInst->m_pLinkedVTXInstruction = 0;
+    pInst->m_pLinkedTEXInstruction = 0;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_CF_GENERIC;
+}
+
+void Init_R700ControlFlowALUClause(R700ControlFlowALUClause* pInst)
+{
+    pInst->m_Word0.val = 0x00000000;
+    pInst->m_Word1.val = 0x00000000;
+
+    pInst->m_pLinkedALUInstruction = 0;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_CF_ALU;
+}
+
+void Init_R700ControlFlowSXClause(R700ControlFlowSXClause* pInst)
+{
+    pInst->m_Word0.val      = 0x00000000;
+    pInst->m_Word1.val      = 0x00000000;
+    pInst->m_Word1_SWIZ.val = 0x00000000;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_CF_ALL_EXP_SX;
+}
+
+void Init_R700ControlFlowSMXClause(R700ControlFlowSMXClause* pInst)
+{
+    pInst->m_Word0.val     = 0x00000000;
+    pInst->m_Word1.val     = 0x00000000;
+    pInst->m_Word1_BUF.val = 0x00000000;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_CF_ALL_EXP_SMX;
+}
+
+void Init_R700ALUInstruction(R700ALUInstruction* pInst)
+{
+    pInst->m_Word0.val     = 0x00000000;
+    pInst->m_Word1.val     = 0x00000000;
+    pInst->m_Word1_OP2.val = 0x00000000;
+    pInst->m_Word1_OP3.val = 0x00000000;
+
+    pInst->m_pLinkedALUClause = 0;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_ALU;
+}
+
+void Init_R700ALUInstructionHalfLiteral(R700ALUInstructionHalfLiteral* pInst, GLfloat x, GLfloat y)
+{
+	pInst->m_Word0.val     = 0x00000000;
+    pInst->m_Word1.val     = 0x00000000;
+    pInst->m_Word1_OP2.val = 0x00000000;
+    pInst->m_Word1_OP3.val = 0x00000000;
+
+	pInst->m_pLinkedALUClause = 0;
+
+    pInst->m_fLiteralX = x;
+    pInst->m_fLiteralY = y;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_ALU_HALF_LIT;
+}
+
+void Init_R700ALUInstructionFullLiteral(R700ALUInstructionFullLiteral* pInst, GLfloat x, GLfloat y, GLfloat z, GLfloat w)
+{
+	pInst->m_Word0.val     = 0x00000000;
+    pInst->m_Word1.val     = 0x00000000;
+    pInst->m_Word1_OP2.val = 0x00000000;
+    pInst->m_Word1_OP3.val = 0x00000000;
+
+	pInst->m_pLinkedALUClause = 0;
+
+    pInst->m_fLiteralX = x;
+    pInst->m_fLiteralY = y;
+    pInst->m_fLiteralZ = z;
+    pInst->m_fLiteralW = w;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_ALU_FALL_LIT;
+}
+
+void Init_R700TextureInstruction(R700TextureInstruction* pInst)
+{
+    pInst->m_Word0.val     = 0x00000000;
+    pInst->m_Word1.val     = 0x00000000;
+    pInst->m_Word2.val     = 0x00000000;
+
+    pInst->m_pLinkedGenericClause = 0;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_TEX;
+}
+
+void Init_R700VertexSemanticFetch(R700VertexSemanticFetch* pInst)
+{
+    pInst->m_Word0.val     = 0x00000000;
+    pInst->m_Word1.val     = 0x00000000;
+    pInst->m_Word1_SEM.val = 0x00000000;
+    pInst->m_Word2.val     = 0x00000000;
+
+    pInst->m_pLinkedGenericClause = 0;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_VTX_SEM;
+}
+
+void Init_R700VertexGenericFetch(R700VertexGenericFetch* pInst)
+{
+    pInst->m_Word0.val     = 0x00000000;
+    pInst->m_Word1.val     = 0x00000000;
+    pInst->m_Word1_GPR.val = 0x00000000;
+    pInst->m_Word2.val     = 0x00000000;
+
+    pInst->m_pLinkedGenericClause = 0;
+
+    pInst->useCount = 0;
+
+	pInst->m_ShaderInstType = SIT_VTX_GENERIC;
+}
+
+unsigned int GetInstructionSize(ShaderInstType instType)
+{
+    switch(instType)
+    {
+    case SIT_ALU_HALF_LIT:  
+    case SIT_TEX:           
+    case SIT_VTX:           
+    case SIT_VTX_GENERIC:   
+    case SIT_VTX_SEM:       
+        return 4;
+    case SIT_ALU_FALL_LIT:
+        return 6;
+    default:
+        break;
+    }
+
+    return 2;
+}
+
+unsigned int GetCFMaxInstructions(ShaderInstType instType)
+{
+    switch (instType)
+    {
+    case SIT_CF_ALL_EXP:    
+    case SIT_CF_ALL_EXP_SX: 
+    case SIT_CF_ALL_EXP_SMX:  
+        return 0x10;
+    case SIT_CF_GENERIC:
+        return 0x8;  //For tex and vtx
+    case SIT_CF_ALU:
+        return 0x80;
+    default:
+        break;
+    }
+    return 0x10;
+}
+
+GLboolean LinkVertexInstruction(R700ControlFlowGenericClause *pCFGeneric,
+								R700VertexInstruction *pVTXInstruction)
+{
+	if (pCFGeneric->m_pLinkedTEXInstruction != 0) 
+	{
+		r700_error(ERROR_ASM_VTX_CLAUSE, "This instruction is already linked to a texture instruction");
+		return GL_FALSE;
+    }
+
+    pCFGeneric->m_pLinkedVTXInstruction     = pVTXInstruction;
+	pVTXInstruction->m_pLinkedGenericClause = pCFGeneric;
+
+	return GL_TRUE;
+}
+
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_shaderinst.h b/src/mesa/drivers/dri/r600/r700_shaderinst.h
new file mode 100644
index 00000000000..2829cca0a3c
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_shaderinst.h
@@ -0,0 +1,321 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+
+#ifndef _R700_SHADERINST_H_
+#define _R700_SHADERINST_H_
+
+#include "main/glheader.h"
+
+#include "defaultendian.h" 
+#include "sq_micro_reg.h"
+
+#define SQ_ALU_CONSTANT_PS_OFFSET      0x00000000
+#define SQ_ALU_CONSTANT_PS_COUNT       0x00000100
+#define SQ_ALU_CONSTANT_VS_OFFSET      0x00000100
+#define SQ_ALU_CONSTANT_VS_COUNT       0x00000100
+#define SQ_FETCH_RESOURCE_PS_OFFSET    0x00000000
+#define SQ_FETCH_RESOURCE_PS_COUNT     0x000000a0
+#define SQ_FETCH_RESOURCE_VS_OFFSET    0x000000a0
+#define SQ_FETCH_RESOURCE_VS_COUNT     0x000000b0
+
+#define SHADERINST_TYPEMASK_CF  0x10
+#define SHADERINST_TYPEMASK_ALU 0x20
+#define SHADERINST_TYPEMASK_TEX 0x40
+#define SHADERINST_TYPEMASK_VTX 0x80
+
+typedef enum ShaderInstType 
+{
+    SIT_CF = 0x10,            /*SIZE = 0x2*/
+        SIT_CF_ALL_EXP = 0x14,    /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x10;*/
+            SIT_CF_ALL_EXP_SX = 0x15, /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x10;*/
+            SIT_CF_ALL_EXP_SMX= 0x16, /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x10;*/
+        SIT_CF_GENERIC = 0x18,    /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x8;  //For tex and vtx*/
+        SIT_CF_ALU = 0x19,        /*SIZE = 0x2, MAX_INSTRUCTIONS = 0x80;*/
+    SIT_ALU = 0x20,           /*SIZE = 0x2,*/
+        SIT_ALU_HALF_LIT = 0x21,  /*SIZE = 0x4,*/
+        SIT_ALU_FALL_LIT = 0x22,  /*SIZE = 0x6,*/
+    SIT_TEX = 0x40,           /*SIZE = 0x4,*/
+    SIT_VTX = 0x80,           /*SIZE = 0x4, MEGA_FETCH_BYTES = 0x20*/
+        SIT_VTX_GENERIC = 0x81,   /*SIZE = 0x4, MEGA_FETCH_BYTES = 0x20*/
+        SIT_VTX_SEM = 0x82       /*SIZE = 0x4, MEGA_FETCH_BYTES = 0x20*/
+} ShaderInstType;
+
+typedef struct R700ShaderInstruction 
+{
+    ShaderInstType m_ShaderInstType;
+    struct R700ShaderInstruction *pNextInst;
+    GLuint m_uIndex;
+    GLuint useCount;
+} R700ShaderInstruction;
+
+// ------------------ CF insts ---------------------------
+
+typedef R700ShaderInstruction R700ControlFlowInstruction;
+
+typedef struct R700ControlFlowAllocExportClause  
+{
+    ShaderInstType          m_ShaderInstType;
+    R700ShaderInstruction * pNextInst;    
+    GLuint m_uIndex;
+    GLuint useCount;
+		
+    sq_cf_alloc_export_word0_u      m_Word0;
+    sq_cf_alloc_export_word1_u      m_Word1;
+} R700ControlFlowAllocExportClause;
+
+typedef struct R700ControlFlowSXClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ControlFlowAllocExportClause
+		//R700ControlFlowInstruction 
+			//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+			//---------------------
+		//---------------------------
+    sq_cf_alloc_export_word0_u      m_Word0;
+    sq_cf_alloc_export_word1_u      m_Word1;
+	//-------------------------------------
+
+    sq_cf_alloc_export_word1_swiz_u m_Word1_SWIZ;
+} R700ControlFlowSXClause;
+
+typedef struct R700ControlFlowSMXClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+    //R700ControlFlowAllocExportClause
+		//R700ControlFlowInstruction 
+			//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+			//---------------------
+		//---------------------------
+    sq_cf_alloc_export_word0_u      m_Word0;
+    sq_cf_alloc_export_word1_u      m_Word1;
+	//-------------------------------
+
+    sq_cf_alloc_export_word1_buf_u m_Word1_BUF;
+} R700ControlFlowSMXClause;
+
+typedef struct R700ControlFlowGenericClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ControlFlowInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	//---------------------
+
+    sq_cf_word0_u m_Word0;
+    sq_cf_word1_u m_Word1;
+
+    struct R700VertexInstruction  *m_pLinkedVTXInstruction;
+    struct R700TextureInstruction *m_pLinkedTEXInstruction;
+} R700ControlFlowGenericClause;
+
+typedef struct R700ControlFlowALUClause 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+    //R700ControlFlowInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	//---------------------
+
+    sq_cf_alu_word0_u m_Word0;
+    sq_cf_alu_word1_u m_Word1;
+    
+    struct R700ALUInstruction *m_pLinkedALUInstruction;
+} R700ControlFlowALUClause;
+
+// ------------------- End of CF Inst ------------------------
+
+// ------------------- ALU Inst ------------------------------
+typedef struct R700ALUInstruction 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+	//---------------------
+
+    sq_alu_word0_u     m_Word0;
+    sq_alu_word1_u     m_Word1;
+    sq_alu_word1_op2_v2_u m_Word1_OP2;
+    sq_alu_word1_op3_u m_Word1_OP3;
+
+    struct R700ControlFlowALUClause *m_pLinkedALUClause;
+} R700ALUInstruction;
+
+typedef struct R700ALUInstructionHalfLiteral
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ALUInstruction 
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+
+    sq_alu_word0_u     m_Word0;
+    sq_alu_word1_u     m_Word1;
+    sq_alu_word1_op2_v2_u m_Word1_OP2;
+    sq_alu_word1_op3_u m_Word1_OP3;
+
+    struct R700ControlFlowALUClause *m_pLinkedALUClause;
+	//-------------------
+
+    GLfloat m_fLiteralX,
+            m_fLiteralY;
+} R700ALUInstructionHalfLiteral;
+
+typedef struct R700ALUInstructionFullLiteral 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ALUInstruction 
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+
+    sq_alu_word0_u     m_Word0;
+    sq_alu_word1_u     m_Word1;
+    sq_alu_word1_op2_v2_u m_Word1_OP2;
+    sq_alu_word1_op3_u m_Word1_OP3;
+
+    struct R700ControlFlowALUClause *m_pLinkedALUClause;
+	//-------------------
+
+    GLfloat m_fLiteralX,
+            m_fLiteralY,
+            m_fLiteralZ,
+            m_fLiteralW;
+} R700ALUInstructionFullLiteral;
+// ------------------- End of ALU Inst -----------------------
+
+// ------------------- Textuer/Vertex Instruction --------------------
+
+typedef struct R700TextureInstruction 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+	//---------------------
+	
+    sq_tex_word0_u m_Word0;
+    sq_tex_word1_u m_Word1;
+    sq_tex_word2_u m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+} R700TextureInstruction;
+
+typedef struct R700VertexInstruction 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+	//---------------------
+	
+    sq_vtx_word0_u     m_Word0;
+    sq_vtx_word1_u     m_Word1;
+    sq_vtx_word2_u     m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+} R700VertexInstruction;
+//
+typedef struct R700VertexSemanticFetch 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700VertexInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	
+    sq_vtx_word0_u     m_Word0;
+    sq_vtx_word1_u     m_Word1;
+    sq_vtx_word2_u     m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+	//---------------------------
+
+    sq_vtx_word1_sem_u m_Word1_SEM;
+} R700VertexSemanticFetch;
+//
+typedef struct R700VertexGenericFetch 
+{
+	ShaderInstType          m_ShaderInstType;
+	R700ShaderInstruction * pNextInst;
+	//R700VertexInstruction
+		//R700ShaderInstruction
+	GLuint m_uIndex;
+    GLuint useCount;
+		//---------------------
+	
+    sq_vtx_word0_u     m_Word0;
+    sq_vtx_word1_u     m_Word1;
+    sq_vtx_word2_u     m_Word2;
+
+    struct R700ControlFlowGenericClause *m_pLinkedGenericClause;
+	//---------------------------
+
+    sq_vtx_word1_gpr_u m_Word1_GPR;
+} R700VertexGenericFetch;
+
+// ------------------- End of Texture Vertex Instruction --------------------
+
+void Init_R700ControlFlowGenericClause(R700ControlFlowGenericClause* pInst);
+void Init_R700ControlFlowALUClause(R700ControlFlowALUClause* pInst);
+void Init_R700ControlFlowSXClause(R700ControlFlowSXClause* pInst);
+void Init_R700ControlFlowSMXClause(R700ControlFlowSMXClause* pInst);
+void Init_R700ALUInstruction(R700ALUInstruction* pInst);
+void Init_R700ALUInstructionHalfLiteral(R700ALUInstructionHalfLiteral* pInst, GLfloat x, GLfloat y);
+void Init_R700ALUInstructionFullLiteral(R700ALUInstructionFullLiteral* pInst, GLfloat x, GLfloat y, GLfloat z, GLfloat w);
+void Init_R700TextureInstruction(R700TextureInstruction* pInst);
+void Init_R700VertexSemanticFetch(R700VertexSemanticFetch* pInst);
+void Init_R700VertexGenericFetch(R700VertexGenericFetch* pInst);
+
+unsigned int GetInstructionSize(ShaderInstType instType);
+unsigned int GetCFMaxInstructions(ShaderInstType instType);
+
+GLboolean LinkVertexInstruction(R700ControlFlowGenericClause *pCFGeneric,
+								R700VertexInstruction *pVTXInstruction);
+
+#endif //_R700_SHADERINST_H_
diff --git a/src/mesa/drivers/dri/r600/r700_state.c b/src/mesa/drivers/dri/r600/r700_state.c
new file mode 100644
index 00000000000..eda224def33
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_state.c
@@ -0,0 +1,1260 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#include "main/glheader.h"
+#include "main/mtypes.h"
+#include "main/state.h"
+#include "main/imports.h"
+#include "main/enums.h"
+#include "main/macros.h"
+#include "main/dd.h"
+#include "main/simple_list.h"
+
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "tnl/t_vp_build.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "main/api_arrayelt.h"
+#include "main/state.h"
+#include "main/framebuffer.h"
+
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+#include "vbo/vbo.h"
+#include "main/texformat.h"
+
+#include "r600_context.h"
+
+#include "r700_state.h"
+
+#include "r700_fragprog.h"
+#include "r700_vertprog.h"
+
+
+void r700SetDefaultStates(context_t *context) //--------------------
+{
+    
+}
+
+void r700UpdateShaders (GLcontext * ctx)  //----------------------------------
+{
+    context_t *context = R700_CONTEXT(ctx);
+
+    GLvector4f dummy_attrib[_TNL_ATTRIB_MAX];
+    GLvector4f *temp_attrib[_TNL_ATTRIB_MAX];
+
+    struct r700_vertex_program *vp;
+	int i;
+
+    if (context->radeon.NewGLState) 
+    {
+        context->radeon.NewGLState = 0;
+
+        for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) 
+        {
+            /* mat states from state var not array for sw */
+            dummy_attrib[i].stride = 0;
+
+            temp_attrib[i] = TNL_CONTEXT(ctx)->vb.AttribPtr[i];
+            TNL_CONTEXT(ctx)->vb.AttribPtr[i] = &(dummy_attrib[i]);
+        }
+
+        _tnl_UpdateFixedFunctionProgram(ctx);
+
+        for (i = _TNL_FIRST_MAT; i <= _TNL_LAST_MAT; i++) 
+        {
+            TNL_CONTEXT(ctx)->vb.AttribPtr[i] = temp_attrib[i];
+        }
+
+        r700SelectVertexShader(ctx);
+        vp = (struct r700_vertex_program *)ctx->VertexProgram._Current;
+
+        if (vp->translated == GL_FALSE) 
+        {
+            // TODO
+            //fprintf(stderr, "Failing back to sw-tcl\n");
+            //hw_tcl_on = future_hw_tcl_on = 0;
+            //r300ResetHwState(rmesa);
+            //
+            r700UpdateStateParameters(ctx, _NEW_PROGRAM);
+            return;
+        }
+    }
+
+    r700UpdateStateParameters(ctx, _NEW_PROGRAM);
+}
+
+/*
+ * To correctly position primitives:
+ */
+void r700UpdateViewportOffset(GLcontext * ctx) //------------------
+{
+
+	//radeonUpdateScissor(ctx);
+
+    return;
+}
+
+/**
+ * Tell the card where to render (offset, pitch).
+ * Effected by glDrawBuffer, etc
+ */
+void r700UpdateDrawBuffer(GLcontext * ctx) /* TODO */ //---------------------
+{
+#if 0 /* to be enabled */
+    context_t *context = R700_CONTEXT(ctx);
+
+    switch (ctx->DrawBuffer->_ColorDrawBufferIndexes[0]) 
+    {
+	case BUFFER_FRONT_LEFT:
+	    context->target.rt = context->screen->frontBuffer;
+	    break;
+	case BUFFER_BACK_LEFT:
+	    context->target.rt = context->screen->backBuffer;
+	    break;
+	default:
+	    memset (&context->target.rt, sizeof(context->target.rt), 0);
+	}
+#endif /* to be enabled */
+}
+
+static void r700FetchStateParameter(GLcontext * ctx,
+			                        const gl_state_index state[STATE_LENGTH],
+			                        GLfloat * value)
+{
+	context_t *context = R700_CONTEXT(ctx);
+
+    /* TODO */
+}
+
+void r700UpdateStateParameters(GLcontext * ctx, GLuint new_state) //--------------------
+{
+	struct r700_fragment_program *fp;
+	struct gl_program_parameter_list *paramList;
+	GLuint i;
+
+	if (!(new_state & (_NEW_BUFFERS | _NEW_PROGRAM)))
+		return;
+
+	fp = (struct r700_fragment_program *)ctx->FragmentProgram._Current;
+	if (!fp)
+    {
+		return;
+    }
+
+	paramList = fp->mesa_program.Base.Parameters;
+
+	if (!paramList)
+    {
+		return;
+    }
+
+	for (i = 0; i < paramList->NumParameters; i++) 
+    {
+		if (paramList->Parameters[i].Type == PROGRAM_STATE_VAR) 
+        {
+			r700FetchStateParameter(ctx,
+						paramList->Parameters[i].
+						StateIndexes,
+						paramList->ParameterValues[i]);
+		}
+	}
+}
+
+/**
+ * Called by Mesa after an internal state update.
+ */
+static void r700InvalidateState(GLcontext * ctx, GLuint new_state) //-------------------
+{
+    context_t *context = R700_CONTEXT(ctx);
+
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    _swrast_InvalidateState(ctx, new_state);
+	_swsetup_InvalidateState(ctx, new_state);
+	_vbo_InvalidateState(ctx, new_state);
+	_tnl_InvalidateState(ctx, new_state);
+	_ae_invalidate_state(ctx, new_state);
+
+	if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) 
+    {
+        _mesa_update_framebuffer(ctx);
+		/* this updates the DrawBuffer's Width/Height if it's a FBO */
+		_mesa_update_draw_buffer_bounds(ctx);
+
+		r700UpdateDrawBuffer(ctx);
+	}
+
+	r700UpdateStateParameters(ctx, new_state);
+
+    if(GL_TRUE == r700->bEnablePerspective)
+    {
+        /* Do scale XY and Z by 1/W0 for perspective correction on pos. For orthogonal case, set both to one. */
+        CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_XY_FMT_bit);
+        CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_Z_FMT_bit);
+
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_W0_FMT_bit);
+
+        SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, PERSP_GRADIENT_ENA_bit);
+        CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, LINEAR_GRADIENT_ENA_bit);
+    }
+    else
+    {
+        /* For orthogonal case. */
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_XY_FMT_bit);
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_Z_FMT_bit);
+
+        SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_W0_FMT_bit);
+
+        CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, PERSP_GRADIENT_ENA_bit);
+        SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, LINEAR_GRADIENT_ENA_bit);
+    }
+
+	context->radeon.NewGLState |= new_state;
+}
+
+static void r700SetDepthState(GLcontext * ctx)
+{
+	context_t *context = R700_CONTEXT(ctx);
+
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    if (ctx->Depth.Test)
+    {
+        SETbit(r700->DB_DEPTH_CONTROL.u32All, Z_ENABLE_bit);
+        if (ctx->Depth.Mask)
+        {
+            SETbit(r700->DB_DEPTH_CONTROL.u32All, Z_WRITE_ENABLE_bit);
+        }
+        else
+        {
+            CLEARbit(r700->DB_DEPTH_CONTROL.u32All, Z_WRITE_ENABLE_bit);
+        }
+
+        switch (ctx->Depth.Func)
+        {
+        case GL_NEVER:            
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_NEVER, 
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_LESS:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_LESS, 
+                     ZFUNC_shift, ZFUNC_mask);            
+            break;
+        case GL_EQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_EQUAL, 
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_LEQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_LEQUAL,  
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_GREATER:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_GREATER,  
+                     ZFUNC_shift, ZFUNC_mask);           
+            break;
+        case GL_NOTEQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_NOTEQUAL,  
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_GEQUAL:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_GEQUAL,  
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        case GL_ALWAYS:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_ALWAYS,  
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        default:
+            SETfield(r700->DB_DEPTH_CONTROL.u32All, FRAG_ALWAYS,  
+                     ZFUNC_shift, ZFUNC_mask);
+            break;
+        }
+    }
+    else
+    {
+        CLEARbit(r700->DB_DEPTH_CONTROL.u32All, Z_ENABLE_bit);
+        CLEARbit(r700->DB_DEPTH_CONTROL.u32All, Z_WRITE_ENABLE_bit);
+    }
+}
+
+static void r700AlphaFunc(GLcontext * ctx, GLenum func, GLfloat ref) //---------------
+{
+}
+
+
+static void r700BlendColor(GLcontext * ctx, const GLfloat cf[4]) //----------------
+{
+}
+
+static void r700BlendEquationSeparate(GLcontext * ctx,
+				                      GLenum modeRGB, GLenum modeA) //-----------------
+{
+}
+
+static void r700BlendFuncSeparate(GLcontext * ctx,
+				  GLenum sfactorRGB, GLenum dfactorRGB,
+				  GLenum sfactorA, GLenum dfactorA) //------------------------
+{
+}
+
+static void r700UpdateCulling(GLcontext * ctx)
+{
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&R700_CONTEXT(ctx)->hw);
+
+    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit);
+    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+    CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+
+    if (ctx->Polygon.CullFlag) 
+    {
+        switch (ctx->Polygon.CullFaceMode) 
+        {
+        case GL_FRONT:            
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        case GL_BACK:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        case GL_FRONT_AND_BACK:
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        default:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_FRONT_bit);
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, CULL_BACK_bit);
+            break;
+        }
+    }
+
+    switch (ctx->Polygon.FrontFace) 
+    {
+        case GL_CW:
+            SETbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit);
+            break;
+        case GL_CCW:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit); 
+            break;
+        default:
+            CLEARbit(r700->PA_SU_SC_MODE_CNTL.u32All, FACE_bit); /* default: ccw */
+            break;
+    }
+}
+
+static void r700UpdateLineStipple(GLcontext * ctx)
+{
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&R700_CONTEXT(ctx)->hw);
+    if (ctx->Line.StippleFlag)
+    {
+	SETbit(r700->PA_SC_MODE_CNTL.u32All, LINE_STIPPLE_ENABLE_bit);
+    }
+    else
+    {
+	CLEARbit(r700->PA_SC_MODE_CNTL.u32All, LINE_STIPPLE_ENABLE_bit);
+    }
+}
+
+static void r700Enable(GLcontext * ctx, GLenum cap, GLboolean state) //------------------
+{
+	context_t *context = R700_CONTEXT(ctx);
+
+	switch (cap) {
+	case GL_TEXTURE_1D:
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_3D:
+		/* empty */
+		break;
+	case GL_FOG:
+		/* empty */
+		break;
+	case GL_ALPHA_TEST:
+		//r700SetAlphaState(ctx);
+		break;
+	case GL_COLOR_LOGIC_OP:
+		//r700SetLogicOpState(ctx);
+		/* fall-through, because logic op overrides blending */
+	case GL_BLEND:
+		//r700SetBlendState(ctx);
+		break;
+	case GL_CLIP_PLANE0:
+	case GL_CLIP_PLANE1:
+	case GL_CLIP_PLANE2:
+	case GL_CLIP_PLANE3:
+	case GL_CLIP_PLANE4:
+	case GL_CLIP_PLANE5:
+		//r700SetClipPlaneState(ctx, cap, state);
+		break;
+	case GL_DEPTH_TEST:
+		r700SetDepthState(ctx);
+		break;
+	case GL_STENCIL_TEST:
+		//r700SetStencilState(ctx, state);
+		break;
+	case GL_CULL_FACE:
+		r700UpdateCulling(ctx);
+		break;
+	case GL_POLYGON_OFFSET_POINT:
+	case GL_POLYGON_OFFSET_LINE:
+	case GL_POLYGON_OFFSET_FILL:
+		//r700SetPolygonOffsetState(ctx, state);
+		break;
+	case GL_SCISSOR_TEST:
+		radeon_firevertices(&context->radeon);
+		context->radeon.state.scissor.enabled = state;
+		radeonUpdateScissor(ctx);
+		break;
+	case GL_LINE_STIPPLE:
+		r700UpdateLineStipple(ctx);
+		break;	
+	default:
+		break;
+	}
+
+}
+
+/**
+ * Handle glColorMask()
+ */
+static void r700ColorMask(GLcontext * ctx,
+			  GLboolean r, GLboolean g, GLboolean b, GLboolean a) //------------------
+{
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&R700_CONTEXT(ctx)->hw);
+	unsigned int mask = ((r ? 1 : 0) |
+			     (g ? 2 : 0) |
+			     (b ? 4 : 0) |
+			     (a ? 8 : 0));
+
+	if (mask != r700->CB_SHADER_MASK.u32All)
+		SETfield(r700->CB_SHADER_MASK.u32All, mask, OUTPUT0_ENABLE_shift, OUTPUT0_ENABLE_mask);
+}
+
+/**
+ * Change the depth testing function.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700DepthFunc(GLcontext * ctx, GLenum func) //--------------------
+{
+    r700SetDepthState(ctx);
+}
+
+/**
+ * Enable/Disable depth writing.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700DepthMask(GLcontext * ctx, GLboolean mask) //------------------
+{
+    r700SetDepthState(ctx);
+}
+
+/**
+ * Change the culling mode.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700CullFace(GLcontext * ctx, GLenum mode) //-----------------
+{
+    r700UpdateCulling(ctx);
+}
+
+/* =============================================================
+ * Fog
+ */
+static void r700Fogfv(GLcontext * ctx, GLenum pname, const GLfloat * param) //--------------
+{
+}
+
+/**
+ * Change the polygon orientation.
+ *
+ * \note Mesa already filters redundant calls to this function.
+ */
+static void r700FrontFace(GLcontext * ctx, GLenum mode) //------------------
+{
+    r700UpdateCulling(ctx);
+}
+
+static void r700ShadeModel(GLcontext * ctx, GLenum mode) //--------------------
+{
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+	/* also need to set/clear FLAT_SHADE bit per param in SPI_PS_INPUT_CNTL_[0-31] */
+	switch (mode) {
+	case GL_FLAT:
+		SETbit(r700->SPI_INTERP_CONTROL_0.u32All, FLAT_SHADE_ENA_bit);
+		break;
+	case GL_SMOOTH:
+		CLEARbit(r700->SPI_INTERP_CONTROL_0.u32All, FLAT_SHADE_ENA_bit);
+		break;
+	default:
+		return;
+	}
+}
+
+static void r700PointParameter(GLcontext * ctx, GLenum pname, const GLfloat * param) //---------------
+{
+}
+
+static void r700StencilFuncSeparate(GLcontext * ctx, GLenum face,
+				    GLenum func, GLint ref, GLuint mask) //---------------------
+{
+}
+
+
+static void r700StencilMaskSeparate(GLcontext * ctx, GLenum face, GLuint mask) //--------------
+{
+}
+
+static void r700StencilOpSeparate(GLcontext * ctx, GLenum face,
+				  GLenum fail, GLenum zfail, GLenum zpass) //--------------------
+{
+}
+
+static void r700UpdateWindow(GLcontext * ctx, int id) //--------------------
+{
+
+	context_t *context = R700_CONTEXT(ctx);
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(&context->radeon);
+	GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
+	GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
+	const GLfloat *v = ctx->Viewport._WindowMap.m;
+	const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+	const GLboolean render_to_fbo = (ctx->DrawBuffer->Name != 0);
+	GLfloat y_scale, y_bias;
+
+	if (render_to_fbo) {
+		y_scale = 1.0;
+		y_bias = 0;
+	} else {
+		y_scale = -1.0;
+		y_bias = yoffset;
+	}
+
+	GLfloat sx = v[MAT_SX];
+	GLfloat tx = v[MAT_TX] + xoffset;
+	GLfloat sy = v[MAT_SY] * y_scale;
+	GLfloat ty = (v[MAT_TY] * y_scale) + y_bias;
+	GLfloat sz = v[MAT_SZ] * depthScale;
+	GLfloat tz = v[MAT_TZ] * depthScale;
+
+	/* TODO : Need DMA flush as well. */
+
+	r700->viewport[id].PA_CL_VPORT_XSCALE.f32All  = sx;
+	r700->viewport[id].PA_CL_VPORT_XOFFSET.f32All = tx;
+
+	r700->viewport[id].PA_CL_VPORT_YSCALE.f32All  = sy;
+	r700->viewport[id].PA_CL_VPORT_YOFFSET.f32All = ty;
+
+	r700->viewport[id].PA_CL_VPORT_ZSCALE.f32All  = sz;
+	r700->viewport[id].PA_CL_VPORT_ZOFFSET.f32All = tz;
+
+	r700->viewport[id].enabled = GL_TRUE;
+
+	r700SetScissor(context);
+}
+
+
+static void r700Viewport(GLcontext * ctx,
+                         GLint x,
+                         GLint y,
+			 GLsizei width,
+                         GLsizei height) //--------------------
+{
+	r700UpdateWindow(ctx, 0);
+
+	radeon_viewport(ctx, x, y, width, height);
+}
+
+static void r700DepthRange(GLcontext * ctx, GLclampd nearval, GLclampd farval) //-------------
+{
+	r700UpdateWindow(ctx, 0);
+}
+
+static void r700PointSize(GLcontext * ctx, GLfloat size) //-------------------
+{
+}
+
+static void r700LineWidth(GLcontext * ctx, GLfloat widthf) //---------------
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    uint32_t lineWidth = (uint32_t)((widthf * 0.5) * (1 << 4));
+    if (lineWidth > 0xFFFF)
+	lineWidth = 0xFFFF;
+    SETfield(r700->PA_SU_LINE_CNTL.u32All,(uint16_t)lineWidth,
+	PA_SU_LINE_CNTL__WIDTH_shift, PA_SU_LINE_CNTL__WIDTH_mask);
+}
+
+static void r700LineStipple(GLcontext *ctx, GLint factor, GLushort pattern)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    SETfield(r700->PA_SC_LINE_STIPPLE.u32All, pattern, LINE_PATTERN_shift, LINE_PATTERN_mask);
+    SETfield(r700->PA_SC_LINE_STIPPLE.u32All, (factor-1), REPEAT_COUNT_shift, REPEAT_COUNT_mask);
+    SETfield(r700->PA_SC_LINE_STIPPLE.u32All, 1, AUTO_RESET_CNTL_shift, AUTO_RESET_CNTL_mask);
+}
+
+static void r700PolygonOffset(GLcontext * ctx, GLfloat factor, GLfloat units) //--------------
+{
+}
+
+
+static void r700PolygonMode(GLcontext * ctx, GLenum face, GLenum mode) //------------------
+{
+}
+ 
+static void r700RenderMode(GLcontext * ctx, GLenum mode) //---------------------
+{
+}
+
+static void r700ClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq ) //-----------------
+{
+}
+
+void r700SetScissor(context_t *context) //---------------
+{
+	R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+	unsigned x1, y1, x2, y2;
+	int id = 0;
+	struct radeon_renderbuffer *rrb;
+
+	rrb = radeon_get_colorbuffer(&context->radeon);
+	if (!rrb || !rrb->bo) {
+		return;
+	}
+	if (context->radeon.state.scissor.enabled) {
+		x1 = context->radeon.state.scissor.rect.x1;
+		y1 = context->radeon.state.scissor.rect.y1;
+		x2 = context->radeon.state.scissor.rect.x2 - 1;
+		y2 = context->radeon.state.scissor.rect.y2 - 1;
+	} else {
+		x1 = rrb->dPriv->x;
+		y1 = rrb->dPriv->y;
+		x2 = rrb->dPriv->x + rrb->dPriv->w;
+		y2 = rrb->dPriv->y + rrb->dPriv->h;
+	}
+
+	/* window */
+	SETbit(r700->PA_SC_WINDOW_SCISSOR_TL.u32All, WINDOW_OFFSET_DISABLE_bit);
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_TL.u32All, x1,
+		 PA_SC_WINDOW_SCISSOR_TL__TL_X_shift, PA_SC_WINDOW_SCISSOR_TL__TL_X_mask);
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_TL.u32All, y1,
+		 PA_SC_WINDOW_SCISSOR_TL__TL_Y_shift, PA_SC_WINDOW_SCISSOR_TL__TL_Y_mask);
+
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_BR.u32All, x2,
+		 PA_SC_WINDOW_SCISSOR_BR__BR_X_shift, PA_SC_WINDOW_SCISSOR_BR__BR_X_mask);
+	SETfield(r700->PA_SC_WINDOW_SCISSOR_BR.u32All, y2,
+		 PA_SC_WINDOW_SCISSOR_BR__BR_Y_shift, PA_SC_WINDOW_SCISSOR_BR__BR_Y_mask);
+
+
+	SETfield(r700->PA_SC_CLIPRECT_0_TL.u32All, x1,
+		 PA_SC_CLIPRECT_0_TL__TL_X_shift, PA_SC_CLIPRECT_0_TL__TL_X_mask);
+	SETfield(r700->PA_SC_CLIPRECT_0_TL.u32All, y1,
+		 PA_SC_CLIPRECT_0_TL__TL_Y_shift, PA_SC_CLIPRECT_0_TL__TL_Y_mask);
+	SETfield(r700->PA_SC_CLIPRECT_0_BR.u32All, x2,
+		 PA_SC_CLIPRECT_0_BR__BR_X_shift, PA_SC_CLIPRECT_0_BR__BR_X_mask);
+	SETfield(r700->PA_SC_CLIPRECT_0_BR.u32All, y2,
+		 PA_SC_CLIPRECT_0_BR__BR_Y_shift, PA_SC_CLIPRECT_0_BR__BR_Y_mask);
+
+	r700->PA_SC_CLIPRECT_1_TL.u32All = r700->PA_SC_CLIPRECT_0_TL.u32All;
+	r700->PA_SC_CLIPRECT_1_BR.u32All = r700->PA_SC_CLIPRECT_0_BR.u32All;
+	r700->PA_SC_CLIPRECT_2_TL.u32All = r700->PA_SC_CLIPRECT_0_TL.u32All;
+	r700->PA_SC_CLIPRECT_2_BR.u32All = r700->PA_SC_CLIPRECT_0_BR.u32All;
+	r700->PA_SC_CLIPRECT_3_TL.u32All = r700->PA_SC_CLIPRECT_0_TL.u32All;
+	r700->PA_SC_CLIPRECT_3_BR.u32All = r700->PA_SC_CLIPRECT_0_BR.u32All;
+
+	/* more....2d clip */
+	SETbit(r700->PA_SC_GENERIC_SCISSOR_TL.u32All, WINDOW_OFFSET_DISABLE_bit);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_TL.u32All, x1,
+		 PA_SC_GENERIC_SCISSOR_TL__TL_X_shift, PA_SC_GENERIC_SCISSOR_TL__TL_X_mask);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_TL.u32All, y1,
+		 PA_SC_GENERIC_SCISSOR_TL__TL_Y_shift, PA_SC_GENERIC_SCISSOR_TL__TL_Y_mask);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_BR.u32All, x2,
+		 PA_SC_GENERIC_SCISSOR_BR__BR_X_shift, PA_SC_GENERIC_SCISSOR_BR__BR_X_mask);
+	SETfield(r700->PA_SC_GENERIC_SCISSOR_BR.u32All, y2,
+		 PA_SC_GENERIC_SCISSOR_BR__BR_Y_shift, PA_SC_GENERIC_SCISSOR_BR__BR_Y_mask);
+
+	SETbit(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All, WINDOW_OFFSET_DISABLE_bit);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All, x1,
+		 PA_SC_VPORT_SCISSOR_0_TL__TL_X_shift, PA_SC_VPORT_SCISSOR_0_TL__TL_X_mask);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_TL.u32All, y1,
+		 PA_SC_VPORT_SCISSOR_0_TL__TL_Y_shift, PA_SC_VPORT_SCISSOR_0_TL__TL_Y_mask);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_BR.u32All, x2,
+		 PA_SC_VPORT_SCISSOR_0_BR__BR_X_shift, PA_SC_VPORT_SCISSOR_0_BR__BR_X_mask);
+	SETfield(r700->viewport[id].PA_SC_VPORT_SCISSOR_0_BR.u32All, y2,
+		 PA_SC_VPORT_SCISSOR_0_BR__BR_Y_shift, PA_SC_VPORT_SCISSOR_0_BR__BR_Y_mask);
+
+	r700->viewport[id].PA_SC_VPORT_ZMIN_0.u32All = 0;
+	r700->viewport[id].PA_SC_VPORT_ZMAX_0.u32All = 0x3F800000;
+	r700->viewport[id].enabled = GL_TRUE;
+}
+
+void r700SetRenderTarget(context_t *context, int id)
+{
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    struct radeon_renderbuffer *rrb;
+    unsigned int nPitchInPixel;
+
+    /* screen/window/view */
+    SETfield(r700->CB_TARGET_MASK.u32All, 0xF, (4 * id), TARGET0_ENABLE_mask);
+
+    rrb = radeon_get_colorbuffer(&context->radeon);
+	if (!rrb || !rrb->bo) {
+		fprintf(stderr, "no rrb\n");
+		return;
+	}
+
+    /* color buffer */
+    r700->render_target[id].CB_COLOR0_BASE.u32All = context->radeon.state.color.draw_offset;
+
+    nPitchInPixel = rrb->pitch/rrb->cpp;
+    SETfield(r700->render_target[id].CB_COLOR0_SIZE.u32All, (nPitchInPixel/8)-1,
+             PITCH_TILE_MAX_shift, PITCH_TILE_MAX_mask);
+    SETfield(r700->render_target[id].CB_COLOR0_SIZE.u32All, ( (nPitchInPixel * context->radeon.radeonScreen->driScreen->fbHeight)/64 )-1,
+             SLICE_TILE_MAX_shift, SLICE_TILE_MAX_mask);
+    r700->render_target[id].CB_COLOR0_BASE.u32All = 0;
+    SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, ENDIAN_NONE, ENDIAN_shift, ENDIAN_mask);
+    SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, ARRAY_LINEAR_GENERAL,
+             CB_COLOR0_INFO__ARRAY_MODE_shift, CB_COLOR0_INFO__ARRAY_MODE_mask);
+    if(4 == rrb->cpp)
+    {
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, COLOR_8_8_8_8,
+                 CB_COLOR0_INFO__FORMAT_shift, CB_COLOR0_INFO__FORMAT_mask);
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, SWAP_ALT, COMP_SWAP_shift, COMP_SWAP_mask);
+    }
+    else
+    {
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, COLOR_5_6_5,
+                 CB_COLOR0_INFO__FORMAT_shift, CB_COLOR0_INFO__FORMAT_mask);
+        SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, SWAP_ALT_REV,
+                 COMP_SWAP_shift, COMP_SWAP_mask);
+    }
+    SETbit(r700->render_target[id].CB_COLOR0_INFO.u32All, SOURCE_FORMAT_bit);
+    SETbit(r700->render_target[id].CB_COLOR0_INFO.u32All, BLEND_CLAMP_bit);
+    SETfield(r700->render_target[id].CB_COLOR0_INFO.u32All, NUMBER_UNORM, NUMBER_TYPE_shift, NUMBER_TYPE_mask);
+
+    CLEARfield(r700->render_target[id].CB_BLEND0_CONTROL.u32All, COLOR_SRCBLEND_mask); /* no dst blend */
+    CLEARfield(r700->render_target[id].CB_BLEND0_CONTROL.u32All, ALPHA_SRCBLEND_mask); /* no dst blend */
+
+    r700->render_target[id].enabled = GL_TRUE;
+}
+
+void r700SetDepthTarget(context_t *context)
+{
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    struct radeon_renderbuffer *rrb;
+    unsigned int nPitchInPixel;
+
+    /* depth buf */
+    r700->DB_DEPTH_SIZE.u32All = 0;
+    r700->DB_DEPTH_BASE.u32All = 0;
+    r700->DB_DEPTH_INFO.u32All = 0;
+
+    r700->DB_DEPTH_CLEAR.u32All     = 0x3F800000;
+    r700->DB_DEPTH_VIEW.u32All      = 0;
+    r700->DB_RENDER_CONTROL.u32All  = 0;
+    SETbit(r700->DB_RENDER_CONTROL.u32All, STENCIL_COMPRESS_DISABLE_bit);
+    SETbit(r700->DB_RENDER_CONTROL.u32All, DEPTH_COMPRESS_DISABLE_bit);
+    r700->DB_RENDER_OVERRIDE.u32All = 0;
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+	    SETbit(r700->DB_RENDER_OVERRIDE.u32All, FORCE_SHADER_Z_ORDER_bit);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIZ_ENABLE_shift, FORCE_HIZ_ENABLE_mask);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE0_shift, FORCE_HIS_ENABLE0_mask);
+    SETfield(r700->DB_RENDER_OVERRIDE.u32All, FORCE_DISABLE, FORCE_HIS_ENABLE1_shift, FORCE_HIS_ENABLE1_mask);
+
+    r700->DB_ALPHA_TO_MASK.u32All = 0;
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET0_shift, ALPHA_TO_MASK_OFFSET0_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET1_shift, ALPHA_TO_MASK_OFFSET1_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET2_shift, ALPHA_TO_MASK_OFFSET2_mask);
+    SETfield(r700->DB_ALPHA_TO_MASK.u32All, 2, ALPHA_TO_MASK_OFFSET3_shift, ALPHA_TO_MASK_OFFSET3_mask);
+
+    rrb = radeon_get_depthbuffer(&context->radeon);
+	if (!rrb)
+		return;
+
+    nPitchInPixel = rrb->pitch/rrb->cpp;
+
+    SETfield(r700->DB_DEPTH_SIZE.u32All, (nPitchInPixel/8)-1,
+             PITCH_TILE_MAX_shift, PITCH_TILE_MAX_mask);
+    SETfield(r700->DB_DEPTH_SIZE.u32All, ( (nPitchInPixel * context->radeon.radeonScreen->driScreen->fbHeight)/64 )-1,
+             SLICE_TILE_MAX_shift, SLICE_TILE_MAX_mask); /* size in pixel / 64 - 1 */
+
+    if(4 == rrb->cpp)
+    {
+        switch (GL_CONTEXT(context)->Visual.depthBits)
+        {
+        case 16:
+        case 24:
+            SETfield(r700->DB_DEPTH_INFO.u32All, DEPTH_8_24,
+                     DB_DEPTH_INFO__FORMAT_shift, DB_DEPTH_INFO__FORMAT_mask);
+            break;
+        default:
+            fprintf(stderr, "Error: Unsupported depth %d... exiting\n",
+                GL_CONTEXT(context)->Visual.depthBits);
+            _mesa_exit(-1);
+        }
+    }
+    else
+    {
+        SETfield(r700->DB_DEPTH_INFO.u32All, DEPTH_16,
+                     DB_DEPTH_INFO__FORMAT_shift, DB_DEPTH_INFO__FORMAT_mask);
+    }
+    SETfield(r700->DB_DEPTH_INFO.u32All, ARRAY_2D_TILED_THIN1,
+             DB_DEPTH_INFO__ARRAY_MODE_shift, DB_DEPTH_INFO__ARRAY_MODE_mask);
+    /* r700->DB_PREFETCH_LIMIT.bits.DEPTH_HEIGHT_TILE_MAX = (context->currentDraw->h >> 3) - 1; */ /* z buffer sie may much bigger than what need, so use actual used h. */
+}
+
+static void r700InitSQConfig(GLcontext * ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+    int ps_prio;
+    int vs_prio;
+    int gs_prio;
+    int es_prio;
+    int num_ps_gprs;
+    int num_vs_gprs;
+    int num_gs_gprs;
+    int num_es_gprs;
+    int num_temp_gprs;
+    int num_ps_threads;
+    int num_vs_threads;
+    int num_gs_threads;
+    int num_es_threads;
+    int num_ps_stack_entries;
+    int num_vs_stack_entries;
+    int num_gs_stack_entries;
+    int num_es_stack_entries;
+
+    // SQ
+    ps_prio = 0;
+    vs_prio = 1;
+    gs_prio = 2;
+    es_prio = 3;
+    switch (context->radeon.radeonScreen->chip_family) {
+    case CHIP_FAMILY_R600:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV630:
+    case CHIP_FAMILY_RV635:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 144;
+	    num_vs_threads = 40;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV610:
+    case CHIP_FAMILY_RV620:
+    case CHIP_FAMILY_RS780:
+    default:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV670:
+	    num_ps_gprs = 144;
+	    num_vs_gprs = 40;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 136;
+	    num_vs_threads = 48;
+	    num_gs_threads = 4;
+	    num_es_threads = 4;
+	    num_ps_stack_entries = 40;
+	    num_vs_stack_entries = 40;
+	    num_gs_stack_entries = 32;
+	    num_es_stack_entries = 16;
+	    break;
+    case CHIP_FAMILY_RV770:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 188;
+	    num_vs_threads = 60;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 256;
+	    num_vs_stack_entries = 256;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV730:
+    case CHIP_FAMILY_RV740:
+	    num_ps_gprs = 84;
+	    num_vs_gprs = 36;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 188;
+	    num_vs_threads = 60;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    case CHIP_FAMILY_RV710:
+	    num_ps_gprs = 192;
+	    num_vs_gprs = 56;
+	    num_temp_gprs = 4;
+	    num_gs_gprs = 0;
+	    num_es_gprs = 0;
+	    num_ps_threads = 144;
+	    num_vs_threads = 48;
+	    num_gs_threads = 0;
+	    num_es_threads = 0;
+	    num_ps_stack_entries = 128;
+	    num_vs_stack_entries = 128;
+	    num_gs_stack_entries = 0;
+	    num_es_stack_entries = 0;
+	    break;
+    }
+
+    r700->sq_config.SQ_CONFIG.u32All = 0;
+    if ((context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV610) ||
+        (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV620) ||
+	(context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RS780) ||
+        (context->radeon.radeonScreen->chip_family == CHIP_FAMILY_RV710))
+	    CLEARbit(r700->sq_config.SQ_CONFIG.u32All, VC_ENABLE_bit);
+    else
+	    SETbit(r700->sq_config.SQ_CONFIG.u32All, VC_ENABLE_bit);
+    SETbit(r700->sq_config.SQ_CONFIG.u32All, DX9_CONSTS_bit);
+    SETbit(r700->sq_config.SQ_CONFIG.u32All, ALU_INST_PREFER_VECTOR_bit);
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, ps_prio, PS_PRIO_shift, PS_PRIO_mask);
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, ps_prio, VS_PRIO_shift, VS_PRIO_mask);
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, ps_prio, GS_PRIO_shift, GS_PRIO_mask);
+    SETfield(r700->sq_config.SQ_CONFIG.u32All, ps_prio, ES_PRIO_shift, ES_PRIO_mask);
+
+    r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All = 0;
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All, num_ps_gprs, NUM_PS_GPRS_shift, NUM_PS_GPRS_mask);
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All, num_vs_gprs, NUM_VS_GPRS_shift, NUM_VS_GPRS_mask);
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_1.u32All, num_temp_gprs,
+	     NUM_CLAUSE_TEMP_GPRS_shift, NUM_CLAUSE_TEMP_GPRS_mask);
+
+    r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All = 0;
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All, num_gs_gprs, NUM_GS_GPRS_shift, NUM_GS_GPRS_mask);
+    SETfield(r700->sq_config.SQ_GPR_RESOURCE_MGMT_2.u32All, num_es_gprs, NUM_ES_GPRS_shift, NUM_ES_GPRS_mask);
+
+    r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All = 0;
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_ps_threads,
+	     NUM_PS_THREADS_shift, NUM_PS_THREADS_mask);
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_vs_threads,
+	     NUM_VS_THREADS_shift, NUM_VS_THREADS_mask);
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_gs_threads,
+	     NUM_GS_THREADS_shift, NUM_GS_THREADS_mask);
+    SETfield(r700->sq_config.SQ_THREAD_RESOURCE_MGMT.u32All, num_es_threads,
+	     NUM_ES_THREADS_shift, NUM_ES_THREADS_mask);
+
+    r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All = 0;
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All, num_ps_stack_entries,
+	     NUM_PS_STACK_ENTRIES_shift, NUM_PS_STACK_ENTRIES_mask);
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_1.u32All, num_vs_stack_entries,
+	     NUM_VS_STACK_ENTRIES_shift, NUM_VS_STACK_ENTRIES_mask);
+
+    r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All = 0;
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All, num_gs_stack_entries,
+	     NUM_GS_STACK_ENTRIES_shift, NUM_GS_STACK_ENTRIES_mask);
+    SETfield(r700->sq_config.SQ_STACK_RESOURCE_MGMT_2.u32All, num_es_stack_entries,
+	     NUM_ES_STACK_ENTRIES_shift, NUM_ES_STACK_ENTRIES_mask);
+
+}
+
+/**
+ * Calculate initial hardware state and register state functions.
+ * Assumes that the command buffer and state atoms have been
+ * initialized already.
+ */
+void r700InitState(GLcontext * ctx) //-------------------
+{
+    context_t *context = R700_CONTEXT(ctx);
+
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    r700->TA_CNTL_AUX.u32All = 0;
+    SETfield(r700->TA_CNTL_AUX.u32All, 28, TD_FIFO_CREDIT_shift, TD_FIFO_CREDIT_mask);
+    r700->VC_ENHANCE.u32All = 0;
+    r700->DB_WATERMARKS.u32All = 0;
+    SETfield(r700->DB_WATERMARKS.u32All, 4, DEPTH_FREE_shift, DEPTH_FREE_mask);
+    SETfield(r700->DB_WATERMARKS.u32All, 16, DEPTH_FLUSH_shift, DEPTH_FLUSH_mask);
+    SETfield(r700->DB_WATERMARKS.u32All, 0, FORCE_SUMMARIZE_shift, FORCE_SUMMARIZE_mask);
+    SETfield(r700->DB_WATERMARKS.u32All, 4, DEPTH_PENDING_FREE_shift, DEPTH_PENDING_FREE_mask);
+    r700->SQ_DYN_GPR_CNTL_PS_FLUSH_REQ.u32All = 0;
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) {
+	    SETfield(r700->TA_CNTL_AUX.u32All, 3, GRADIENT_CREDIT_shift, GRADIENT_CREDIT_mask);
+	    r700->DB_DEBUG.u32All = 0x82000000;
+	    SETfield(r700->DB_WATERMARKS.u32All, 16, DEPTH_CACHELINE_FREE_shift, DEPTH_CACHELINE_FREE_mask);
+    } else {
+	    SETfield(r700->TA_CNTL_AUX.u32All, 2, GRADIENT_CREDIT_shift, GRADIENT_CREDIT_mask);
+	    SETfield(r700->DB_WATERMARKS.u32All, 4, DEPTH_CACHELINE_FREE_shift, DEPTH_CACHELINE_FREE_mask);
+	    SETbit(r700->SQ_DYN_GPR_CNTL_PS_FLUSH_REQ.u32All, VS_PC_LIMIT_ENABLE_bit);
+    }
+
+    /* Turn off vgt reuse */
+    r700->VGT_REUSE_OFF.u32All = 0;
+    SETbit(r700->VGT_REUSE_OFF.u32All, REUSE_OFF_bit);
+
+    /* Specify offsetting and clamp values for vertices */
+    r700->VGT_MAX_VTX_INDX.u32All      = 0xFFFFFF;
+    r700->VGT_MIN_VTX_INDX.u32All      = 0;
+    r700->VGT_INDX_OFFSET.u32All    = 0;
+
+    /* Specify the number of instances */
+    r700->VGT_DMA_NUM_INSTANCES.u32All = 1;
+
+    /* not alpha blend */
+    CLEARfield(r700->SX_ALPHA_TEST_CONTROL.u32All, ALPHA_FUNC_mask);
+    CLEARbit(r700->SX_ALPHA_TEST_CONTROL.u32All, ALPHA_TEST_ENABLE_bit);
+
+    /* default shader connections. */
+    r700->SPI_VS_OUT_ID_0.u32All  = 0x03020100;
+    r700->SPI_VS_OUT_ID_1.u32All  = 0x07060504;
+
+    r700->SPI_PS_INPUT_CNTL_0.u32All  = 0x00000800;
+    r700->SPI_PS_INPUT_CNTL_1.u32All  = 0x00000801;
+    r700->SPI_PS_INPUT_CNTL_2.u32All  = 0x00000802;
+
+    r700->SPI_THREAD_GROUPING.u32All = 0;
+    if (context->radeon.radeonScreen->chip_family >= CHIP_FAMILY_RV770)
+	    SETfield(r700->SPI_THREAD_GROUPING.u32All, 1, PS_GROUPING_shift, PS_GROUPING_mask);
+
+    SETfield(r700->CB_COLOR_CONTROL.u32All, 0xCC, ROP3_shift, ROP3_mask);
+    CLEARbit(r700->CB_COLOR_CONTROL.u32All, PER_MRT_BLEND_bit);
+
+    r700->DB_SHADER_CONTROL.u32All = 0;
+    SETbit(r700->DB_SHADER_CONTROL.u32All, DUAL_EXPORT_ENABLE_bit);
+
+    /* Set up the culling control register */
+    SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_TRIANGLES,
+             POLYMODE_FRONT_PTYPE_shift, POLYMODE_FRONT_PTYPE_mask);
+    SETfield(r700->PA_SU_SC_MODE_CNTL.u32All, X_DRAW_TRIANGLES,
+             POLYMODE_BACK_PTYPE_shift, POLYMODE_BACK_PTYPE_mask);
+
+    /* screen */
+    r700->PA_SC_SCREEN_SCISSOR_TL.u32All = 0x0;
+
+    SETfield(r700->PA_SC_SCREEN_SCISSOR_BR.u32All,
+	     ((RADEONDRIPtr)(context->radeon.radeonScreen->driScreen->pDevPriv))->width,
+	     PA_SC_SCREEN_SCISSOR_BR__BR_X_shift, PA_SC_SCREEN_SCISSOR_BR__BR_X_mask);
+    SETfield(r700->PA_SC_SCREEN_SCISSOR_BR.u32All,
+	     ((RADEONDRIPtr)(context->radeon.radeonScreen->driScreen->pDevPriv))->height,
+	     PA_SC_SCREEN_SCISSOR_BR__BR_Y_shift, PA_SC_SCREEN_SCISSOR_BR__BR_Y_mask);
+
+    /* 4 clip rectangles */ /* TODO : set these clip rects according to context->currentDraw->numClipRects */
+    r700->PA_SC_CLIPRECT_RULE.u32All = 0;
+    SETfield(r700->PA_SC_CLIPRECT_RULE.u32All, CLIP_RULE_mask, CLIP_RULE_shift, CLIP_RULE_mask);
+
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+	    r700->PA_SC_EDGERULE.u32All = 0;
+    else
+	    r700->PA_SC_EDGERULE.u32All = 0xAAAAAAAA;
+
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770) {
+	    r700->PA_SC_MODE_CNTL.u32All = 0;
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, WALK_ORDER_ENABLE_bit);
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, FORCE_EOV_CNTDWN_ENABLE_bit);
+    } else {
+	    r700->PA_SC_MODE_CNTL.u32All = 0x00500000;
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, FORCE_EOV_REZ_ENABLE_bit);
+	    SETbit(r700->PA_SC_MODE_CNTL.u32All, FORCE_EOV_CNTDWN_ENABLE_bit);
+    }
+
+    /* Do scale XY and Z by 1/W0. */
+    r700->bEnablePerspective = GL_TRUE;
+    CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_XY_FMT_bit);
+    CLEARbit(r700->PA_CL_VTE_CNTL.u32All, VTX_Z_FMT_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VTX_W0_FMT_bit);
+
+    /* Enable viewport scaling for all three axis */
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_X_SCALE_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_X_OFFSET_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Y_SCALE_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Y_OFFSET_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Z_SCALE_ENA_bit);
+    SETbit(r700->PA_CL_VTE_CNTL.u32All, VPORT_Z_OFFSET_ENA_bit);
+
+    /* Set up point sizes and min/max values */
+    SETfield(r700->PA_SU_POINT_SIZE.u32All, 0x8,
+             PA_SU_POINT_SIZE__HEIGHT_shift, PA_SU_POINT_SIZE__HEIGHT_mask);
+    SETfield(r700->PA_SU_POINT_SIZE.u32All, 0x8,
+             PA_SU_POINT_SIZE__WIDTH_shift, PA_SU_POINT_SIZE__WIDTH_mask);
+    CLEARfield(r700->PA_SU_POINT_MINMAX.u32All, MIN_SIZE_mask);
+    SETfield(r700->PA_SU_POINT_MINMAX.u32All, 0x8000, MAX_SIZE_shift, MAX_SIZE_mask);
+
+    /* Set up line control */
+    SETfield(r700->PA_SU_LINE_CNTL.u32All, 0x8,
+             PA_SU_LINE_CNTL__WIDTH_shift, PA_SU_LINE_CNTL__WIDTH_mask);
+
+    r700->PA_SC_LINE_CNTL.u32All = 0;
+    CLEARbit(r700->PA_SC_LINE_CNTL.u32All, EXPAND_LINE_WIDTH_bit);
+    SETbit(r700->PA_SC_LINE_CNTL.u32All, LAST_PIXEL_bit);
+
+    /* Set up vertex control */
+    r700->PA_SU_VTX_CNTL.u32All = 0;
+    CLEARfield(r700->PA_SU_VTX_CNTL.u32All, QUANT_MODE_mask);
+    SETbit(r700->PA_SU_VTX_CNTL.u32All, PIX_CENTER_bit);
+    SETfield(r700->PA_SU_VTX_CNTL.u32All, X_ROUND_TO_EVEN,
+             PA_SU_VTX_CNTL__ROUND_MODE_shift, PA_SU_VTX_CNTL__ROUND_MODE_mask);
+
+    /* to 1.0 = no guard band */
+    r700->PA_CL_GB_VERT_CLIP_ADJ.u32All  = 0x3F800000;  /* 1.0 */
+    r700->PA_CL_GB_VERT_DISC_ADJ.u32All  = 0x3F800000;
+    r700->PA_CL_GB_HORZ_CLIP_ADJ.u32All  = 0x3F800000;
+    r700->PA_CL_GB_HORZ_DISC_ADJ.u32All  = 0x3F800000;
+
+    /* CB */
+    r700->CB_CLEAR_RED_R6XX.f32All = 1.0; //r6xx only
+    r700->CB_CLEAR_GREEN_R6XX.f32All = 0.0; //r6xx only
+    r700->CB_CLEAR_BLUE_R6XX.f32All = 1.0; //r6xx only
+    r700->CB_CLEAR_ALPHA_R6XX.f32All = 1.0; //r6xx only
+    r700->CB_FOG_RED_R6XX.u32All = 0; //r6xx only
+    r700->CB_FOG_GREEN_R6XX.u32All = 0; //r6xx only
+    r700->CB_FOG_BLUE_R6XX.u32All = 0; //r6xx only
+
+    r700->CB_BLEND_RED.u32All = 0;
+    r700->CB_BLEND_GREEN.u32All = 0;
+    r700->CB_BLEND_BLUE.u32All = 0;
+    r700->CB_BLEND_ALPHA.u32All = 0;
+
+    r700->CB_BLEND_CONTROL.u32All = 0;
+
+    /* Disable color compares */
+    SETfield(r700->CB_CLRCMP_CONTROL.u32All, CLRCMP_DRAW_ALWAYS,
+             CLRCMP_FCN_SRC_shift, CLRCMP_FCN_SRC_mask);
+    SETfield(r700->CB_CLRCMP_CONTROL.u32All, CLRCMP_DRAW_ALWAYS,
+             CLRCMP_FCN_DST_shift, CLRCMP_FCN_DST_mask);
+    SETfield(r700->CB_CLRCMP_CONTROL.u32All, CLRCMP_SEL_SRC,
+             CLRCMP_FCN_SEL_shift, CLRCMP_FCN_SEL_mask);
+
+    /* Zero out source */
+    r700->CB_CLRCMP_SRC.u32All = 0x00000000;
+
+    /* Put a compare color in for error checking */
+    r700->CB_CLRCMP_DST.u32All = 0x000000FF;
+
+    /* Set up color compare mask */
+    r700->CB_CLRCMP_MSK.u32All = 0xFFFFFFFF;
+
+    /* default color mask */
+    SETfield(r700->CB_SHADER_MASK.u32All, 0xF, OUTPUT0_ENABLE_shift, OUTPUT0_ENABLE_mask);
+
+    /* Enable all samples for multi-sample anti-aliasing */
+    r700->PA_SC_AA_MASK.u32All = 0xFFFFFFFF;
+    /* Turn off AA */
+    r700->PA_SC_AA_CONFIG.u32All = 0;
+
+    r700->SX_MISC.u32All = 0;
+
+    r700InitSQConfig(ctx);
+}
+
+void r700InitStateFuncs(struct dd_function_table *functions) //-----------------
+{
+	functions->UpdateState = r700InvalidateState;
+	functions->AlphaFunc = r700AlphaFunc;
+	functions->BlendColor = r700BlendColor;
+	functions->BlendEquationSeparate = r700BlendEquationSeparate;
+	functions->BlendFuncSeparate = r700BlendFuncSeparate;
+	functions->Enable = r700Enable;
+	functions->ColorMask = r700ColorMask;
+	functions->DepthFunc = r700DepthFunc;
+	functions->DepthMask = r700DepthMask;
+	functions->CullFace = r700CullFace;
+	functions->Fogfv = r700Fogfv;
+	functions->FrontFace = r700FrontFace;
+	functions->ShadeModel = r700ShadeModel;
+
+	/* ARB_point_parameters */
+	functions->PointParameterfv = r700PointParameter;
+
+	/* Stencil related */
+	functions->StencilFuncSeparate = r700StencilFuncSeparate;
+	functions->StencilMaskSeparate = r700StencilMaskSeparate;
+	functions->StencilOpSeparate = r700StencilOpSeparate;
+
+	/* Viewport related */
+	functions->Viewport = r700Viewport;
+	functions->DepthRange = r700DepthRange;
+	functions->PointSize = r700PointSize;
+	functions->LineWidth = r700LineWidth;
+	functions->LineStipple = r700LineStipple;
+
+	functions->PolygonOffset = r700PolygonOffset;
+	functions->PolygonMode = r700PolygonMode;
+
+	functions->RenderMode = r700RenderMode;
+
+	functions->ClipPlane = r700ClipPlane;
+
+	functions->Scissor = radeonScissor;
+
+	functions->DrawBuffer		= radeonDrawBuffer;
+	functions->ReadBuffer		= radeonReadBuffer;
+
+}
+
diff --git a/src/mesa/drivers/dri/r600/r700_state.h b/src/mesa/drivers/dri/r600/r700_state.h
new file mode 100644
index 00000000000..23246367db8
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_state.h
@@ -0,0 +1,51 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#ifndef _R700_STATE_H
+#define _R700_STATE_H
+
+#include "main/mtypes.h"
+
+#include "r600_context.h"
+
+#include "r700_chip.h"
+
+extern void r700UpdateStateParameters(GLcontext * ctx, GLuint new_state);
+extern void r700UpdateShaders (GLcontext * ctx);
+
+extern void r700UpdateViewportOffset(GLcontext * ctx);
+extern void r700UpdateDrawBuffer (GLcontext * ctx);
+
+extern void r700InitState (GLcontext * ctx);
+extern void r700InitStateFuncs (struct dd_function_table *functions);
+
+extern void r700SetRenderTarget(context_t *context, int id);
+extern void r700SetDefaultStates(context_t * context);
+
+void r700SetScissor(context_t *context);
+void r700SetDepthTarget(context_t *context);
+
+#endif	/* _R600_SCREEN_H */
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.c b/src/mesa/drivers/dri/r600/r700_vertprog.c
new file mode 100644
index 00000000000..cbfeaf071b4
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.c
@@ -0,0 +1,430 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+
+#include <stdio.h>
+#include <stdarg.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+
+#include "main/imports.h"
+#include "main/mtypes.h"
+
+#include "tnl/t_context.h"
+#include "shader/prog_parameter.h"
+#include "shader/prog_statevars.h"
+
+#include "r600_context.h"
+#include "r600_cmdbuf.h"
+
+#include "r700_debug.h"
+#include "r700_vertprog.h"
+
+unsigned int Map_Vertex_Output(r700_AssemblerBase       *pAsm, 
+					           struct gl_vertex_program *mesa_vp,
+					           unsigned int unStart)
+{
+    unsigned int i;
+	unsigned int unBit;
+	unsigned int unTotal = unStart;
+
+    //!!!!!!! THE ORDER MATCH FS INPUT 
+
+	unBit = 1 << VERT_RESULT_HPOS;
+	if(mesa_vp->Base.OutputsWritten & unBit)
+	{
+		pAsm->ucVP_OutputMap[VERT_RESULT_HPOS] = unTotal++;
+	}
+
+	unBit = 1 << VERT_RESULT_COL0;
+	if(mesa_vp->Base.OutputsWritten & unBit)
+	{
+		pAsm->ucVP_OutputMap[VERT_RESULT_COL0] = unTotal++;
+	}
+
+	unBit = 1 << VERT_RESULT_COL1;
+	if(mesa_vp->Base.OutputsWritten & unBit)
+	{
+		pAsm->ucVP_OutputMap[VERT_RESULT_COL1] = unTotal++;
+	}
+
+	//TODO : dealing back face.
+	//unBit = 1 << VERT_RESULT_BFC0;
+	//if(mesa_vp->Base.OutputsWritten & unBit)
+	//{
+	//	pAsm->ucVP_OutputMap[VERT_RESULT_COL0] = unTotal++;
+	//}
+
+	//unBit = 1 << VERT_RESULT_BFC1;
+	//if(mesa_vp->Base.OutputsWritten & unBit)
+	//{
+	//	pAsm->ucVP_OutputMap[VERT_RESULT_COL1] = unTotal++;
+	//}
+
+	//TODO : dealing fog.
+	//unBit = 1 << VERT_RESULT_FOGC;
+	//if(mesa_vp->Base.OutputsWritten & unBit)
+	//{
+	//	pAsm->ucVP_OutputMap[VERT_RESULT_FOGC] = unTotal++;
+	//}
+
+	//TODO : dealing point size.
+	//unBit = 1 << VERT_RESULT_PSIZ;
+	//if(mesa_vp->Base.OutputsWritten & unBit)
+	//{
+	//	pAsm->ucVP_OutputMap[VERT_RESULT_PSIZ] = unTotal++;
+	//}
+
+	for(i=0; i<8; i++)
+	{
+		unBit = 1 << (VERT_RESULT_TEX0 + i);
+		if(mesa_vp->Base.OutputsWritten & unBit)
+		{
+			pAsm->ucVP_OutputMap[VERT_RESULT_TEX0 + i] = unTotal++;
+		}
+	}
+
+	return (unTotal - unStart);
+}
+
+unsigned int Map_Vertex_Input(r700_AssemblerBase       *pAsm, 
+					  struct gl_vertex_program *mesa_vp,
+					  unsigned int unStart)
+{
+	int i;
+	unsigned int unBit;
+	unsigned int unTotal = unStart;
+	for(i=0; i<VERT_ATTRIB_MAX; i++)
+	{
+		unBit = 1 << i;
+		if(mesa_vp->Base.InputsRead & unBit)
+		{
+			pAsm->ucVP_AttributeMap[i] = unTotal++;
+		}
+	}
+	return (unTotal - unStart);
+}
+
+GLboolean Process_Vertex_Program_Vfetch_Instructions(
+						struct r700_vertex_program *vp,
+						struct gl_vertex_program   *mesa_vp)
+{
+	int i;
+    unsigned int unBit;
+	VTX_FETCH_METHOD vtxFetchMethod;
+	vtxFetchMethod.bEnableMini          = GL_FALSE;
+	vtxFetchMethod.mega_fetch_remainder = 0;
+
+	for(i=0; i<VERT_ATTRIB_MAX; i++)
+	{
+		unBit = 1 << i;
+		if(mesa_vp->Base.InputsRead & unBit)
+		{
+			assemble_vfetch_instruction(&vp->r700AsmCode,
+						    i,
+						    vp->r700AsmCode.ucVP_AttributeMap[i],
+						    vp->aos_desc[i].size,
+						    vp->aos_desc[i].type,
+						    &vtxFetchMethod);
+		}
+	}
+	
+	return GL_TRUE;
+}
+
+void Map_Vertex_Program(struct r700_vertex_program *vp,
+						struct gl_vertex_program   *mesa_vp)
+{
+    GLuint ui;
+    r700_AssemblerBase *pAsm = &(vp->r700AsmCode);
+	unsigned int num_inputs;
+
+	// R0 will always be used for index into vertex buffer
+	pAsm->number_used_registers = 1;
+	pAsm->starting_vfetch_register_number = pAsm->number_used_registers;
+
+    // Map Inputs: Add 1 to mapping since R0 is used for index
+	num_inputs = Map_Vertex_Input(pAsm, mesa_vp, pAsm->number_used_registers);
+	pAsm->number_used_registers += num_inputs;
+
+	// Create VFETCH instructions for inputs
+	if (GL_TRUE != Process_Vertex_Program_Vfetch_Instructions(vp, mesa_vp) ) 
+	{
+		r700_error(ERROR_ASM_VTX_CLAUSE, "Calling Process_Vertex_Program_Vfetch_Instructions return error. \n");
+		return; //error
+	}
+
+	// Map Outputs
+	pAsm->number_of_exports = Map_Vertex_Output(pAsm, mesa_vp, pAsm->number_used_registers);
+
+	pAsm->starting_export_register_number = pAsm->number_used_registers;
+
+	pAsm->number_used_registers += pAsm->number_of_exports;
+
+    pAsm->pucOutMask = (unsigned char*) MALLOC(pAsm->number_of_exports);
+    
+    for(ui=0; ui<pAsm->number_of_exports; ui++)
+    {
+        pAsm->pucOutMask[ui] = 0x0;
+    }
+
+    /* Map temporary registers (GPRs) */
+    pAsm->starting_temp_register_number = pAsm->number_used_registers;
+
+    if(mesa_vp->Base.NumNativeTemporaries >= mesa_vp->Base.NumTemporaries)
+    {   /* arb uses NumNativeTemporaries */
+        pAsm->number_used_registers += mesa_vp->Base.NumNativeTemporaries;
+    }
+    else
+    {   /* fix func t_vp uses NumTemporaries */
+        pAsm->number_used_registers += mesa_vp->Base.NumTemporaries;
+    }
+	
+    pAsm->uFirstHelpReg = pAsm->number_used_registers;
+}
+
+GLboolean Find_Instruction_Dependencies_vp(struct r700_vertex_program *vp,
+					                	struct gl_vertex_program   *mesa_vp)
+{
+    GLuint i, j;
+    GLint * puiTEMPwrites;
+    struct prog_instruction *pILInst;
+    InstDeps         *pInstDeps;
+
+    puiTEMPwrites = (GLint*) MALLOC(sizeof(GLuint)*mesa_vp->Base.NumTemporaries);
+    for(i=0; i<mesa_vp->Base.NumTemporaries; i++)
+    {
+        puiTEMPwrites[i] = -1;
+    }
+
+    pInstDeps = (InstDeps*)MALLOC(sizeof(InstDeps)*mesa_vp->Base.NumInstructions);
+
+    for(i=0; i<mesa_vp->Base.NumInstructions; i++)
+    {
+        pInstDeps[i].nDstDep = -1;
+        pILInst = &(mesa_vp->Base.Instructions[i]);
+
+        //Dst
+        if(pILInst->DstReg.File == PROGRAM_TEMPORARY)
+        {
+            //Set lastwrite for the temp
+            puiTEMPwrites[pILInst->DstReg.Index] = i;
+        }
+
+        //Src
+        for(j=0; j<3; j++)
+        {
+            if(pILInst->SrcReg[j].File == PROGRAM_TEMPORARY)
+            {
+                //Set dep.
+                pInstDeps[i].nSrcDeps[j] = puiTEMPwrites[pILInst->SrcReg[j].Index];
+            }
+            else
+            {
+                pInstDeps[i].nSrcDeps[j] = -1;
+            }
+        }
+    }
+
+    vp->r700AsmCode.pInstDeps = pInstDeps;
+
+    FREE(puiTEMPwrites);
+
+    return GL_TRUE;
+}
+
+GLboolean r700TranslateVertexShader(struct r700_vertex_program *vp,
+							   struct gl_vertex_program   *mesa_vp)
+{
+	//Init_Program
+	Init_r700_AssemblerBase(SPT_VP, &(vp->r700AsmCode), &(vp->r700Shader) );
+	Map_Vertex_Program( vp, mesa_vp );
+
+	if(GL_FALSE == Find_Instruction_Dependencies_vp(vp, mesa_vp))
+	{
+		return GL_FALSE;
+    }
+
+	if(GL_FALSE == AssembleInstr(mesa_vp->Base.NumInstructions,
+                                 &(mesa_vp->Base.Instructions[0]), 
+                                 &(vp->r700AsmCode)) )
+	{
+		return GL_FALSE;
+	} 
+
+    if(GL_FALSE == Process_Vertex_Exports(&(vp->r700AsmCode), mesa_vp->Base.OutputsWritten) )
+    {
+        return GL_FALSE;
+    }
+
+    vp->r700Shader.nRegs = (vp->r700AsmCode.number_used_registers == 0) ? 0 
+                         : (vp->r700AsmCode.number_used_registers - 1);
+
+	vp->r700Shader.nParamExports = vp->r700AsmCode.number_of_exports;
+
+    vp->translated = GL_TRUE;
+
+	return GL_TRUE;
+}
+
+void r700SelectVertexShader(GLcontext *ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+    struct r700_vertex_program *vpc
+             = (struct r700_vertex_program *)ctx->VertexProgram._Current;
+    if (context->radeon.radeonScreen->chip_family < CHIP_FAMILY_RV770)
+    {
+        vpc->r700AsmCode.bR6xx = 1;
+    }
+    
+    TNLcontext *tnl = TNL_CONTEXT(ctx);
+	struct vertex_buffer *vb = &tnl->vb;
+
+    unsigned int unBit;
+    unsigned int i;
+	for(i=0; i<VERT_ATTRIB_MAX; i++)
+	{
+		unBit = 1 << i;
+		if(vpc->mesa_program.Base.InputsRead & unBit) /* ctx->Array.ArrayObj->xxxxxxx */
+		{
+			vpc->aos_desc[i].size   = vb->AttribPtr[i]->size;
+			vpc->aos_desc[i].stride = vb->AttribPtr[i]->size * sizeof(GL_FLOAT);/* when emit array, data is packed. vb->AttribPtr[i]->stride;*/
+			vpc->aos_desc[i].type   = GL_FLOAT;
+		}
+	}
+
+    if(GL_FALSE == vpc->translated)
+    {
+        r700TranslateVertexShader(vpc,
+		    					  &(vpc->mesa_program) );
+    }
+}
+
+void * r700GetActiveVpShaderBo(GLcontext * ctx)
+{
+    struct r700_vertex_program *vp
+             = (struct r700_vertex_program *)ctx->VertexProgram._Current;
+
+    return vp->shaderbo;
+}
+
+GLboolean r700SetupVertexProgram(GLcontext * ctx)
+{
+    context_t *context = R700_CONTEXT(ctx);
+
+    BATCH_LOCALS(&context->radeon);
+
+    R700_CHIP_CONTEXT *r700 = (R700_CHIP_CONTEXT*)(&context->hw);
+
+    struct r700_vertex_program *vp
+             = (struct r700_vertex_program *)ctx->VertexProgram._Current;
+
+    struct gl_program_parameter_list *paramList;
+    unsigned int unNumParamData;
+
+    unsigned int ui;
+
+    if(GL_FALSE == vp->loaded)
+    {
+        if(vp->r700Shader.bNeedsAssembly == GL_TRUE)
+	    {
+		    Assemble( &(vp->r700Shader) );
+	    }
+
+        /* Load vp to gpu */
+        r600EmitShader(ctx, 
+                       &(vp->shaderbo), 
+                       (GLvoid *)(vp->r700Shader.pProgram),
+                       vp->r700Shader.uShaderBinaryDWORDSize,
+                       "VS"); 
+
+        vp->loaded = GL_TRUE;
+    }
+
+    DumpHwBinary(DUMP_VERTEX_SHADER, (GLvoid *)(vp->r700Shader.pProgram),
+                 vp->r700Shader.uShaderBinaryDWORDSize);
+
+    /* TODO : enable this after MemUse fixed *=
+    (context->chipobj.MemUse)(context, vp->shadercode.buf->id);
+    */
+
+    r700->vs.SQ_PGM_START_VS.u32All = 0; /* set from buffer object. */ 
+    
+    SETfield(r700->vs.SQ_PGM_RESOURCES_VS.u32All, vp->r700Shader.nRegs + 1,
+             NUM_GPRS_shift, NUM_GPRS_mask);
+
+    if(vp->r700Shader.uStackSize) /* we don't use branch for now, it should be zero. */
+	{
+        SETfield(r700->vs.SQ_PGM_RESOURCES_VS.u32All, vp->r700Shader.uStackSize,
+                 STACK_SIZE_shift, STACK_SIZE_mask);
+    }
+
+    SETfield(r700->SPI_VS_OUT_CONFIG.u32All, vp->r700Shader.nParamExports ? (vp->r700Shader.nParamExports - 1) : 0,
+             VS_EXPORT_COUNT_shift, VS_EXPORT_COUNT_mask);
+	SETfield(r700->SPI_PS_IN_CONTROL_0.u32All, vp->r700Shader.nParamExports,
+             NUM_INTERP_shift, NUM_INTERP_mask);
+
+    /*
+    SETbit(r700->SPI_PS_IN_CONTROL_0.u32All, PERSP_GRADIENT_ENA_bit);
+    CLEARbit(r700->SPI_PS_IN_CONTROL_0.u32All, LINEAR_GRADIENT_ENA_bit);
+    */
+
+    /* sent out shader constants. */
+
+    paramList = vp->mesa_program.Base.Parameters;
+
+    if(NULL != paramList)
+    {
+        _mesa_load_state_parameters(ctx, paramList);
+
+        unNumParamData = paramList->NumParameters * 4;
+
+        BEGIN_BATCH_NO_AUTOSTATE(unNumParamData + 2); 
+        
+        R600_OUT_BATCH(CP_PACKET3(R600_IT_SET_ALU_CONST, unNumParamData));
+        /* assembler map const from very beginning. */
+        R600_OUT_BATCH(SQ_ALU_CONSTANT_VS_OFFSET * 4);
+
+        unNumParamData = paramList->NumParameters;
+
+        for(ui=0; ui<unNumParamData; ui++)
+        {
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][0])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][1])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][2])));
+            R600_OUT_BATCH(*((unsigned int*)&(paramList->ParameterValues[ui][3])));
+        }
+        END_BATCH();
+        COMMIT_BATCH();
+    }
+
+    return GL_TRUE;
+}
+
+
+
+
diff --git a/src/mesa/drivers/dri/r600/r700_vertprog.h b/src/mesa/drivers/dri/r600/r700_vertprog.h
new file mode 100644
index 00000000000..6a9726a3d0f
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/r700_vertprog.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Authors:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+
+#ifndef _R700_VERTPROG_H_
+#define _R700_VERTPROG_H_
+
+#include "main/glheader.h"
+#include "main/mtypes.h" 
+
+#include "r700_shader.h"
+#include "r700_assembler.h"
+
+typedef struct ArrayDesc //TEMP
+{
+	GLint size;   //number of data element
+	GLenum type;  //data element type
+	GLsizei stride;
+} ArrayDesc;
+
+struct r700_vertex_program 
+{
+    struct gl_vertex_program mesa_program; /* Must be first */
+
+	struct r700_vertex_program *next;
+
+    r700_AssemblerBase r700AsmCode;
+    R700_Shader        r700Shader;
+
+	GLboolean translated;
+    GLboolean loaded;
+	
+	/* ... */
+
+    void * shaderbo;
+
+	ArrayDesc              aos_desc[VERT_ATTRIB_MAX];
+};
+
+//Internal
+unsigned int Map_Vertex_Output(r700_AssemblerBase       *pAsm, 
+					           struct gl_vertex_program *mesa_vp,
+					           unsigned int unStart);
+unsigned int Map_Vertex_Input(r700_AssemblerBase       *pAsm, 
+					          struct gl_vertex_program *mesa_vp,
+					          unsigned int unStart);
+GLboolean Process_Vertex_Program_Vfetch_Instructions(
+						struct r700_vertex_program *vp,
+						struct gl_vertex_program   *mesa_vp);
+void Map_Vertex_Program(struct r700_vertex_program *vp,
+						struct gl_vertex_program   *mesa_vp);
+GLboolean Find_Instruction_Dependencies_vp(struct r700_vertex_program *vp,
+					                	   struct gl_vertex_program   *mesa_vp);
+
+/* Interface */
+extern GLboolean r700TranslateVertexShader(struct r700_vertex_program *vp,
+							   struct gl_vertex_program   *mesa_vp);
+
+extern void r700SelectVertexShader(GLcontext *ctx);
+
+extern GLboolean r700SetupVertexProgram(GLcontext * ctx);
+
+extern void *    r700GetActiveVpShaderBo(GLcontext * ctx);
+
+#endif /* _R700_VERTPROG_H_ */
diff --git a/src/mesa/drivers/dri/r300/radeon_lock.h b/src/mesa/drivers/dri/r600/radeon_context.h
index a344837f475..250570f6b89 100644
--- a/src/mesa/drivers/dri/r300/radeon_lock.h
+++ b/src/mesa/drivers/dri/r600/radeon_context.h
@@ -37,79 +37,40 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *   Gareth Hughes <[email protected]>
  *   Keith Whitwell <[email protected]>
  *   Kevin E. Martin <[email protected]>
+ *   Nicolai Haehnle <[email protected]>
  */
 
-#ifndef __RADEON_LOCK_H__
-#define __RADEON_LOCK_H__
+#ifndef __RADEON_CONTEXT_H__
+#define __RADEON_CONTEXT_H__
 
-#include "radeon_context.h"
+#include "main/mtypes.h"
+#include "main/colormac.h"
+#include "radeon_screen.h"
+#include "drm.h"
+#include "dri_util.h"
 
-extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
-extern void radeonUpdatePageFlipping(radeonContextPtr rmesa);
+#include "radeon_screen.h"
 
-/* Turn DEBUG_LOCKING on to find locking conflicts.
- */
-#define DEBUG_LOCKING	0
-
-#if DEBUG_LOCKING
-extern char *prevLockFile;
-extern int prevLockLine;
-
-#define DEBUG_LOCK()							\
-   do {									\
-      prevLockFile = (__FILE__);					\
-      prevLockLine = (__LINE__);					\
-   } while (0)
-
-#define DEBUG_RESET()							\
-   do {									\
-      prevLockFile = 0;							\
-      prevLockLine = 0;							\
-   } while (0)
-
-#define DEBUG_CHECK_LOCK()						\
-   do {									\
-      if (prevLockFile) {						\
-	 fprintf(stderr,						\
-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
-		  prevLockFile, prevLockLine, __FILE__, __LINE__);	\
-	 exit(1);							\
-      }									\
-   } while (0)
+#if R200_MERGED
+extern void radeonFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 
+#define FALLBACK( radeon, bit, mode ) do {			\
+   if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",	\
+		     __FUNCTION__, bit, mode );			\
+   radeonFallback( (radeon)->glCtx, bit, mode );		\
+} while (0)
 #else
+#define FALLBACK( radeon, bit, mode ) fprintf(stderr, "%s:%s\n", __LINE__, __FILE__);
+#endif
 
-#define DEBUG_LOCK()
-#define DEBUG_RESET()
-#define DEBUG_CHECK_LOCK()
+/* TCL fallbacks */
+extern void radeonTclFallback(GLcontext * ctx, GLuint bit, GLboolean mode);
 
+#if R200_MERGED
+#define TCL_FALLBACK( ctx, bit, mode )	radeonTclFallback( ctx, bit, mode )
+#else
+#define TCL_FALLBACK( ctx, bit, mode )	;
 #endif
 
-/*
- * !!! We may want to separate locks from locks with validation.  This
- * could be used to improve performance for those things commands that
- * do not do any drawing !!!
- */
 
-/* Lock the hardware and validate our state.
- */
-#define LOCK_HARDWARE( rmesa )						\
-	do {								\
-		char __ret = 0;						\
-		DEBUG_CHECK_LOCK();					\
-		DRM_CAS((rmesa)->dri.hwLock, (rmesa)->dri.hwContext,	\
-			(DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret); \
-		if (__ret)						\
-			radeonGetLock((rmesa), 0);			\
-		DEBUG_LOCK();						\
-	} while (0)
-
-#define UNLOCK_HARDWARE( rmesa )					\
-	do {								\
-		DRM_UNLOCK((rmesa)->dri.fd,				\
-			(rmesa)->dri.hwLock,				\
-			(rmesa)->dri.hwContext);			\
-		DEBUG_RESET();						\
-	} while (0)
-
-#endif				/* __RADEON_LOCK_H__ */
+#endif				/* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/r600/sq_micro_reg.h b/src/mesa/drivers/dri/r600/sq_micro_reg.h
new file mode 100644
index 00000000000..bfd21cef623
--- /dev/null
+++ b/src/mesa/drivers/dri/r600/sq_micro_reg.h
@@ -0,0 +1,2008 @@
+/*
+ * Copyright (C) 2008-2009  Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDER(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/*
+ * Contacts:
+ *   Richard Li <[email protected]>, <[email protected]>
+ */
+
+#if !defined (_SQ_MICRO_REG_H)
+#define _SQ_MICRO_REG_H
+
+#if defined(LITTLEENDIAN_CPU)
+#elif defined(BIGENDIAN_CPU)
+#else
+#error "BIGENDIAN_CPU or LITTLEENDIAN_CPU must be defined"
+#endif
+
+/*
+ * SQ_ALU_SRC_GPR_BASE value
+ */
+
+#define SQ_ALU_SRC_GPR_BASE            0x00000000
+
+/*
+ * SQ_ALU_SRC_GPR_SIZE value
+ */
+
+#define SQ_ALU_SRC_GPR_SIZE            0x00000080
+
+/*
+ * SQ_ALU_SRC_KCACHE0_BASE value
+ */
+
+#define SQ_ALU_SRC_KCACHE0_BASE        0x00000080
+
+/*
+ * SQ_ALU_SRC_KCACHE0_SIZE value
+ */
+
+#define SQ_ALU_SRC_KCACHE0_SIZE        0x00000020
+
+/*
+ * SQ_ALU_SRC_KCACHE1_BASE value
+ */
+
+#define SQ_ALU_SRC_KCACHE1_BASE        0x000000a0
+
+/*
+ * SQ_ALU_SRC_KCACHE1_SIZE value
+ */
+
+#define SQ_ALU_SRC_KCACHE1_SIZE        0x00000020
+
+/*
+ * SQ_ALU_SRC_CFILE_BASE value
+ */
+
+#define SQ_ALU_SRC_CFILE_BASE          0x00000100
+
+/*
+ * SQ_ALU_SRC_CFILE_SIZE value
+ */
+
+#define SQ_ALU_SRC_CFILE_SIZE          0x00000100
+
+/*
+ * SQ_SP_OP_REDUC_BEGIN value
+ */
+
+#define SQ_SP_OP_REDUC_BEGIN           0x00000050
+
+/*
+ * SQ_SP_OP_REDUC_END value
+ */
+
+#define SQ_SP_OP_REDUC_END             0x00000053
+
+/*
+ * SQ_SP_OP_TRANS_BEGIN value
+ */
+
+#define SQ_SP_OP_TRANS_BEGIN           0x00000060
+
+/*
+ * SQ_SP_OP_TRANS_END value
+ */
+
+#define SQ_SP_OP_TRANS_END             0x0000007f
+
+/*
+ * SQ_CF_WORD0 struct
+ */
+
+#define SQ_CF_WORD0_ADDR_SIZE          32
+
+#define SQ_CF_WORD0_ADDR_SHIFT         0
+
+#define SQ_CF_WORD0_ADDR_MASK          0xffffffff
+
+#define SQ_CF_WORD0_MASK \
+     (SQ_CF_WORD0_ADDR_MASK)
+
+#define SQ_CF_WORD0_DEFAULT            0xcdcdcdcd
+
+#define SQ_CF_WORD0_GET_ADDR(sq_cf_word0) \
+     ((sq_cf_word0 & SQ_CF_WORD0_ADDR_MASK) >> SQ_CF_WORD0_ADDR_SHIFT)
+
+#define SQ_CF_WORD0_SET_ADDR(sq_cf_word0_reg, addr) \
+     sq_cf_word0_reg = (sq_cf_word0_reg & ~SQ_CF_WORD0_ADDR_MASK) | (addr << SQ_CF_WORD0_ADDR_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_word0_t {
+          unsigned int addr                           : SQ_CF_WORD0_ADDR_SIZE;
+     } sq_cf_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_word0_t {
+          unsigned int addr                           : SQ_CF_WORD0_ADDR_SIZE;
+     } sq_cf_word0_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_word0_t f;
+} sq_cf_word0_u;
+
+
+/*
+ * SQ_CF_WORD1 struct
+ */
+
+#define SQ_CF_WORD1_POP_COUNT_SIZE     3
+#define SQ_CF_WORD1_CF_CONST_SIZE      5
+#define SQ_CF_WORD1_COND_SIZE          2
+#define SQ_CF_WORD1_COUNT_SIZE         3
+#define SQ_CF_WORD1_CALL_COUNT_SIZE    6
+#define SQ_CF_WORD1_COUNT_3_SIZE       1
+#define SQ_CF_WORD1_END_OF_PROGRAM_SIZE 1
+#define SQ_CF_WORD1_VALID_PIXEL_MODE_SIZE 1
+#define SQ_CF_WORD1_CF_INST_SIZE       7
+#define SQ_CF_WORD1_WHOLE_QUAD_MODE_SIZE 1
+#define SQ_CF_WORD1_BARRIER_SIZE       1
+
+#define SQ_CF_WORD1_POP_COUNT_SHIFT    0
+#define SQ_CF_WORD1_CF_CONST_SHIFT     3
+#define SQ_CF_WORD1_COND_SHIFT         8
+#define SQ_CF_WORD1_COUNT_SHIFT        10
+#define SQ_CF_WORD1_CALL_COUNT_SHIFT   13
+#define SQ_CF_WORD1_COUNT_3_SHIFT      19
+#define SQ_CF_WORD1_END_OF_PROGRAM_SHIFT 21
+#define SQ_CF_WORD1_VALID_PIXEL_MODE_SHIFT 22
+#define SQ_CF_WORD1_CF_INST_SHIFT      23
+#define SQ_CF_WORD1_WHOLE_QUAD_MODE_SHIFT 30
+#define SQ_CF_WORD1_BARRIER_SHIFT      31
+
+#define SQ_CF_WORD1_POP_COUNT_MASK     0x00000007
+#define SQ_CF_WORD1_CF_CONST_MASK      0x000000f8
+#define SQ_CF_WORD1_COND_MASK          0x00000300
+#define SQ_CF_WORD1_COUNT_MASK         0x00001c00
+#define SQ_CF_WORD1_CALL_COUNT_MASK    0x0007e000
+#define SQ_CF_WORD1_COUNT_3_MASK       0x00080000
+#define SQ_CF_WORD1_END_OF_PROGRAM_MASK 0x00200000
+#define SQ_CF_WORD1_VALID_PIXEL_MODE_MASK 0x00400000
+#define SQ_CF_WORD1_CF_INST_MASK       0x3f800000
+#define SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK 0x40000000
+#define SQ_CF_WORD1_BARRIER_MASK       0x80000000
+
+#define SQ_CF_WORD1_MASK \
+     (SQ_CF_WORD1_POP_COUNT_MASK | \
+      SQ_CF_WORD1_CF_CONST_MASK | \
+      SQ_CF_WORD1_COND_MASK | \
+      SQ_CF_WORD1_COUNT_MASK | \
+      SQ_CF_WORD1_CALL_COUNT_MASK | \
+      SQ_CF_WORD1_COUNT_3_MASK | \
+      SQ_CF_WORD1_END_OF_PROGRAM_MASK | \
+      SQ_CF_WORD1_VALID_PIXEL_MODE_MASK | \
+      SQ_CF_WORD1_CF_INST_MASK | \
+      SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK | \
+      SQ_CF_WORD1_BARRIER_MASK)
+
+#define SQ_CF_WORD1_DEFAULT            0xcdcdcdcd
+
+#define SQ_CF_WORD1_GET_POP_COUNT(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_POP_COUNT_MASK) >> SQ_CF_WORD1_POP_COUNT_SHIFT)
+#define SQ_CF_WORD1_GET_CF_CONST(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_CF_CONST_MASK) >> SQ_CF_WORD1_CF_CONST_SHIFT)
+#define SQ_CF_WORD1_GET_COND(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_COND_MASK) >> SQ_CF_WORD1_COND_SHIFT)
+#define SQ_CF_WORD1_GET_COUNT(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_COUNT_MASK) >> SQ_CF_WORD1_COUNT_SHIFT)
+#define SQ_CF_WORD1_GET_CALL_COUNT(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_CALL_COUNT_MASK) >> SQ_CF_WORD1_CALL_COUNT_SHIFT)
+#define SQ_CF_WORD1_GET_COUNT_3(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_COUNT_3_MASK) >> SQ_CF_WORD1_COUNT_3_SHIFT)
+#define SQ_CF_WORD1_GET_END_OF_PROGRAM(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_END_OF_PROGRAM_MASK) >> SQ_CF_WORD1_END_OF_PROGRAM_SHIFT)
+#define SQ_CF_WORD1_GET_VALID_PIXEL_MODE(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_VALID_PIXEL_MODE_MASK) >> SQ_CF_WORD1_VALID_PIXEL_MODE_SHIFT)
+#define SQ_CF_WORD1_GET_CF_INST(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_CF_INST_MASK) >> SQ_CF_WORD1_CF_INST_SHIFT)
+#define SQ_CF_WORD1_GET_WHOLE_QUAD_MODE(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK) >> SQ_CF_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_WORD1_GET_BARRIER(sq_cf_word1) \
+     ((sq_cf_word1 & SQ_CF_WORD1_BARRIER_MASK) >> SQ_CF_WORD1_BARRIER_SHIFT)
+
+#define SQ_CF_WORD1_SET_POP_COUNT(sq_cf_word1_reg, pop_count) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_POP_COUNT_MASK) | (pop_count << SQ_CF_WORD1_POP_COUNT_SHIFT)
+#define SQ_CF_WORD1_SET_CF_CONST(sq_cf_word1_reg, cf_const) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_CF_CONST_MASK) | (cf_const << SQ_CF_WORD1_CF_CONST_SHIFT)
+#define SQ_CF_WORD1_SET_COND(sq_cf_word1_reg, cond) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_COND_MASK) | (cond << SQ_CF_WORD1_COND_SHIFT)
+#define SQ_CF_WORD1_SET_COUNT(sq_cf_word1_reg, count) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_COUNT_MASK) | (count << SQ_CF_WORD1_COUNT_SHIFT)
+#define SQ_CF_WORD1_SET_CALL_COUNT(sq_cf_word1_reg, call_count) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_CALL_COUNT_MASK) | (call_count << SQ_CF_WORD1_CALL_COUNT_SHIFT)
+#define SQ_CF_WORD1_SET_COUNT_3(sq_cf_word1_reg, count_3) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_COUNT_3_MASK) | (count_3 << SQ_CF_WORD1_COUNT_3_SHIFT)
+#define SQ_CF_WORD1_SET_END_OF_PROGRAM(sq_cf_word1_reg, end_of_program) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_END_OF_PROGRAM_MASK) | (end_of_program << SQ_CF_WORD1_END_OF_PROGRAM_SHIFT)
+#define SQ_CF_WORD1_SET_VALID_PIXEL_MODE(sq_cf_word1_reg, valid_pixel_mode) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_VALID_PIXEL_MODE_MASK) | (valid_pixel_mode << SQ_CF_WORD1_VALID_PIXEL_MODE_SHIFT)
+#define SQ_CF_WORD1_SET_CF_INST(sq_cf_word1_reg, cf_inst) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_CF_INST_MASK) | (cf_inst << SQ_CF_WORD1_CF_INST_SHIFT)
+#define SQ_CF_WORD1_SET_WHOLE_QUAD_MODE(sq_cf_word1_reg, whole_quad_mode) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_WHOLE_QUAD_MODE_MASK) | (whole_quad_mode << SQ_CF_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_WORD1_SET_BARRIER(sq_cf_word1_reg, barrier) \
+     sq_cf_word1_reg = (sq_cf_word1_reg & ~SQ_CF_WORD1_BARRIER_MASK) | (barrier << SQ_CF_WORD1_BARRIER_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_word1_t {
+          unsigned int pop_count                      : SQ_CF_WORD1_POP_COUNT_SIZE;
+          unsigned int cf_const                       : SQ_CF_WORD1_CF_CONST_SIZE;
+          unsigned int cond                           : SQ_CF_WORD1_COND_SIZE;
+          unsigned int count                          : SQ_CF_WORD1_COUNT_SIZE;
+          unsigned int call_count                     : SQ_CF_WORD1_CALL_COUNT_SIZE;
+          unsigned int count_3                        : SQ_CF_WORD1_COUNT_3_SIZE;
+          unsigned int                                : 1;
+          unsigned int end_of_program                 : SQ_CF_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_WORD1_CF_INST_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int barrier                        : SQ_CF_WORD1_BARRIER_SIZE;
+     } sq_cf_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_word1_t {
+          unsigned int barrier                        : SQ_CF_WORD1_BARRIER_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_WORD1_CF_INST_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int end_of_program                 : SQ_CF_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int                                : 1;
+          unsigned int count_3                        : SQ_CF_WORD1_COUNT_3_SIZE;
+          unsigned int call_count                     : SQ_CF_WORD1_CALL_COUNT_SIZE;
+          unsigned int count                          : SQ_CF_WORD1_COUNT_SIZE;
+          unsigned int cond                           : SQ_CF_WORD1_COND_SIZE;
+          unsigned int cf_const                       : SQ_CF_WORD1_CF_CONST_SIZE;
+          unsigned int pop_count                      : SQ_CF_WORD1_POP_COUNT_SIZE;
+     } sq_cf_word1_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_word1_t f;
+} sq_cf_word1_u;
+
+
+/*
+ * SQ_CF_ALU_WORD0 struct
+ */
+
+#define SQ_CF_ALU_WORD0_ADDR_SIZE      22
+#define SQ_CF_ALU_WORD0_KCACHE_BANK0_SIZE 4
+#define SQ_CF_ALU_WORD0_KCACHE_BANK1_SIZE 4
+#define SQ_CF_ALU_WORD0_KCACHE_MODE0_SIZE 2
+
+#define SQ_CF_ALU_WORD0_ADDR_SHIFT     0
+#define SQ_CF_ALU_WORD0_KCACHE_BANK0_SHIFT 22
+#define SQ_CF_ALU_WORD0_KCACHE_BANK1_SHIFT 26
+#define SQ_CF_ALU_WORD0_KCACHE_MODE0_SHIFT 30
+
+#define SQ_CF_ALU_WORD0_ADDR_MASK      0x003fffff
+#define SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK 0x03c00000
+#define SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK 0x3c000000
+#define SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK 0xc0000000
+
+#define SQ_CF_ALU_WORD0_MASK \
+     (SQ_CF_ALU_WORD0_ADDR_MASK | \
+      SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK | \
+      SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK | \
+      SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK)
+
+#define SQ_CF_ALU_WORD0_DEFAULT        0xcdcdcdcd
+
+#define SQ_CF_ALU_WORD0_GET_ADDR(sq_cf_alu_word0) \
+     ((sq_cf_alu_word0 & SQ_CF_ALU_WORD0_ADDR_MASK) >> SQ_CF_ALU_WORD0_ADDR_SHIFT)
+#define SQ_CF_ALU_WORD0_GET_KCACHE_BANK0(sq_cf_alu_word0) \
+     ((sq_cf_alu_word0 & SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK) >> SQ_CF_ALU_WORD0_KCACHE_BANK0_SHIFT)
+#define SQ_CF_ALU_WORD0_GET_KCACHE_BANK1(sq_cf_alu_word0) \
+     ((sq_cf_alu_word0 & SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK) >> SQ_CF_ALU_WORD0_KCACHE_BANK1_SHIFT)
+#define SQ_CF_ALU_WORD0_GET_KCACHE_MODE0(sq_cf_alu_word0) \
+     ((sq_cf_alu_word0 & SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK) >> SQ_CF_ALU_WORD0_KCACHE_MODE0_SHIFT)
+
+#define SQ_CF_ALU_WORD0_SET_ADDR(sq_cf_alu_word0_reg, addr) \
+     sq_cf_alu_word0_reg = (sq_cf_alu_word0_reg & ~SQ_CF_ALU_WORD0_ADDR_MASK) | (addr << SQ_CF_ALU_WORD0_ADDR_SHIFT)
+#define SQ_CF_ALU_WORD0_SET_KCACHE_BANK0(sq_cf_alu_word0_reg, kcache_bank0) \
+     sq_cf_alu_word0_reg = (sq_cf_alu_word0_reg & ~SQ_CF_ALU_WORD0_KCACHE_BANK0_MASK) | (kcache_bank0 << SQ_CF_ALU_WORD0_KCACHE_BANK0_SHIFT)
+#define SQ_CF_ALU_WORD0_SET_KCACHE_BANK1(sq_cf_alu_word0_reg, kcache_bank1) \
+     sq_cf_alu_word0_reg = (sq_cf_alu_word0_reg & ~SQ_CF_ALU_WORD0_KCACHE_BANK1_MASK) | (kcache_bank1 << SQ_CF_ALU_WORD0_KCACHE_BANK1_SHIFT)
+#define SQ_CF_ALU_WORD0_SET_KCACHE_MODE0(sq_cf_alu_word0_reg, kcache_mode0) \
+     sq_cf_alu_word0_reg = (sq_cf_alu_word0_reg & ~SQ_CF_ALU_WORD0_KCACHE_MODE0_MASK) | (kcache_mode0 << SQ_CF_ALU_WORD0_KCACHE_MODE0_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_alu_word0_t {
+          unsigned int addr                           : SQ_CF_ALU_WORD0_ADDR_SIZE;
+          unsigned int kcache_bank0                   : SQ_CF_ALU_WORD0_KCACHE_BANK0_SIZE;
+          unsigned int kcache_bank1                   : SQ_CF_ALU_WORD0_KCACHE_BANK1_SIZE;
+          unsigned int kcache_mode0                   : SQ_CF_ALU_WORD0_KCACHE_MODE0_SIZE;
+     } sq_cf_alu_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_alu_word0_t {
+          unsigned int kcache_mode0                   : SQ_CF_ALU_WORD0_KCACHE_MODE0_SIZE;
+          unsigned int kcache_bank1                   : SQ_CF_ALU_WORD0_KCACHE_BANK1_SIZE;
+          unsigned int kcache_bank0                   : SQ_CF_ALU_WORD0_KCACHE_BANK0_SIZE;
+          unsigned int addr                           : SQ_CF_ALU_WORD0_ADDR_SIZE;
+     } sq_cf_alu_word0_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alu_word0_t f;
+} sq_cf_alu_word0_u;
+
+
+/*
+ * SQ_CF_ALU_WORD1 struct
+ */
+
+#define SQ_CF_ALU_WORD1_KCACHE_MODE1_SIZE 2
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR0_SIZE 8
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR1_SIZE 8
+#define SQ_CF_ALU_WORD1_COUNT_SIZE     7
+#define SQ_CF_ALU_WORD1_ALT_CONST_SIZE 1
+#define SQ_CF_ALU_WORD1_CF_INST_SIZE   4
+#define SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SIZE 1
+#define SQ_CF_ALU_WORD1_BARRIER_SIZE   1
+
+#define SQ_CF_ALU_WORD1_KCACHE_MODE1_SHIFT 0
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR0_SHIFT 2
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR1_SHIFT 10
+#define SQ_CF_ALU_WORD1_COUNT_SHIFT    18
+#define SQ_CF_ALU_WORD1_ALT_CONST_SHIFT 25
+#define SQ_CF_ALU_WORD1_CF_INST_SHIFT  26
+#define SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SHIFT 30
+#define SQ_CF_ALU_WORD1_BARRIER_SHIFT  31
+
+#define SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK 0x00000003
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK 0x000003fc
+#define SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK 0x0003fc00
+#define SQ_CF_ALU_WORD1_COUNT_MASK     0x01fc0000
+#define SQ_CF_ALU_WORD1_ALT_CONST_MASK 0x02000000
+#define SQ_CF_ALU_WORD1_CF_INST_MASK   0x3c000000
+#define SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK 0x40000000
+#define SQ_CF_ALU_WORD1_BARRIER_MASK   0x80000000
+
+#define SQ_CF_ALU_WORD1_MASK \
+     (SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK | \
+      SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK | \
+      SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK | \
+      SQ_CF_ALU_WORD1_COUNT_MASK | \
+      SQ_CF_ALU_WORD1_ALT_CONST_MASK | \
+      SQ_CF_ALU_WORD1_CF_INST_MASK | \
+      SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK | \
+      SQ_CF_ALU_WORD1_BARRIER_MASK)
+
+#define SQ_CF_ALU_WORD1_DEFAULT        0xcdcdcdcd
+
+#define SQ_CF_ALU_WORD1_GET_KCACHE_MODE1(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK) >> SQ_CF_ALU_WORD1_KCACHE_MODE1_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_KCACHE_ADDR0(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK) >> SQ_CF_ALU_WORD1_KCACHE_ADDR0_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_KCACHE_ADDR1(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK) >> SQ_CF_ALU_WORD1_KCACHE_ADDR1_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_COUNT(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_COUNT_MASK) >> SQ_CF_ALU_WORD1_COUNT_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_ALT_CONST(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_ALT_CONST_MASK) >> SQ_CF_ALU_WORD1_ALT_CONST_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_CF_INST(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_CF_INST_MASK) >> SQ_CF_ALU_WORD1_CF_INST_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_WHOLE_QUAD_MODE(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK) >> SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_ALU_WORD1_GET_BARRIER(sq_cf_alu_word1) \
+     ((sq_cf_alu_word1 & SQ_CF_ALU_WORD1_BARRIER_MASK) >> SQ_CF_ALU_WORD1_BARRIER_SHIFT)
+
+#define SQ_CF_ALU_WORD1_SET_KCACHE_MODE1(sq_cf_alu_word1_reg, kcache_mode1) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_KCACHE_MODE1_MASK) | (kcache_mode1 << SQ_CF_ALU_WORD1_KCACHE_MODE1_SHIFT)
+#define SQ_CF_ALU_WORD1_SET_KCACHE_ADDR0(sq_cf_alu_word1_reg, kcache_addr0) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_KCACHE_ADDR0_MASK) | (kcache_addr0 << SQ_CF_ALU_WORD1_KCACHE_ADDR0_SHIFT)
+#define SQ_CF_ALU_WORD1_SET_KCACHE_ADDR1(sq_cf_alu_word1_reg, kcache_addr1) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_KCACHE_ADDR1_MASK) | (kcache_addr1 << SQ_CF_ALU_WORD1_KCACHE_ADDR1_SHIFT)
+#define SQ_CF_ALU_WORD1_SET_COUNT(sq_cf_alu_word1_reg, count) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_COUNT_MASK) | (count << SQ_CF_ALU_WORD1_COUNT_SHIFT)
+#define SQ_CF_ALU_WORD1_SET_ALT_CONST(sq_cf_alu_word1_reg, alt_const) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_ALT_CONST_MASK) | (alt_const << SQ_CF_ALU_WORD1_ALT_CONST_SHIFT)
+#define SQ_CF_ALU_WORD1_SET_CF_INST(sq_cf_alu_word1_reg, cf_inst) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_CF_INST_MASK) | (cf_inst << SQ_CF_ALU_WORD1_CF_INST_SHIFT)
+#define SQ_CF_ALU_WORD1_SET_WHOLE_QUAD_MODE(sq_cf_alu_word1_reg, whole_quad_mode) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_MASK) | (whole_quad_mode << SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_ALU_WORD1_SET_BARRIER(sq_cf_alu_word1_reg, barrier) \
+     sq_cf_alu_word1_reg = (sq_cf_alu_word1_reg & ~SQ_CF_ALU_WORD1_BARRIER_MASK) | (barrier << SQ_CF_ALU_WORD1_BARRIER_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_alu_word1_t {
+          unsigned int kcache_mode1                   : SQ_CF_ALU_WORD1_KCACHE_MODE1_SIZE;
+          unsigned int kcache_addr0                   : SQ_CF_ALU_WORD1_KCACHE_ADDR0_SIZE;
+          unsigned int kcache_addr1                   : SQ_CF_ALU_WORD1_KCACHE_ADDR1_SIZE;
+          unsigned int count                          : SQ_CF_ALU_WORD1_COUNT_SIZE;
+          unsigned int alt_const                      : SQ_CF_ALU_WORD1_ALT_CONST_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALU_WORD1_CF_INST_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int barrier                        : SQ_CF_ALU_WORD1_BARRIER_SIZE;
+     } sq_cf_alu_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_alu_word1_t {
+          unsigned int barrier                        : SQ_CF_ALU_WORD1_BARRIER_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALU_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALU_WORD1_CF_INST_SIZE;
+          unsigned int alt_const                      : SQ_CF_ALU_WORD1_ALT_CONST_SIZE;
+          unsigned int count                          : SQ_CF_ALU_WORD1_COUNT_SIZE;
+          unsigned int kcache_addr1                   : SQ_CF_ALU_WORD1_KCACHE_ADDR1_SIZE;
+          unsigned int kcache_addr0                   : SQ_CF_ALU_WORD1_KCACHE_ADDR0_SIZE;
+          unsigned int kcache_mode1                   : SQ_CF_ALU_WORD1_KCACHE_MODE1_SIZE;
+     } sq_cf_alu_word1_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alu_word1_t f;
+} sq_cf_alu_word1_u;
+
+
+/*
+ * SQ_CF_ALLOC_EXPORT_WORD0 struct
+ */
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SIZE 13
+#define SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SIZE 2
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SIZE 7
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SIZE 7
+#define SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SIZE 2
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SHIFT 0
+#define SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SHIFT 13
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SHIFT 15
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SHIFT 22
+#define SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SHIFT 23
+#define SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SHIFT 30
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK 0x00001fff
+#define SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK 0x00006000
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK 0x003f8000
+#define SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK 0x00400000
+#define SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK 0x3f800000
+#define SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK 0xc0000000
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_DEFAULT 0xcdcdcdcd
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_ARRAY_BASE(sq_cf_alloc_export_word0) \
+     ((sq_cf_alloc_export_word0 & SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_TYPE(sq_cf_alloc_export_word0) \
+     ((sq_cf_alloc_export_word0 & SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_RW_GPR(sq_cf_alloc_export_word0) \
+     ((sq_cf_alloc_export_word0 & SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_RW_REL(sq_cf_alloc_export_word0) \
+     ((sq_cf_alloc_export_word0 & SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_INDEX_GPR(sq_cf_alloc_export_word0) \
+     ((sq_cf_alloc_export_word0 & SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_GET_ELEM_SIZE(sq_cf_alloc_export_word0) \
+     ((sq_cf_alloc_export_word0 & SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SHIFT)
+
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_ARRAY_BASE(sq_cf_alloc_export_word0_reg, array_base) \
+     sq_cf_alloc_export_word0_reg = (sq_cf_alloc_export_word0_reg & ~SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_MASK) | (array_base << SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_TYPE(sq_cf_alloc_export_word0_reg, type) \
+     sq_cf_alloc_export_word0_reg = (sq_cf_alloc_export_word0_reg & ~SQ_CF_ALLOC_EXPORT_WORD0_TYPE_MASK) | (type << SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_RW_GPR(sq_cf_alloc_export_word0_reg, rw_gpr) \
+     sq_cf_alloc_export_word0_reg = (sq_cf_alloc_export_word0_reg & ~SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_MASK) | (rw_gpr << SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_RW_REL(sq_cf_alloc_export_word0_reg, rw_rel) \
+     sq_cf_alloc_export_word0_reg = (sq_cf_alloc_export_word0_reg & ~SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_MASK) | (rw_rel << SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_INDEX_GPR(sq_cf_alloc_export_word0_reg, index_gpr) \
+     sq_cf_alloc_export_word0_reg = (sq_cf_alloc_export_word0_reg & ~SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_MASK) | (index_gpr << SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD0_SET_ELEM_SIZE(sq_cf_alloc_export_word0_reg, elem_size) \
+     sq_cf_alloc_export_word0_reg = (sq_cf_alloc_export_word0_reg & ~SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_MASK) | (elem_size << SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word0_t {
+          unsigned int array_base                     : SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SIZE;
+          unsigned int type                           : SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SIZE;
+          unsigned int rw_gpr                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SIZE;
+          unsigned int rw_rel                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SIZE;
+          unsigned int index_gpr                      : SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SIZE;
+          unsigned int elem_size                      : SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SIZE;
+     } sq_cf_alloc_export_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word0_t {
+          unsigned int elem_size                      : SQ_CF_ALLOC_EXPORT_WORD0_ELEM_SIZE_SIZE;
+          unsigned int index_gpr                      : SQ_CF_ALLOC_EXPORT_WORD0_INDEX_GPR_SIZE;
+          unsigned int rw_rel                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_REL_SIZE;
+          unsigned int rw_gpr                         : SQ_CF_ALLOC_EXPORT_WORD0_RW_GPR_SIZE;
+          unsigned int type                           : SQ_CF_ALLOC_EXPORT_WORD0_TYPE_SIZE;
+          unsigned int array_base                     : SQ_CF_ALLOC_EXPORT_WORD0_ARRAY_BASE_SIZE;
+     } sq_cf_alloc_export_word0_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alloc_export_word0_t f;
+} sq_cf_alloc_export_word0_u;
+
+
+/*
+ * SQ_CF_ALLOC_EXPORT_WORD1 struct
+ */
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SIZE 4
+#define SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SIZE 7
+#define SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SIZE 1
+#define SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SIZE 1
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SHIFT 17
+#define SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SHIFT 21
+#define SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SHIFT 22
+#define SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SHIFT 23
+#define SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SHIFT 30
+#define SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SHIFT 31
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK 0x001e0000
+#define SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK 0x00200000
+#define SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK 0x00400000
+#define SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK 0x3f800000
+#define SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK 0x40000000
+#define SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK 0x80000000
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_DEFAULT 0xcdcc0000
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_BURST_COUNT(sq_cf_alloc_export_word1) \
+     ((sq_cf_alloc_export_word1 & SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_END_OF_PROGRAM(sq_cf_alloc_export_word1) \
+     ((sq_cf_alloc_export_word1 & SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_VALID_PIXEL_MODE(sq_cf_alloc_export_word1) \
+     ((sq_cf_alloc_export_word1 & SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_CF_INST(sq_cf_alloc_export_word1) \
+     ((sq_cf_alloc_export_word1 & SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_WHOLE_QUAD_MODE(sq_cf_alloc_export_word1) \
+     ((sq_cf_alloc_export_word1 & SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_GET_BARRIER(sq_cf_alloc_export_word1) \
+     ((sq_cf_alloc_export_word1 & SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SHIFT)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_BURST_COUNT(sq_cf_alloc_export_word1_reg, burst_count) \
+     sq_cf_alloc_export_word1_reg = (sq_cf_alloc_export_word1_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_MASK) | (burst_count << SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_END_OF_PROGRAM(sq_cf_alloc_export_word1_reg, end_of_program) \
+     sq_cf_alloc_export_word1_reg = (sq_cf_alloc_export_word1_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_MASK) | (end_of_program << SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_VALID_PIXEL_MODE(sq_cf_alloc_export_word1_reg, valid_pixel_mode) \
+     sq_cf_alloc_export_word1_reg = (sq_cf_alloc_export_word1_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_MASK) | (valid_pixel_mode << SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_CF_INST(sq_cf_alloc_export_word1_reg, cf_inst) \
+     sq_cf_alloc_export_word1_reg = (sq_cf_alloc_export_word1_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_MASK) | (cf_inst << SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_WHOLE_QUAD_MODE(sq_cf_alloc_export_word1_reg, whole_quad_mode) \
+     sq_cf_alloc_export_word1_reg = (sq_cf_alloc_export_word1_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_MASK) | (whole_quad_mode << SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SET_BARRIER(sq_cf_alloc_export_word1_reg, barrier) \
+     sq_cf_alloc_export_word1_reg = (sq_cf_alloc_export_word1_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_MASK) | (barrier << SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word1_t {
+          unsigned int                                : 17;
+          unsigned int burst_count                    : SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SIZE;
+          unsigned int end_of_program                 : SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int barrier                        : SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SIZE;
+     } sq_cf_alloc_export_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word1_t {
+          unsigned int barrier                        : SQ_CF_ALLOC_EXPORT_WORD1_BARRIER_SIZE;
+          unsigned int whole_quad_mode                : SQ_CF_ALLOC_EXPORT_WORD1_WHOLE_QUAD_MODE_SIZE;
+          unsigned int cf_inst                        : SQ_CF_ALLOC_EXPORT_WORD1_CF_INST_SIZE;
+          unsigned int valid_pixel_mode               : SQ_CF_ALLOC_EXPORT_WORD1_VALID_PIXEL_MODE_SIZE;
+          unsigned int end_of_program                 : SQ_CF_ALLOC_EXPORT_WORD1_END_OF_PROGRAM_SIZE;
+          unsigned int burst_count                    : SQ_CF_ALLOC_EXPORT_WORD1_BURST_COUNT_SIZE;
+          unsigned int                                : 17;
+     } sq_cf_alloc_export_word1_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alloc_export_word1_t f;
+} sq_cf_alloc_export_word1_u;
+
+
+/*
+ * SQ_CF_ALLOC_EXPORT_WORD1_BUF struct
+ */
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SIZE 12
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SIZE 4
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SHIFT 0
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SHIFT 12
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK 0x00000fff
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK 0x0000f000
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_DEFAULT 0x0000cdcd
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_GET_ARRAY_SIZE(sq_cf_alloc_export_word1_buf) \
+     ((sq_cf_alloc_export_word1_buf & SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_GET_COMP_MASK(sq_cf_alloc_export_word1_buf) \
+     ((sq_cf_alloc_export_word1_buf & SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SHIFT)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_SET_ARRAY_SIZE(sq_cf_alloc_export_word1_buf_reg, array_size) \
+     sq_cf_alloc_export_word1_buf_reg = (sq_cf_alloc_export_word1_buf_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_MASK) | (array_size << SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_BUF_SET_COMP_MASK(sq_cf_alloc_export_word1_buf_reg, comp_mask) \
+     sq_cf_alloc_export_word1_buf_reg = (sq_cf_alloc_export_word1_buf_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_MASK) | (comp_mask << SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word1_buf_t {
+          unsigned int array_size                     : SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SIZE;
+          unsigned int comp_mask                      : SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SIZE;
+          unsigned int                                : 16;
+     } sq_cf_alloc_export_word1_buf_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word1_buf_t {
+          unsigned int                                : 16;
+          unsigned int comp_mask                      : SQ_CF_ALLOC_EXPORT_WORD1_BUF_COMP_MASK_SIZE;
+          unsigned int array_size                     : SQ_CF_ALLOC_EXPORT_WORD1_BUF_ARRAY_SIZE_SIZE;
+     } sq_cf_alloc_export_word1_buf_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alloc_export_word1_buf_t f;
+} sq_cf_alloc_export_word1_buf_u;
+
+
+/*
+ * SQ_CF_ALLOC_EXPORT_WORD1_SWIZ struct
+ */
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SIZE 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SIZE 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SIZE 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SIZE 3
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SHIFT 0
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SHIFT 3
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SHIFT 6
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SHIFT 9
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK 0x00000007
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK 0x00000038
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK 0x000001c0
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK 0x00000e00
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_MASK \
+     (SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK | \
+      SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_DEFAULT 0x00000dcd
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_X(sq_cf_alloc_export_word1_swiz) \
+     ((sq_cf_alloc_export_word1_swiz & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_Y(sq_cf_alloc_export_word1_swiz) \
+     ((sq_cf_alloc_export_word1_swiz & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_Z(sq_cf_alloc_export_word1_swiz) \
+     ((sq_cf_alloc_export_word1_swiz & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_GET_SEL_W(sq_cf_alloc_export_word1_swiz) \
+     ((sq_cf_alloc_export_word1_swiz & SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK) >> SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SHIFT)
+
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_X(sq_cf_alloc_export_word1_swiz_reg, sel_x) \
+     sq_cf_alloc_export_word1_swiz_reg = (sq_cf_alloc_export_word1_swiz_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_MASK) | (sel_x << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_Y(sq_cf_alloc_export_word1_swiz_reg, sel_y) \
+     sq_cf_alloc_export_word1_swiz_reg = (sq_cf_alloc_export_word1_swiz_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_MASK) | (sel_y << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_Z(sq_cf_alloc_export_word1_swiz_reg, sel_z) \
+     sq_cf_alloc_export_word1_swiz_reg = (sq_cf_alloc_export_word1_swiz_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_MASK) | (sel_z << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SHIFT)
+#define SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SET_SEL_W(sq_cf_alloc_export_word1_swiz_reg, sel_w) \
+     sq_cf_alloc_export_word1_swiz_reg = (sq_cf_alloc_export_word1_swiz_reg & ~SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_MASK) | (sel_w << SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word1_swiz_t {
+          unsigned int sel_x                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SIZE;
+          unsigned int sel_y                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SIZE;
+          unsigned int sel_z                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SIZE;
+          unsigned int sel_w                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SIZE;
+          unsigned int                                : 20;
+     } sq_cf_alloc_export_word1_swiz_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_cf_alloc_export_word1_swiz_t {
+          unsigned int                                : 20;
+          unsigned int sel_w                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_W_SIZE;
+          unsigned int sel_z                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Z_SIZE;
+          unsigned int sel_y                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_Y_SIZE;
+          unsigned int sel_x                          : SQ_CF_ALLOC_EXPORT_WORD1_SWIZ_SEL_X_SIZE;
+     } sq_cf_alloc_export_word1_swiz_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_cf_alloc_export_word1_swiz_t f;
+} sq_cf_alloc_export_word1_swiz_u;
+
+
+/*
+ * SQ_ALU_WORD0 struct
+ */
+
+#define SQ_ALU_WORD0_SRC0_SEL_SIZE     9
+#define SQ_ALU_WORD0_SRC0_REL_SIZE     1
+#define SQ_ALU_WORD0_SRC0_CHAN_SIZE    2
+#define SQ_ALU_WORD0_SRC0_NEG_SIZE     1
+#define SQ_ALU_WORD0_SRC1_SEL_SIZE     9
+#define SQ_ALU_WORD0_SRC1_REL_SIZE     1
+#define SQ_ALU_WORD0_SRC1_CHAN_SIZE    2
+#define SQ_ALU_WORD0_SRC1_NEG_SIZE     1
+#define SQ_ALU_WORD0_INDEX_MODE_SIZE   3
+#define SQ_ALU_WORD0_PRED_SEL_SIZE     2
+#define SQ_ALU_WORD0_LAST_SIZE         1
+
+#define SQ_ALU_WORD0_SRC0_SEL_SHIFT    0
+#define SQ_ALU_WORD0_SRC0_REL_SHIFT    9
+#define SQ_ALU_WORD0_SRC0_CHAN_SHIFT   10
+#define SQ_ALU_WORD0_SRC0_NEG_SHIFT    12
+#define SQ_ALU_WORD0_SRC1_SEL_SHIFT    13
+#define SQ_ALU_WORD0_SRC1_REL_SHIFT    22
+#define SQ_ALU_WORD0_SRC1_CHAN_SHIFT   23
+#define SQ_ALU_WORD0_SRC1_NEG_SHIFT    25
+#define SQ_ALU_WORD0_INDEX_MODE_SHIFT  26
+#define SQ_ALU_WORD0_PRED_SEL_SHIFT    29
+#define SQ_ALU_WORD0_LAST_SHIFT        31
+
+#define SQ_ALU_WORD0_SRC0_SEL_MASK     0x000001ff
+#define SQ_ALU_WORD0_SRC0_REL_MASK     0x00000200
+#define SQ_ALU_WORD0_SRC0_CHAN_MASK    0x00000c00
+#define SQ_ALU_WORD0_SRC0_NEG_MASK     0x00001000
+#define SQ_ALU_WORD0_SRC1_SEL_MASK     0x003fe000
+#define SQ_ALU_WORD0_SRC1_REL_MASK     0x00400000
+#define SQ_ALU_WORD0_SRC1_CHAN_MASK    0x01800000
+#define SQ_ALU_WORD0_SRC1_NEG_MASK     0x02000000
+#define SQ_ALU_WORD0_INDEX_MODE_MASK   0x1c000000
+#define SQ_ALU_WORD0_PRED_SEL_MASK     0x60000000
+#define SQ_ALU_WORD0_LAST_MASK         0x80000000
+
+#define SQ_ALU_WORD0_MASK \
+     (SQ_ALU_WORD0_SRC0_SEL_MASK | \
+      SQ_ALU_WORD0_SRC0_REL_MASK | \
+      SQ_ALU_WORD0_SRC0_CHAN_MASK | \
+      SQ_ALU_WORD0_SRC0_NEG_MASK | \
+      SQ_ALU_WORD0_SRC1_SEL_MASK | \
+      SQ_ALU_WORD0_SRC1_REL_MASK | \
+      SQ_ALU_WORD0_SRC1_CHAN_MASK | \
+      SQ_ALU_WORD0_SRC1_NEG_MASK | \
+      SQ_ALU_WORD0_INDEX_MODE_MASK | \
+      SQ_ALU_WORD0_PRED_SEL_MASK | \
+      SQ_ALU_WORD0_LAST_MASK)
+
+#define SQ_ALU_WORD0_DEFAULT           0xcdcdcdcd
+
+#define SQ_ALU_WORD0_GET_SRC0_SEL(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC0_SEL_MASK) >> SQ_ALU_WORD0_SRC0_SEL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC0_REL(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC0_REL_MASK) >> SQ_ALU_WORD0_SRC0_REL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC0_CHAN(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC0_CHAN_MASK) >> SQ_ALU_WORD0_SRC0_CHAN_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC0_NEG(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC0_NEG_MASK) >> SQ_ALU_WORD0_SRC0_NEG_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_SEL(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC1_SEL_MASK) >> SQ_ALU_WORD0_SRC1_SEL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_REL(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC1_REL_MASK) >> SQ_ALU_WORD0_SRC1_REL_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_CHAN(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC1_CHAN_MASK) >> SQ_ALU_WORD0_SRC1_CHAN_SHIFT)
+#define SQ_ALU_WORD0_GET_SRC1_NEG(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_SRC1_NEG_MASK) >> SQ_ALU_WORD0_SRC1_NEG_SHIFT)
+#define SQ_ALU_WORD0_GET_INDEX_MODE(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_INDEX_MODE_MASK) >> SQ_ALU_WORD0_INDEX_MODE_SHIFT)
+#define SQ_ALU_WORD0_GET_PRED_SEL(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_PRED_SEL_MASK) >> SQ_ALU_WORD0_PRED_SEL_SHIFT)
+#define SQ_ALU_WORD0_GET_LAST(sq_alu_word0) \
+     ((sq_alu_word0 & SQ_ALU_WORD0_LAST_MASK) >> SQ_ALU_WORD0_LAST_SHIFT)
+
+#define SQ_ALU_WORD0_SET_SRC0_SEL(sq_alu_word0_reg, src0_sel) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC0_SEL_MASK) | (src0_sel << SQ_ALU_WORD0_SRC0_SEL_SHIFT)
+#define SQ_ALU_WORD0_SET_SRC0_REL(sq_alu_word0_reg, src0_rel) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC0_REL_MASK) | (src0_rel << SQ_ALU_WORD0_SRC0_REL_SHIFT)
+#define SQ_ALU_WORD0_SET_SRC0_CHAN(sq_alu_word0_reg, src0_chan) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC0_CHAN_MASK) | (src0_chan << SQ_ALU_WORD0_SRC0_CHAN_SHIFT)
+#define SQ_ALU_WORD0_SET_SRC0_NEG(sq_alu_word0_reg, src0_neg) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC0_NEG_MASK) | (src0_neg << SQ_ALU_WORD0_SRC0_NEG_SHIFT)
+#define SQ_ALU_WORD0_SET_SRC1_SEL(sq_alu_word0_reg, src1_sel) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC1_SEL_MASK) | (src1_sel << SQ_ALU_WORD0_SRC1_SEL_SHIFT)
+#define SQ_ALU_WORD0_SET_SRC1_REL(sq_alu_word0_reg, src1_rel) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC1_REL_MASK) | (src1_rel << SQ_ALU_WORD0_SRC1_REL_SHIFT)
+#define SQ_ALU_WORD0_SET_SRC1_CHAN(sq_alu_word0_reg, src1_chan) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC1_CHAN_MASK) | (src1_chan << SQ_ALU_WORD0_SRC1_CHAN_SHIFT)
+#define SQ_ALU_WORD0_SET_SRC1_NEG(sq_alu_word0_reg, src1_neg) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_SRC1_NEG_MASK) | (src1_neg << SQ_ALU_WORD0_SRC1_NEG_SHIFT)
+#define SQ_ALU_WORD0_SET_INDEX_MODE(sq_alu_word0_reg, index_mode) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_INDEX_MODE_MASK) | (index_mode << SQ_ALU_WORD0_INDEX_MODE_SHIFT)
+#define SQ_ALU_WORD0_SET_PRED_SEL(sq_alu_word0_reg, pred_sel) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_PRED_SEL_MASK) | (pred_sel << SQ_ALU_WORD0_PRED_SEL_SHIFT)
+#define SQ_ALU_WORD0_SET_LAST(sq_alu_word0_reg, last) \
+     sq_alu_word0_reg = (sq_alu_word0_reg & ~SQ_ALU_WORD0_LAST_MASK) | (last << SQ_ALU_WORD0_LAST_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_alu_word0_t {
+          unsigned int src0_sel                       : SQ_ALU_WORD0_SRC0_SEL_SIZE;
+          unsigned int src0_rel                       : SQ_ALU_WORD0_SRC0_REL_SIZE;
+          unsigned int src0_chan                      : SQ_ALU_WORD0_SRC0_CHAN_SIZE;
+          unsigned int src0_neg                       : SQ_ALU_WORD0_SRC0_NEG_SIZE;
+          unsigned int src1_sel                       : SQ_ALU_WORD0_SRC1_SEL_SIZE;
+          unsigned int src1_rel                       : SQ_ALU_WORD0_SRC1_REL_SIZE;
+          unsigned int src1_chan                      : SQ_ALU_WORD0_SRC1_CHAN_SIZE;
+          unsigned int src1_neg                       : SQ_ALU_WORD0_SRC1_NEG_SIZE;
+          unsigned int index_mode                     : SQ_ALU_WORD0_INDEX_MODE_SIZE;
+          unsigned int pred_sel                       : SQ_ALU_WORD0_PRED_SEL_SIZE;
+          unsigned int last                           : SQ_ALU_WORD0_LAST_SIZE;
+     } sq_alu_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_alu_word0_t {
+          unsigned int last                           : SQ_ALU_WORD0_LAST_SIZE;
+          unsigned int pred_sel                       : SQ_ALU_WORD0_PRED_SEL_SIZE;
+          unsigned int index_mode                     : SQ_ALU_WORD0_INDEX_MODE_SIZE;
+          unsigned int src1_neg                       : SQ_ALU_WORD0_SRC1_NEG_SIZE;
+          unsigned int src1_chan                      : SQ_ALU_WORD0_SRC1_CHAN_SIZE;
+          unsigned int src1_rel                       : SQ_ALU_WORD0_SRC1_REL_SIZE;
+          unsigned int src1_sel                       : SQ_ALU_WORD0_SRC1_SEL_SIZE;
+          unsigned int src0_neg                       : SQ_ALU_WORD0_SRC0_NEG_SIZE;
+          unsigned int src0_chan                      : SQ_ALU_WORD0_SRC0_CHAN_SIZE;
+          unsigned int src0_rel                       : SQ_ALU_WORD0_SRC0_REL_SIZE;
+          unsigned int src0_sel                       : SQ_ALU_WORD0_SRC0_SEL_SIZE;
+     } sq_alu_word0_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_alu_word0_t f;
+} sq_alu_word0_u;
+
+
+/*
+ * SQ_ALU_WORD1 struct
+ */
+
+#define SQ_ALU_WORD1_ENCODING_SIZE     3
+#define SQ_ALU_WORD1_BANK_SWIZZLE_SIZE 3
+#define SQ_ALU_WORD1_DST_GPR_SIZE      7
+#define SQ_ALU_WORD1_DST_REL_SIZE      1
+#define SQ_ALU_WORD1_DST_CHAN_SIZE     2
+#define SQ_ALU_WORD1_CLAMP_SIZE        1
+
+#define SQ_ALU_WORD1_ENCODING_SHIFT    15
+#define SQ_ALU_WORD1_BANK_SWIZZLE_SHIFT 18
+#define SQ_ALU_WORD1_DST_GPR_SHIFT     21
+#define SQ_ALU_WORD1_DST_REL_SHIFT     28
+#define SQ_ALU_WORD1_DST_CHAN_SHIFT    29
+#define SQ_ALU_WORD1_CLAMP_SHIFT       31
+
+#define SQ_ALU_WORD1_ENCODING_MASK     0x00038000
+#define SQ_ALU_WORD1_BANK_SWIZZLE_MASK 0x001c0000
+#define SQ_ALU_WORD1_DST_GPR_MASK      0x0fe00000
+#define SQ_ALU_WORD1_DST_REL_MASK      0x10000000
+#define SQ_ALU_WORD1_DST_CHAN_MASK     0x60000000
+#define SQ_ALU_WORD1_CLAMP_MASK        0x80000000
+
+#define SQ_ALU_WORD1_MASK \
+     (SQ_ALU_WORD1_ENCODING_MASK | \
+      SQ_ALU_WORD1_BANK_SWIZZLE_MASK | \
+      SQ_ALU_WORD1_DST_GPR_MASK | \
+      SQ_ALU_WORD1_DST_REL_MASK | \
+      SQ_ALU_WORD1_DST_CHAN_MASK | \
+      SQ_ALU_WORD1_CLAMP_MASK)
+
+#define SQ_ALU_WORD1_DEFAULT           0xcdcd8000
+
+#define SQ_ALU_WORD1_GET_ENCODING(sq_alu_word1) \
+     ((sq_alu_word1 & SQ_ALU_WORD1_ENCODING_MASK) >> SQ_ALU_WORD1_ENCODING_SHIFT)
+#define SQ_ALU_WORD1_GET_BANK_SWIZZLE(sq_alu_word1) \
+     ((sq_alu_word1 & SQ_ALU_WORD1_BANK_SWIZZLE_MASK) >> SQ_ALU_WORD1_BANK_SWIZZLE_SHIFT)
+#define SQ_ALU_WORD1_GET_DST_GPR(sq_alu_word1) \
+     ((sq_alu_word1 & SQ_ALU_WORD1_DST_GPR_MASK) >> SQ_ALU_WORD1_DST_GPR_SHIFT)
+#define SQ_ALU_WORD1_GET_DST_REL(sq_alu_word1) \
+     ((sq_alu_word1 & SQ_ALU_WORD1_DST_REL_MASK) >> SQ_ALU_WORD1_DST_REL_SHIFT)
+#define SQ_ALU_WORD1_GET_DST_CHAN(sq_alu_word1) \
+     ((sq_alu_word1 & SQ_ALU_WORD1_DST_CHAN_MASK) >> SQ_ALU_WORD1_DST_CHAN_SHIFT)
+#define SQ_ALU_WORD1_GET_CLAMP(sq_alu_word1) \
+     ((sq_alu_word1 & SQ_ALU_WORD1_CLAMP_MASK) >> SQ_ALU_WORD1_CLAMP_SHIFT)
+
+#define SQ_ALU_WORD1_SET_ENCODING(sq_alu_word1_reg, encoding) \
+     sq_alu_word1_reg = (sq_alu_word1_reg & ~SQ_ALU_WORD1_ENCODING_MASK) | (encoding << SQ_ALU_WORD1_ENCODING_SHIFT)
+#define SQ_ALU_WORD1_SET_BANK_SWIZZLE(sq_alu_word1_reg, bank_swizzle) \
+     sq_alu_word1_reg = (sq_alu_word1_reg & ~SQ_ALU_WORD1_BANK_SWIZZLE_MASK) | (bank_swizzle << SQ_ALU_WORD1_BANK_SWIZZLE_SHIFT)
+#define SQ_ALU_WORD1_SET_DST_GPR(sq_alu_word1_reg, dst_gpr) \
+     sq_alu_word1_reg = (sq_alu_word1_reg & ~SQ_ALU_WORD1_DST_GPR_MASK) | (dst_gpr << SQ_ALU_WORD1_DST_GPR_SHIFT)
+#define SQ_ALU_WORD1_SET_DST_REL(sq_alu_word1_reg, dst_rel) \
+     sq_alu_word1_reg = (sq_alu_word1_reg & ~SQ_ALU_WORD1_DST_REL_MASK) | (dst_rel << SQ_ALU_WORD1_DST_REL_SHIFT)
+#define SQ_ALU_WORD1_SET_DST_CHAN(sq_alu_word1_reg, dst_chan) \
+     sq_alu_word1_reg = (sq_alu_word1_reg & ~SQ_ALU_WORD1_DST_CHAN_MASK) | (dst_chan << SQ_ALU_WORD1_DST_CHAN_SHIFT)
+#define SQ_ALU_WORD1_SET_CLAMP(sq_alu_word1_reg, clamp) \
+     sq_alu_word1_reg = (sq_alu_word1_reg & ~SQ_ALU_WORD1_CLAMP_MASK) | (clamp << SQ_ALU_WORD1_CLAMP_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_t {
+          unsigned int                                : 15;
+          unsigned int encoding                       : SQ_ALU_WORD1_ENCODING_SIZE;
+          unsigned int bank_swizzle                   : SQ_ALU_WORD1_BANK_SWIZZLE_SIZE;
+          unsigned int dst_gpr                        : SQ_ALU_WORD1_DST_GPR_SIZE;
+          unsigned int dst_rel                        : SQ_ALU_WORD1_DST_REL_SIZE;
+          unsigned int dst_chan                       : SQ_ALU_WORD1_DST_CHAN_SIZE;
+          unsigned int clamp                          : SQ_ALU_WORD1_CLAMP_SIZE;
+     } sq_alu_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_t {
+          unsigned int clamp                          : SQ_ALU_WORD1_CLAMP_SIZE;
+          unsigned int dst_chan                       : SQ_ALU_WORD1_DST_CHAN_SIZE;
+          unsigned int dst_rel                        : SQ_ALU_WORD1_DST_REL_SIZE;
+          unsigned int dst_gpr                        : SQ_ALU_WORD1_DST_GPR_SIZE;
+          unsigned int bank_swizzle                   : SQ_ALU_WORD1_BANK_SWIZZLE_SIZE;
+          unsigned int encoding                       : SQ_ALU_WORD1_ENCODING_SIZE;
+          unsigned int                                : 15;
+     } sq_alu_word1_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_alu_word1_t f;
+} sq_alu_word1_u;
+
+
+/*
+ * SQ_ALU_WORD1_OP2_V2 struct
+ */
+
+#define SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE 1
+#define SQ_ALU_WORD1_OP2_V2_OMOD_SIZE  2
+#define SQ_ALU_WORD1_OP2_V2_ALU_INST_SIZE 11
+
+#define SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SHIFT 0
+#define SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SHIFT 1
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SHIFT 2
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SHIFT 3
+#define SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SHIFT 4
+#define SQ_ALU_WORD1_OP2_V2_OMOD_SHIFT 5
+#define SQ_ALU_WORD1_OP2_V2_ALU_INST_SHIFT 7
+
+#define SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK 0x00000001
+#define SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK 0x00000002
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK 0x00000004
+#define SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK 0x00000008
+#define SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK 0x00000010
+#define SQ_ALU_WORD1_OP2_V2_OMOD_MASK  0x00000060
+#define SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK 0x0003ff80
+
+#define SQ_ALU_WORD1_OP2_V2_MASK \
+     (SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK | \
+      SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK | \
+      SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK | \
+      SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK | \
+      SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK | \
+      SQ_ALU_WORD1_OP2_V2_OMOD_MASK | \
+      SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK)
+
+#define SQ_ALU_WORD1_OP2_V2_DEFAULT    0x0001cdcd
+
+#define SQ_ALU_WORD1_OP2_V2_GET_SRC0_ABS(sq_alu_word1_op2_v2) \
+     ((sq_alu_word1_op2_v2 & SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK) >> SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_SRC1_ABS(sq_alu_word1_op2_v2) \
+     ((sq_alu_word1_op2_v2 & SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK) >> SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_UPDATE_EXECUTE_MASK(sq_alu_word1_op2_v2) \
+     ((sq_alu_word1_op2_v2 & SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK) >> SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_UPDATE_PRED(sq_alu_word1_op2_v2) \
+     ((sq_alu_word1_op2_v2 & SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK) >> SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_WRITE_MASK(sq_alu_word1_op2_v2) \
+     ((sq_alu_word1_op2_v2 & SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK) >> SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_OMOD(sq_alu_word1_op2_v2) \
+     ((sq_alu_word1_op2_v2 & SQ_ALU_WORD1_OP2_V2_OMOD_MASK) >> SQ_ALU_WORD1_OP2_V2_OMOD_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_GET_ALU_INST(sq_alu_word1_op2_v2) \
+     ((sq_alu_word1_op2_v2 & SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK) >> SQ_ALU_WORD1_OP2_V2_ALU_INST_SHIFT)
+
+#define SQ_ALU_WORD1_OP2_V2_SET_SRC0_ABS(sq_alu_word1_op2_v2_reg, src0_abs) \
+     sq_alu_word1_op2_v2_reg = (sq_alu_word1_op2_v2_reg & ~SQ_ALU_WORD1_OP2_V2_SRC0_ABS_MASK) | (src0_abs << SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_SET_SRC1_ABS(sq_alu_word1_op2_v2_reg, src1_abs) \
+     sq_alu_word1_op2_v2_reg = (sq_alu_word1_op2_v2_reg & ~SQ_ALU_WORD1_OP2_V2_SRC1_ABS_MASK) | (src1_abs << SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_SET_UPDATE_EXECUTE_MASK(sq_alu_word1_op2_v2_reg, update_execute_mask) \
+     sq_alu_word1_op2_v2_reg = (sq_alu_word1_op2_v2_reg & ~SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_MASK) | (update_execute_mask << SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_SET_UPDATE_PRED(sq_alu_word1_op2_v2_reg, update_pred) \
+     sq_alu_word1_op2_v2_reg = (sq_alu_word1_op2_v2_reg & ~SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_MASK) | (update_pred << SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_SET_WRITE_MASK(sq_alu_word1_op2_v2_reg, write_mask) \
+     sq_alu_word1_op2_v2_reg = (sq_alu_word1_op2_v2_reg & ~SQ_ALU_WORD1_OP2_V2_WRITE_MASK_MASK) | (write_mask << SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_SET_OMOD(sq_alu_word1_op2_v2_reg, omod) \
+     sq_alu_word1_op2_v2_reg = (sq_alu_word1_op2_v2_reg & ~SQ_ALU_WORD1_OP2_V2_OMOD_MASK) | (omod << SQ_ALU_WORD1_OP2_V2_OMOD_SHIFT)
+#define SQ_ALU_WORD1_OP2_V2_SET_ALU_INST(sq_alu_word1_op2_v2_reg, alu_inst) \
+     sq_alu_word1_op2_v2_reg = (sq_alu_word1_op2_v2_reg & ~SQ_ALU_WORD1_OP2_V2_ALU_INST_MASK) | (alu_inst << SQ_ALU_WORD1_OP2_V2_ALU_INST_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_op2_v2_t {
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP2_V2_ALU_INST_SIZE;
+          unsigned int                                : 14;
+     } sq_alu_word1_op2_v2_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_op2_v2_t {
+          unsigned int                                : 14;
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP2_V2_ALU_INST_SIZE;
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+     } sq_alu_word1_op2_v2_t;
+
+#endif
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_op2_r6xx_t {
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int fog_export                     : 1;
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int alu_inst                       : 10;
+          unsigned int                                : 14;
+     } sq_alu_word1_op2_v1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_op2_r6xx_t {
+          unsigned int                                : 14;
+          unsigned int alu_inst                       : 10;
+          unsigned int omod                           : SQ_ALU_WORD1_OP2_V2_OMOD_SIZE;
+          unsigned int fog_export                     : 1;
+          unsigned int write_mask                     : SQ_ALU_WORD1_OP2_V2_WRITE_MASK_SIZE;
+          unsigned int update_pred                    : SQ_ALU_WORD1_OP2_V2_UPDATE_PRED_SIZE;
+          unsigned int update_execute_mask            : SQ_ALU_WORD1_OP2_V2_UPDATE_EXECUTE_MASK_SIZE;
+          unsigned int src1_abs                       : SQ_ALU_WORD1_OP2_V2_SRC1_ABS_SIZE;
+          unsigned int src0_abs                       : SQ_ALU_WORD1_OP2_V2_SRC0_ABS_SIZE;
+     } sq_alu_word1_op2_v1_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_alu_word1_op2_v2_t f;
+     sq_alu_word1_op2_v1_t f6;
+} sq_alu_word1_op2_v2_u;
+
+
+/*
+ * SQ_ALU_WORD1_OP3 struct
+ */
+
+#define SQ_ALU_WORD1_OP3_SRC2_SEL_SIZE 9
+#define SQ_ALU_WORD1_OP3_SRC2_REL_SIZE 1
+#define SQ_ALU_WORD1_OP3_SRC2_CHAN_SIZE 2
+#define SQ_ALU_WORD1_OP3_SRC2_NEG_SIZE 1
+#define SQ_ALU_WORD1_OP3_ALU_INST_SIZE 5
+
+#define SQ_ALU_WORD1_OP3_SRC2_SEL_SHIFT 0
+#define SQ_ALU_WORD1_OP3_SRC2_REL_SHIFT 9
+#define SQ_ALU_WORD1_OP3_SRC2_CHAN_SHIFT 10
+#define SQ_ALU_WORD1_OP3_SRC2_NEG_SHIFT 12
+#define SQ_ALU_WORD1_OP3_ALU_INST_SHIFT 13
+
+#define SQ_ALU_WORD1_OP3_SRC2_SEL_MASK 0x000001ff
+#define SQ_ALU_WORD1_OP3_SRC2_REL_MASK 0x00000200
+#define SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK 0x00000c00
+#define SQ_ALU_WORD1_OP3_SRC2_NEG_MASK 0x00001000
+#define SQ_ALU_WORD1_OP3_ALU_INST_MASK 0x0003e000
+
+#define SQ_ALU_WORD1_OP3_MASK \
+     (SQ_ALU_WORD1_OP3_SRC2_SEL_MASK | \
+      SQ_ALU_WORD1_OP3_SRC2_REL_MASK | \
+      SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK | \
+      SQ_ALU_WORD1_OP3_SRC2_NEG_MASK | \
+      SQ_ALU_WORD1_OP3_ALU_INST_MASK)
+
+#define SQ_ALU_WORD1_OP3_DEFAULT       0x0001cdcd
+
+#define SQ_ALU_WORD1_OP3_GET_SRC2_SEL(sq_alu_word1_op3) \
+     ((sq_alu_word1_op3 & SQ_ALU_WORD1_OP3_SRC2_SEL_MASK) >> SQ_ALU_WORD1_OP3_SRC2_SEL_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_SRC2_REL(sq_alu_word1_op3) \
+     ((sq_alu_word1_op3 & SQ_ALU_WORD1_OP3_SRC2_REL_MASK) >> SQ_ALU_WORD1_OP3_SRC2_REL_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_SRC2_CHAN(sq_alu_word1_op3) \
+     ((sq_alu_word1_op3 & SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK) >> SQ_ALU_WORD1_OP3_SRC2_CHAN_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_SRC2_NEG(sq_alu_word1_op3) \
+     ((sq_alu_word1_op3 & SQ_ALU_WORD1_OP3_SRC2_NEG_MASK) >> SQ_ALU_WORD1_OP3_SRC2_NEG_SHIFT)
+#define SQ_ALU_WORD1_OP3_GET_ALU_INST(sq_alu_word1_op3) \
+     ((sq_alu_word1_op3 & SQ_ALU_WORD1_OP3_ALU_INST_MASK) >> SQ_ALU_WORD1_OP3_ALU_INST_SHIFT)
+
+#define SQ_ALU_WORD1_OP3_SET_SRC2_SEL(sq_alu_word1_op3_reg, src2_sel) \
+     sq_alu_word1_op3_reg = (sq_alu_word1_op3_reg & ~SQ_ALU_WORD1_OP3_SRC2_SEL_MASK) | (src2_sel << SQ_ALU_WORD1_OP3_SRC2_SEL_SHIFT)
+#define SQ_ALU_WORD1_OP3_SET_SRC2_REL(sq_alu_word1_op3_reg, src2_rel) \
+     sq_alu_word1_op3_reg = (sq_alu_word1_op3_reg & ~SQ_ALU_WORD1_OP3_SRC2_REL_MASK) | (src2_rel << SQ_ALU_WORD1_OP3_SRC2_REL_SHIFT)
+#define SQ_ALU_WORD1_OP3_SET_SRC2_CHAN(sq_alu_word1_op3_reg, src2_chan) \
+     sq_alu_word1_op3_reg = (sq_alu_word1_op3_reg & ~SQ_ALU_WORD1_OP3_SRC2_CHAN_MASK) | (src2_chan << SQ_ALU_WORD1_OP3_SRC2_CHAN_SHIFT)
+#define SQ_ALU_WORD1_OP3_SET_SRC2_NEG(sq_alu_word1_op3_reg, src2_neg) \
+     sq_alu_word1_op3_reg = (sq_alu_word1_op3_reg & ~SQ_ALU_WORD1_OP3_SRC2_NEG_MASK) | (src2_neg << SQ_ALU_WORD1_OP3_SRC2_NEG_SHIFT)
+#define SQ_ALU_WORD1_OP3_SET_ALU_INST(sq_alu_word1_op3_reg, alu_inst) \
+     sq_alu_word1_op3_reg = (sq_alu_word1_op3_reg & ~SQ_ALU_WORD1_OP3_ALU_INST_MASK) | (alu_inst << SQ_ALU_WORD1_OP3_ALU_INST_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_op3_t {
+          unsigned int src2_sel                       : SQ_ALU_WORD1_OP3_SRC2_SEL_SIZE;
+          unsigned int src2_rel                       : SQ_ALU_WORD1_OP3_SRC2_REL_SIZE;
+          unsigned int src2_chan                      : SQ_ALU_WORD1_OP3_SRC2_CHAN_SIZE;
+          unsigned int src2_neg                       : SQ_ALU_WORD1_OP3_SRC2_NEG_SIZE;
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP3_ALU_INST_SIZE;
+          unsigned int                                : 14;
+     } sq_alu_word1_op3_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_alu_word1_op3_t {
+          unsigned int                                : 14;
+          unsigned int alu_inst                       : SQ_ALU_WORD1_OP3_ALU_INST_SIZE;
+          unsigned int src2_neg                       : SQ_ALU_WORD1_OP3_SRC2_NEG_SIZE;
+          unsigned int src2_chan                      : SQ_ALU_WORD1_OP3_SRC2_CHAN_SIZE;
+          unsigned int src2_rel                       : SQ_ALU_WORD1_OP3_SRC2_REL_SIZE;
+          unsigned int src2_sel                       : SQ_ALU_WORD1_OP3_SRC2_SEL_SIZE;
+     } sq_alu_word1_op3_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_alu_word1_op3_t f;
+} sq_alu_word1_op3_u;
+
+
+/*
+ * SQ_TEX_WORD0 struct
+ */
+
+#define SQ_TEX_WORD0_TEX_INST_SIZE     5
+#define SQ_TEX_WORD0_BC_FRAC_MODE_SIZE 1
+#define SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SIZE 1
+#define SQ_TEX_WORD0_RESOURCE_ID_SIZE  8
+#define SQ_TEX_WORD0_SRC_GPR_SIZE      7
+#define SQ_TEX_WORD0_SRC_REL_SIZE      1
+#define SQ_TEX_WORD0_ALT_CONST_SIZE    1
+
+#define SQ_TEX_WORD0_TEX_INST_SHIFT    0
+#define SQ_TEX_WORD0_BC_FRAC_MODE_SHIFT 5
+#define SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SHIFT 7
+#define SQ_TEX_WORD0_RESOURCE_ID_SHIFT 8
+#define SQ_TEX_WORD0_SRC_GPR_SHIFT     16
+#define SQ_TEX_WORD0_SRC_REL_SHIFT     23
+#define SQ_TEX_WORD0_ALT_CONST_SHIFT   24
+
+#define SQ_TEX_WORD0_TEX_INST_MASK     0x0000001f
+#define SQ_TEX_WORD0_BC_FRAC_MODE_MASK 0x00000020
+#define SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK 0x00000080
+#define SQ_TEX_WORD0_RESOURCE_ID_MASK  0x0000ff00
+#define SQ_TEX_WORD0_SRC_GPR_MASK      0x007f0000
+#define SQ_TEX_WORD0_SRC_REL_MASK      0x00800000
+#define SQ_TEX_WORD0_ALT_CONST_MASK    0x01000000
+
+#define SQ_TEX_WORD0_MASK \
+     (SQ_TEX_WORD0_TEX_INST_MASK | \
+      SQ_TEX_WORD0_BC_FRAC_MODE_MASK | \
+      SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK | \
+      SQ_TEX_WORD0_RESOURCE_ID_MASK | \
+      SQ_TEX_WORD0_SRC_GPR_MASK | \
+      SQ_TEX_WORD0_SRC_REL_MASK | \
+      SQ_TEX_WORD0_ALT_CONST_MASK)
+
+#define SQ_TEX_WORD0_DEFAULT           0x01cdcd8d
+
+#define SQ_TEX_WORD0_GET_TEX_INST(sq_tex_word0) \
+     ((sq_tex_word0 & SQ_TEX_WORD0_TEX_INST_MASK) >> SQ_TEX_WORD0_TEX_INST_SHIFT)
+#define SQ_TEX_WORD0_GET_BC_FRAC_MODE(sq_tex_word0) \
+     ((sq_tex_word0 & SQ_TEX_WORD0_BC_FRAC_MODE_MASK) >> SQ_TEX_WORD0_BC_FRAC_MODE_SHIFT)
+#define SQ_TEX_WORD0_GET_FETCH_WHOLE_QUAD(sq_tex_word0) \
+     ((sq_tex_word0 & SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK) >> SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SHIFT)
+#define SQ_TEX_WORD0_GET_RESOURCE_ID(sq_tex_word0) \
+     ((sq_tex_word0 & SQ_TEX_WORD0_RESOURCE_ID_MASK) >> SQ_TEX_WORD0_RESOURCE_ID_SHIFT)
+#define SQ_TEX_WORD0_GET_SRC_GPR(sq_tex_word0) \
+     ((sq_tex_word0 & SQ_TEX_WORD0_SRC_GPR_MASK) >> SQ_TEX_WORD0_SRC_GPR_SHIFT)
+#define SQ_TEX_WORD0_GET_SRC_REL(sq_tex_word0) \
+     ((sq_tex_word0 & SQ_TEX_WORD0_SRC_REL_MASK) >> SQ_TEX_WORD0_SRC_REL_SHIFT)
+#define SQ_TEX_WORD0_GET_ALT_CONST(sq_tex_word0) \
+     ((sq_tex_word0 & SQ_TEX_WORD0_ALT_CONST_MASK) >> SQ_TEX_WORD0_ALT_CONST_SHIFT)
+
+#define SQ_TEX_WORD0_SET_TEX_INST(sq_tex_word0_reg, tex_inst) \
+     sq_tex_word0_reg = (sq_tex_word0_reg & ~SQ_TEX_WORD0_TEX_INST_MASK) | (tex_inst << SQ_TEX_WORD0_TEX_INST_SHIFT)
+#define SQ_TEX_WORD0_SET_BC_FRAC_MODE(sq_tex_word0_reg, bc_frac_mode) \
+     sq_tex_word0_reg = (sq_tex_word0_reg & ~SQ_TEX_WORD0_BC_FRAC_MODE_MASK) | (bc_frac_mode << SQ_TEX_WORD0_BC_FRAC_MODE_SHIFT)
+#define SQ_TEX_WORD0_SET_FETCH_WHOLE_QUAD(sq_tex_word0_reg, fetch_whole_quad) \
+     sq_tex_word0_reg = (sq_tex_word0_reg & ~SQ_TEX_WORD0_FETCH_WHOLE_QUAD_MASK) | (fetch_whole_quad << SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SHIFT)
+#define SQ_TEX_WORD0_SET_RESOURCE_ID(sq_tex_word0_reg, resource_id) \
+     sq_tex_word0_reg = (sq_tex_word0_reg & ~SQ_TEX_WORD0_RESOURCE_ID_MASK) | (resource_id << SQ_TEX_WORD0_RESOURCE_ID_SHIFT)
+#define SQ_TEX_WORD0_SET_SRC_GPR(sq_tex_word0_reg, src_gpr) \
+     sq_tex_word0_reg = (sq_tex_word0_reg & ~SQ_TEX_WORD0_SRC_GPR_MASK) | (src_gpr << SQ_TEX_WORD0_SRC_GPR_SHIFT)
+#define SQ_TEX_WORD0_SET_SRC_REL(sq_tex_word0_reg, src_rel) \
+     sq_tex_word0_reg = (sq_tex_word0_reg & ~SQ_TEX_WORD0_SRC_REL_MASK) | (src_rel << SQ_TEX_WORD0_SRC_REL_SHIFT)
+#define SQ_TEX_WORD0_SET_ALT_CONST(sq_tex_word0_reg, alt_const) \
+     sq_tex_word0_reg = (sq_tex_word0_reg & ~SQ_TEX_WORD0_ALT_CONST_MASK) | (alt_const << SQ_TEX_WORD0_ALT_CONST_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_tex_word0_t {
+          unsigned int tex_inst                       : SQ_TEX_WORD0_TEX_INST_SIZE;
+          unsigned int bc_frac_mode                   : SQ_TEX_WORD0_BC_FRAC_MODE_SIZE;
+          unsigned int                                : 1;
+          unsigned int fetch_whole_quad               : SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int resource_id                    : SQ_TEX_WORD0_RESOURCE_ID_SIZE;
+          unsigned int src_gpr                        : SQ_TEX_WORD0_SRC_GPR_SIZE;
+          unsigned int src_rel                        : SQ_TEX_WORD0_SRC_REL_SIZE;
+          unsigned int alt_const                      : SQ_TEX_WORD0_ALT_CONST_SIZE;
+          unsigned int                                : 7;
+     } sq_tex_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_tex_word0_t {
+          unsigned int                                : 7;
+          unsigned int alt_const                      : SQ_TEX_WORD0_ALT_CONST_SIZE;
+          unsigned int src_rel                        : SQ_TEX_WORD0_SRC_REL_SIZE;
+          unsigned int src_gpr                        : SQ_TEX_WORD0_SRC_GPR_SIZE;
+          unsigned int resource_id                    : SQ_TEX_WORD0_RESOURCE_ID_SIZE;
+          unsigned int fetch_whole_quad               : SQ_TEX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int                                : 1;
+          unsigned int bc_frac_mode                   : SQ_TEX_WORD0_BC_FRAC_MODE_SIZE;
+          unsigned int tex_inst                       : SQ_TEX_WORD0_TEX_INST_SIZE;
+     } sq_tex_word0_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_tex_word0_t f;
+} sq_tex_word0_u;
+
+
+/*
+ * SQ_TEX_WORD1 struct
+ */
+
+#define SQ_TEX_WORD1_DST_GPR_SIZE      7
+#define SQ_TEX_WORD1_DST_REL_SIZE      1
+#define SQ_TEX_WORD1_DST_SEL_X_SIZE    3
+#define SQ_TEX_WORD1_DST_SEL_Y_SIZE    3
+#define SQ_TEX_WORD1_DST_SEL_Z_SIZE    3
+#define SQ_TEX_WORD1_DST_SEL_W_SIZE    3
+#define SQ_TEX_WORD1_LOD_BIAS_SIZE     7
+#define SQ_TEX_WORD1_COORD_TYPE_X_SIZE 1
+#define SQ_TEX_WORD1_COORD_TYPE_Y_SIZE 1
+#define SQ_TEX_WORD1_COORD_TYPE_Z_SIZE 1
+#define SQ_TEX_WORD1_COORD_TYPE_W_SIZE 1
+
+#define SQ_TEX_WORD1_DST_GPR_SHIFT     0
+#define SQ_TEX_WORD1_DST_REL_SHIFT     7
+#define SQ_TEX_WORD1_DST_SEL_X_SHIFT   9
+#define SQ_TEX_WORD1_DST_SEL_Y_SHIFT   12
+#define SQ_TEX_WORD1_DST_SEL_Z_SHIFT   15
+#define SQ_TEX_WORD1_DST_SEL_W_SHIFT   18
+#define SQ_TEX_WORD1_LOD_BIAS_SHIFT    21
+#define SQ_TEX_WORD1_COORD_TYPE_X_SHIFT 28
+#define SQ_TEX_WORD1_COORD_TYPE_Y_SHIFT 29
+#define SQ_TEX_WORD1_COORD_TYPE_Z_SHIFT 30
+#define SQ_TEX_WORD1_COORD_TYPE_W_SHIFT 31
+
+#define SQ_TEX_WORD1_DST_GPR_MASK      0x0000007f
+#define SQ_TEX_WORD1_DST_REL_MASK      0x00000080
+#define SQ_TEX_WORD1_DST_SEL_X_MASK    0x00000e00
+#define SQ_TEX_WORD1_DST_SEL_Y_MASK    0x00007000
+#define SQ_TEX_WORD1_DST_SEL_Z_MASK    0x00038000
+#define SQ_TEX_WORD1_DST_SEL_W_MASK    0x001c0000
+#define SQ_TEX_WORD1_LOD_BIAS_MASK     0x0fe00000
+#define SQ_TEX_WORD1_COORD_TYPE_X_MASK 0x10000000
+#define SQ_TEX_WORD1_COORD_TYPE_Y_MASK 0x20000000
+#define SQ_TEX_WORD1_COORD_TYPE_Z_MASK 0x40000000
+#define SQ_TEX_WORD1_COORD_TYPE_W_MASK 0x80000000
+
+#define SQ_TEX_WORD1_MASK \
+     (SQ_TEX_WORD1_DST_GPR_MASK | \
+      SQ_TEX_WORD1_DST_REL_MASK | \
+      SQ_TEX_WORD1_DST_SEL_X_MASK | \
+      SQ_TEX_WORD1_DST_SEL_Y_MASK | \
+      SQ_TEX_WORD1_DST_SEL_Z_MASK | \
+      SQ_TEX_WORD1_DST_SEL_W_MASK | \
+      SQ_TEX_WORD1_LOD_BIAS_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_X_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_Y_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_Z_MASK | \
+      SQ_TEX_WORD1_COORD_TYPE_W_MASK)
+
+#define SQ_TEX_WORD1_DEFAULT           0xcdcdcccd
+
+#define SQ_TEX_WORD1_GET_DST_GPR(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_DST_GPR_MASK) >> SQ_TEX_WORD1_DST_GPR_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_REL(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_DST_REL_MASK) >> SQ_TEX_WORD1_DST_REL_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_X(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_DST_SEL_X_MASK) >> SQ_TEX_WORD1_DST_SEL_X_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_Y(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_DST_SEL_Y_MASK) >> SQ_TEX_WORD1_DST_SEL_Y_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_Z(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_DST_SEL_Z_MASK) >> SQ_TEX_WORD1_DST_SEL_Z_SHIFT)
+#define SQ_TEX_WORD1_GET_DST_SEL_W(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_DST_SEL_W_MASK) >> SQ_TEX_WORD1_DST_SEL_W_SHIFT)
+#define SQ_TEX_WORD1_GET_LOD_BIAS(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_LOD_BIAS_MASK) >> SQ_TEX_WORD1_LOD_BIAS_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_X(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_COORD_TYPE_X_MASK) >> SQ_TEX_WORD1_COORD_TYPE_X_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_Y(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_COORD_TYPE_Y_MASK) >> SQ_TEX_WORD1_COORD_TYPE_Y_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_Z(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_COORD_TYPE_Z_MASK) >> SQ_TEX_WORD1_COORD_TYPE_Z_SHIFT)
+#define SQ_TEX_WORD1_GET_COORD_TYPE_W(sq_tex_word1) \
+     ((sq_tex_word1 & SQ_TEX_WORD1_COORD_TYPE_W_MASK) >> SQ_TEX_WORD1_COORD_TYPE_W_SHIFT)
+
+#define SQ_TEX_WORD1_SET_DST_GPR(sq_tex_word1_reg, dst_gpr) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_DST_GPR_MASK) | (dst_gpr << SQ_TEX_WORD1_DST_GPR_SHIFT)
+#define SQ_TEX_WORD1_SET_DST_REL(sq_tex_word1_reg, dst_rel) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_DST_REL_MASK) | (dst_rel << SQ_TEX_WORD1_DST_REL_SHIFT)
+#define SQ_TEX_WORD1_SET_DST_SEL_X(sq_tex_word1_reg, dst_sel_x) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_DST_SEL_X_MASK) | (dst_sel_x << SQ_TEX_WORD1_DST_SEL_X_SHIFT)
+#define SQ_TEX_WORD1_SET_DST_SEL_Y(sq_tex_word1_reg, dst_sel_y) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_DST_SEL_Y_MASK) | (dst_sel_y << SQ_TEX_WORD1_DST_SEL_Y_SHIFT)
+#define SQ_TEX_WORD1_SET_DST_SEL_Z(sq_tex_word1_reg, dst_sel_z) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_DST_SEL_Z_MASK) | (dst_sel_z << SQ_TEX_WORD1_DST_SEL_Z_SHIFT)
+#define SQ_TEX_WORD1_SET_DST_SEL_W(sq_tex_word1_reg, dst_sel_w) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_DST_SEL_W_MASK) | (dst_sel_w << SQ_TEX_WORD1_DST_SEL_W_SHIFT)
+#define SQ_TEX_WORD1_SET_LOD_BIAS(sq_tex_word1_reg, lod_bias) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_LOD_BIAS_MASK) | (lod_bias << SQ_TEX_WORD1_LOD_BIAS_SHIFT)
+#define SQ_TEX_WORD1_SET_COORD_TYPE_X(sq_tex_word1_reg, coord_type_x) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_COORD_TYPE_X_MASK) | (coord_type_x << SQ_TEX_WORD1_COORD_TYPE_X_SHIFT)
+#define SQ_TEX_WORD1_SET_COORD_TYPE_Y(sq_tex_word1_reg, coord_type_y) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_COORD_TYPE_Y_MASK) | (coord_type_y << SQ_TEX_WORD1_COORD_TYPE_Y_SHIFT)
+#define SQ_TEX_WORD1_SET_COORD_TYPE_Z(sq_tex_word1_reg, coord_type_z) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_COORD_TYPE_Z_MASK) | (coord_type_z << SQ_TEX_WORD1_COORD_TYPE_Z_SHIFT)
+#define SQ_TEX_WORD1_SET_COORD_TYPE_W(sq_tex_word1_reg, coord_type_w) \
+     sq_tex_word1_reg = (sq_tex_word1_reg & ~SQ_TEX_WORD1_COORD_TYPE_W_MASK) | (coord_type_w << SQ_TEX_WORD1_COORD_TYPE_W_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_tex_word1_t {
+          unsigned int dst_gpr                        : SQ_TEX_WORD1_DST_GPR_SIZE;
+          unsigned int dst_rel                        : SQ_TEX_WORD1_DST_REL_SIZE;
+          unsigned int                                : 1;
+          unsigned int dst_sel_x                      : SQ_TEX_WORD1_DST_SEL_X_SIZE;
+          unsigned int dst_sel_y                      : SQ_TEX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_z                      : SQ_TEX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_w                      : SQ_TEX_WORD1_DST_SEL_W_SIZE;
+          unsigned int lod_bias                       : SQ_TEX_WORD1_LOD_BIAS_SIZE;
+          unsigned int coord_type_x                   : SQ_TEX_WORD1_COORD_TYPE_X_SIZE;
+          unsigned int coord_type_y                   : SQ_TEX_WORD1_COORD_TYPE_Y_SIZE;
+          unsigned int coord_type_z                   : SQ_TEX_WORD1_COORD_TYPE_Z_SIZE;
+          unsigned int coord_type_w                   : SQ_TEX_WORD1_COORD_TYPE_W_SIZE;
+     } sq_tex_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_tex_word1_t {
+          unsigned int coord_type_w                   : SQ_TEX_WORD1_COORD_TYPE_W_SIZE;
+          unsigned int coord_type_z                   : SQ_TEX_WORD1_COORD_TYPE_Z_SIZE;
+          unsigned int coord_type_y                   : SQ_TEX_WORD1_COORD_TYPE_Y_SIZE;
+          unsigned int coord_type_x                   : SQ_TEX_WORD1_COORD_TYPE_X_SIZE;
+          unsigned int lod_bias                       : SQ_TEX_WORD1_LOD_BIAS_SIZE;
+          unsigned int dst_sel_w                      : SQ_TEX_WORD1_DST_SEL_W_SIZE;
+          unsigned int dst_sel_z                      : SQ_TEX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_y                      : SQ_TEX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_x                      : SQ_TEX_WORD1_DST_SEL_X_SIZE;
+          unsigned int                                : 1;
+          unsigned int dst_rel                        : SQ_TEX_WORD1_DST_REL_SIZE;
+          unsigned int dst_gpr                        : SQ_TEX_WORD1_DST_GPR_SIZE;
+     } sq_tex_word1_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_tex_word1_t f;
+} sq_tex_word1_u;
+
+
+/*
+ * SQ_TEX_WORD2 struct
+ */
+
+#define SQ_TEX_WORD2_OFFSET_X_SIZE     5
+#define SQ_TEX_WORD2_OFFSET_Y_SIZE     5
+#define SQ_TEX_WORD2_OFFSET_Z_SIZE     5
+#define SQ_TEX_WORD2_SAMPLER_ID_SIZE   5
+#define SQ_TEX_WORD2_SRC_SEL_X_SIZE    3
+#define SQ_TEX_WORD2_SRC_SEL_Y_SIZE    3
+#define SQ_TEX_WORD2_SRC_SEL_Z_SIZE    3
+#define SQ_TEX_WORD2_SRC_SEL_W_SIZE    3
+
+#define SQ_TEX_WORD2_OFFSET_X_SHIFT    0
+#define SQ_TEX_WORD2_OFFSET_Y_SHIFT    5
+#define SQ_TEX_WORD2_OFFSET_Z_SHIFT    10
+#define SQ_TEX_WORD2_SAMPLER_ID_SHIFT  15
+#define SQ_TEX_WORD2_SRC_SEL_X_SHIFT   20
+#define SQ_TEX_WORD2_SRC_SEL_Y_SHIFT   23
+#define SQ_TEX_WORD2_SRC_SEL_Z_SHIFT   26
+#define SQ_TEX_WORD2_SRC_SEL_W_SHIFT   29
+
+#define SQ_TEX_WORD2_OFFSET_X_MASK     0x0000001f
+#define SQ_TEX_WORD2_OFFSET_Y_MASK     0x000003e0
+#define SQ_TEX_WORD2_OFFSET_Z_MASK     0x00007c00
+#define SQ_TEX_WORD2_SAMPLER_ID_MASK   0x000f8000
+#define SQ_TEX_WORD2_SRC_SEL_X_MASK    0x00700000
+#define SQ_TEX_WORD2_SRC_SEL_Y_MASK    0x03800000
+#define SQ_TEX_WORD2_SRC_SEL_Z_MASK    0x1c000000
+#define SQ_TEX_WORD2_SRC_SEL_W_MASK    0xe0000000
+
+#define SQ_TEX_WORD2_MASK \
+     (SQ_TEX_WORD2_OFFSET_X_MASK | \
+      SQ_TEX_WORD2_OFFSET_Y_MASK | \
+      SQ_TEX_WORD2_OFFSET_Z_MASK | \
+      SQ_TEX_WORD2_SAMPLER_ID_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_X_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_Y_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_Z_MASK | \
+      SQ_TEX_WORD2_SRC_SEL_W_MASK)
+
+#define SQ_TEX_WORD2_DEFAULT           0xcdcdcdcd
+
+#define SQ_TEX_WORD2_GET_OFFSET_X(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_OFFSET_X_MASK) >> SQ_TEX_WORD2_OFFSET_X_SHIFT)
+#define SQ_TEX_WORD2_GET_OFFSET_Y(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_OFFSET_Y_MASK) >> SQ_TEX_WORD2_OFFSET_Y_SHIFT)
+#define SQ_TEX_WORD2_GET_OFFSET_Z(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_OFFSET_Z_MASK) >> SQ_TEX_WORD2_OFFSET_Z_SHIFT)
+#define SQ_TEX_WORD2_GET_SAMPLER_ID(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_SAMPLER_ID_MASK) >> SQ_TEX_WORD2_SAMPLER_ID_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_X(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_SRC_SEL_X_MASK) >> SQ_TEX_WORD2_SRC_SEL_X_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_Y(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_SRC_SEL_Y_MASK) >> SQ_TEX_WORD2_SRC_SEL_Y_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_Z(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_SRC_SEL_Z_MASK) >> SQ_TEX_WORD2_SRC_SEL_Z_SHIFT)
+#define SQ_TEX_WORD2_GET_SRC_SEL_W(sq_tex_word2) \
+     ((sq_tex_word2 & SQ_TEX_WORD2_SRC_SEL_W_MASK) >> SQ_TEX_WORD2_SRC_SEL_W_SHIFT)
+
+#define SQ_TEX_WORD2_SET_OFFSET_X(sq_tex_word2_reg, offset_x) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_OFFSET_X_MASK) | (offset_x << SQ_TEX_WORD2_OFFSET_X_SHIFT)
+#define SQ_TEX_WORD2_SET_OFFSET_Y(sq_tex_word2_reg, offset_y) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_OFFSET_Y_MASK) | (offset_y << SQ_TEX_WORD2_OFFSET_Y_SHIFT)
+#define SQ_TEX_WORD2_SET_OFFSET_Z(sq_tex_word2_reg, offset_z) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_OFFSET_Z_MASK) | (offset_z << SQ_TEX_WORD2_OFFSET_Z_SHIFT)
+#define SQ_TEX_WORD2_SET_SAMPLER_ID(sq_tex_word2_reg, sampler_id) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_SAMPLER_ID_MASK) | (sampler_id << SQ_TEX_WORD2_SAMPLER_ID_SHIFT)
+#define SQ_TEX_WORD2_SET_SRC_SEL_X(sq_tex_word2_reg, src_sel_x) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_SRC_SEL_X_MASK) | (src_sel_x << SQ_TEX_WORD2_SRC_SEL_X_SHIFT)
+#define SQ_TEX_WORD2_SET_SRC_SEL_Y(sq_tex_word2_reg, src_sel_y) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_SRC_SEL_Y_MASK) | (src_sel_y << SQ_TEX_WORD2_SRC_SEL_Y_SHIFT)
+#define SQ_TEX_WORD2_SET_SRC_SEL_Z(sq_tex_word2_reg, src_sel_z) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_SRC_SEL_Z_MASK) | (src_sel_z << SQ_TEX_WORD2_SRC_SEL_Z_SHIFT)
+#define SQ_TEX_WORD2_SET_SRC_SEL_W(sq_tex_word2_reg, src_sel_w) \
+     sq_tex_word2_reg = (sq_tex_word2_reg & ~SQ_TEX_WORD2_SRC_SEL_W_MASK) | (src_sel_w << SQ_TEX_WORD2_SRC_SEL_W_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_tex_word2_t {
+          unsigned int offset_x                       : SQ_TEX_WORD2_OFFSET_X_SIZE;
+          unsigned int offset_y                       : SQ_TEX_WORD2_OFFSET_Y_SIZE;
+          unsigned int offset_z                       : SQ_TEX_WORD2_OFFSET_Z_SIZE;
+          unsigned int sampler_id                     : SQ_TEX_WORD2_SAMPLER_ID_SIZE;
+          unsigned int src_sel_x                      : SQ_TEX_WORD2_SRC_SEL_X_SIZE;
+          unsigned int src_sel_y                      : SQ_TEX_WORD2_SRC_SEL_Y_SIZE;
+          unsigned int src_sel_z                      : SQ_TEX_WORD2_SRC_SEL_Z_SIZE;
+          unsigned int src_sel_w                      : SQ_TEX_WORD2_SRC_SEL_W_SIZE;
+     } sq_tex_word2_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_tex_word2_t {
+          unsigned int src_sel_w                      : SQ_TEX_WORD2_SRC_SEL_W_SIZE;
+          unsigned int src_sel_z                      : SQ_TEX_WORD2_SRC_SEL_Z_SIZE;
+          unsigned int src_sel_y                      : SQ_TEX_WORD2_SRC_SEL_Y_SIZE;
+          unsigned int src_sel_x                      : SQ_TEX_WORD2_SRC_SEL_X_SIZE;
+          unsigned int sampler_id                     : SQ_TEX_WORD2_SAMPLER_ID_SIZE;
+          unsigned int offset_z                       : SQ_TEX_WORD2_OFFSET_Z_SIZE;
+          unsigned int offset_y                       : SQ_TEX_WORD2_OFFSET_Y_SIZE;
+          unsigned int offset_x                       : SQ_TEX_WORD2_OFFSET_X_SIZE;
+     } sq_tex_word2_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_tex_word2_t f;
+} sq_tex_word2_u;
+
+
+/*
+ * SQ_VTX_WORD0 struct
+ */
+
+#define SQ_VTX_WORD0_VTX_INST_SIZE     5
+#define SQ_VTX_WORD0_FETCH_TYPE_SIZE   2
+#define SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SIZE 1
+#define SQ_VTX_WORD0_BUFFER_ID_SIZE    8
+#define SQ_VTX_WORD0_SRC_GPR_SIZE      7
+#define SQ_VTX_WORD0_SRC_REL_SIZE      1
+#define SQ_VTX_WORD0_SRC_SEL_X_SIZE    2
+#define SQ_VTX_WORD0_MEGA_FETCH_COUNT_SIZE 6
+
+#define SQ_VTX_WORD0_VTX_INST_SHIFT    0
+#define SQ_VTX_WORD0_FETCH_TYPE_SHIFT  5
+#define SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SHIFT 7
+#define SQ_VTX_WORD0_BUFFER_ID_SHIFT   8
+#define SQ_VTX_WORD0_SRC_GPR_SHIFT     16
+#define SQ_VTX_WORD0_SRC_REL_SHIFT     23
+#define SQ_VTX_WORD0_SRC_SEL_X_SHIFT   24
+#define SQ_VTX_WORD0_MEGA_FETCH_COUNT_SHIFT 26
+
+#define SQ_VTX_WORD0_VTX_INST_MASK     0x0000001f
+#define SQ_VTX_WORD0_FETCH_TYPE_MASK   0x00000060
+#define SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK 0x00000080
+#define SQ_VTX_WORD0_BUFFER_ID_MASK    0x0000ff00
+#define SQ_VTX_WORD0_SRC_GPR_MASK      0x007f0000
+#define SQ_VTX_WORD0_SRC_REL_MASK      0x00800000
+#define SQ_VTX_WORD0_SRC_SEL_X_MASK    0x03000000
+#define SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK 0xfc000000
+
+#define SQ_VTX_WORD0_MASK \
+     (SQ_VTX_WORD0_VTX_INST_MASK | \
+      SQ_VTX_WORD0_FETCH_TYPE_MASK | \
+      SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK | \
+      SQ_VTX_WORD0_BUFFER_ID_MASK | \
+      SQ_VTX_WORD0_SRC_GPR_MASK | \
+      SQ_VTX_WORD0_SRC_REL_MASK | \
+      SQ_VTX_WORD0_SRC_SEL_X_MASK | \
+      SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK)
+
+#define SQ_VTX_WORD0_DEFAULT           0xcdcdcdcd
+
+#define SQ_VTX_WORD0_GET_VTX_INST(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_VTX_INST_MASK) >> SQ_VTX_WORD0_VTX_INST_SHIFT)
+#define SQ_VTX_WORD0_GET_FETCH_TYPE(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_FETCH_TYPE_MASK) >> SQ_VTX_WORD0_FETCH_TYPE_SHIFT)
+#define SQ_VTX_WORD0_GET_FETCH_WHOLE_QUAD(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK) >> SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SHIFT)
+#define SQ_VTX_WORD0_GET_BUFFER_ID(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_BUFFER_ID_MASK) >> SQ_VTX_WORD0_BUFFER_ID_SHIFT)
+#define SQ_VTX_WORD0_GET_SRC_GPR(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_SRC_GPR_MASK) >> SQ_VTX_WORD0_SRC_GPR_SHIFT)
+#define SQ_VTX_WORD0_GET_SRC_REL(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_SRC_REL_MASK) >> SQ_VTX_WORD0_SRC_REL_SHIFT)
+#define SQ_VTX_WORD0_GET_SRC_SEL_X(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_SRC_SEL_X_MASK) >> SQ_VTX_WORD0_SRC_SEL_X_SHIFT)
+#define SQ_VTX_WORD0_GET_MEGA_FETCH_COUNT(sq_vtx_word0) \
+     ((sq_vtx_word0 & SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK) >> SQ_VTX_WORD0_MEGA_FETCH_COUNT_SHIFT)
+
+#define SQ_VTX_WORD0_SET_VTX_INST(sq_vtx_word0_reg, vtx_inst) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_VTX_INST_MASK) | (vtx_inst << SQ_VTX_WORD0_VTX_INST_SHIFT)
+#define SQ_VTX_WORD0_SET_FETCH_TYPE(sq_vtx_word0_reg, fetch_type) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_FETCH_TYPE_MASK) | (fetch_type << SQ_VTX_WORD0_FETCH_TYPE_SHIFT)
+#define SQ_VTX_WORD0_SET_FETCH_WHOLE_QUAD(sq_vtx_word0_reg, fetch_whole_quad) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_FETCH_WHOLE_QUAD_MASK) | (fetch_whole_quad << SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SHIFT)
+#define SQ_VTX_WORD0_SET_BUFFER_ID(sq_vtx_word0_reg, buffer_id) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_BUFFER_ID_MASK) | (buffer_id << SQ_VTX_WORD0_BUFFER_ID_SHIFT)
+#define SQ_VTX_WORD0_SET_SRC_GPR(sq_vtx_word0_reg, src_gpr) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_SRC_GPR_MASK) | (src_gpr << SQ_VTX_WORD0_SRC_GPR_SHIFT)
+#define SQ_VTX_WORD0_SET_SRC_REL(sq_vtx_word0_reg, src_rel) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_SRC_REL_MASK) | (src_rel << SQ_VTX_WORD0_SRC_REL_SHIFT)
+#define SQ_VTX_WORD0_SET_SRC_SEL_X(sq_vtx_word0_reg, src_sel_x) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_SRC_SEL_X_MASK) | (src_sel_x << SQ_VTX_WORD0_SRC_SEL_X_SHIFT)
+#define SQ_VTX_WORD0_SET_MEGA_FETCH_COUNT(sq_vtx_word0_reg, mega_fetch_count) \
+     sq_vtx_word0_reg = (sq_vtx_word0_reg & ~SQ_VTX_WORD0_MEGA_FETCH_COUNT_MASK) | (mega_fetch_count << SQ_VTX_WORD0_MEGA_FETCH_COUNT_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word0_t {
+          unsigned int vtx_inst                       : SQ_VTX_WORD0_VTX_INST_SIZE;
+          unsigned int fetch_type                     : SQ_VTX_WORD0_FETCH_TYPE_SIZE;
+          unsigned int fetch_whole_quad               : SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int buffer_id                      : SQ_VTX_WORD0_BUFFER_ID_SIZE;
+          unsigned int src_gpr                        : SQ_VTX_WORD0_SRC_GPR_SIZE;
+          unsigned int src_rel                        : SQ_VTX_WORD0_SRC_REL_SIZE;
+          unsigned int src_sel_x                      : SQ_VTX_WORD0_SRC_SEL_X_SIZE;
+          unsigned int mega_fetch_count               : SQ_VTX_WORD0_MEGA_FETCH_COUNT_SIZE;
+     } sq_vtx_word0_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word0_t {
+          unsigned int mega_fetch_count               : SQ_VTX_WORD0_MEGA_FETCH_COUNT_SIZE;
+          unsigned int src_sel_x                      : SQ_VTX_WORD0_SRC_SEL_X_SIZE;
+          unsigned int src_rel                        : SQ_VTX_WORD0_SRC_REL_SIZE;
+          unsigned int src_gpr                        : SQ_VTX_WORD0_SRC_GPR_SIZE;
+          unsigned int buffer_id                      : SQ_VTX_WORD0_BUFFER_ID_SIZE;
+          unsigned int fetch_whole_quad               : SQ_VTX_WORD0_FETCH_WHOLE_QUAD_SIZE;
+          unsigned int fetch_type                     : SQ_VTX_WORD0_FETCH_TYPE_SIZE;
+          unsigned int vtx_inst                       : SQ_VTX_WORD0_VTX_INST_SIZE;
+     } sq_vtx_word0_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word0_t f;
+} sq_vtx_word0_u;
+
+
+/*
+ * SQ_VTX_WORD1 struct
+ */
+
+#define SQ_VTX_WORD1_DST_SEL_X_SIZE    3
+#define SQ_VTX_WORD1_DST_SEL_Y_SIZE    3
+#define SQ_VTX_WORD1_DST_SEL_Z_SIZE    3
+#define SQ_VTX_WORD1_DST_SEL_W_SIZE    3
+#define SQ_VTX_WORD1_USE_CONST_FIELDS_SIZE 1
+#define SQ_VTX_WORD1_DATA_FORMAT_SIZE  6
+#define SQ_VTX_WORD1_NUM_FORMAT_ALL_SIZE 2
+#define SQ_VTX_WORD1_FORMAT_COMP_ALL_SIZE 1
+#define SQ_VTX_WORD1_SRF_MODE_ALL_SIZE 1
+
+#define SQ_VTX_WORD1_DST_SEL_X_SHIFT   9
+#define SQ_VTX_WORD1_DST_SEL_Y_SHIFT   12
+#define SQ_VTX_WORD1_DST_SEL_Z_SHIFT   15
+#define SQ_VTX_WORD1_DST_SEL_W_SHIFT   18
+#define SQ_VTX_WORD1_USE_CONST_FIELDS_SHIFT 21
+#define SQ_VTX_WORD1_DATA_FORMAT_SHIFT 22
+#define SQ_VTX_WORD1_NUM_FORMAT_ALL_SHIFT 28
+#define SQ_VTX_WORD1_FORMAT_COMP_ALL_SHIFT 30
+#define SQ_VTX_WORD1_SRF_MODE_ALL_SHIFT 31
+
+#define SQ_VTX_WORD1_DST_SEL_X_MASK    0x00000e00
+#define SQ_VTX_WORD1_DST_SEL_Y_MASK    0x00007000
+#define SQ_VTX_WORD1_DST_SEL_Z_MASK    0x00038000
+#define SQ_VTX_WORD1_DST_SEL_W_MASK    0x001c0000
+#define SQ_VTX_WORD1_USE_CONST_FIELDS_MASK 0x00200000
+#define SQ_VTX_WORD1_DATA_FORMAT_MASK  0x0fc00000
+#define SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK 0x30000000
+#define SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK 0x40000000
+#define SQ_VTX_WORD1_SRF_MODE_ALL_MASK 0x80000000
+
+#define SQ_VTX_WORD1_MASK \
+     (SQ_VTX_WORD1_DST_SEL_X_MASK | \
+      SQ_VTX_WORD1_DST_SEL_Y_MASK | \
+      SQ_VTX_WORD1_DST_SEL_Z_MASK | \
+      SQ_VTX_WORD1_DST_SEL_W_MASK | \
+      SQ_VTX_WORD1_USE_CONST_FIELDS_MASK | \
+      SQ_VTX_WORD1_DATA_FORMAT_MASK | \
+      SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK | \
+      SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK | \
+      SQ_VTX_WORD1_SRF_MODE_ALL_MASK)
+
+#define SQ_VTX_WORD1_DEFAULT           0xcdcdcc00
+
+#define SQ_VTX_WORD1_GET_DST_SEL_X(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_DST_SEL_X_MASK) >> SQ_VTX_WORD1_DST_SEL_X_SHIFT)
+#define SQ_VTX_WORD1_GET_DST_SEL_Y(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_DST_SEL_Y_MASK) >> SQ_VTX_WORD1_DST_SEL_Y_SHIFT)
+#define SQ_VTX_WORD1_GET_DST_SEL_Z(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_DST_SEL_Z_MASK) >> SQ_VTX_WORD1_DST_SEL_Z_SHIFT)
+#define SQ_VTX_WORD1_GET_DST_SEL_W(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_DST_SEL_W_MASK) >> SQ_VTX_WORD1_DST_SEL_W_SHIFT)
+#define SQ_VTX_WORD1_GET_USE_CONST_FIELDS(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_USE_CONST_FIELDS_MASK) >> SQ_VTX_WORD1_USE_CONST_FIELDS_SHIFT)
+#define SQ_VTX_WORD1_GET_DATA_FORMAT(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_DATA_FORMAT_MASK) >> SQ_VTX_WORD1_DATA_FORMAT_SHIFT)
+#define SQ_VTX_WORD1_GET_NUM_FORMAT_ALL(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK) >> SQ_VTX_WORD1_NUM_FORMAT_ALL_SHIFT)
+#define SQ_VTX_WORD1_GET_FORMAT_COMP_ALL(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK) >> SQ_VTX_WORD1_FORMAT_COMP_ALL_SHIFT)
+#define SQ_VTX_WORD1_GET_SRF_MODE_ALL(sq_vtx_word1) \
+     ((sq_vtx_word1 & SQ_VTX_WORD1_SRF_MODE_ALL_MASK) >> SQ_VTX_WORD1_SRF_MODE_ALL_SHIFT)
+
+#define SQ_VTX_WORD1_SET_DST_SEL_X(sq_vtx_word1_reg, dst_sel_x) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_DST_SEL_X_MASK) | (dst_sel_x << SQ_VTX_WORD1_DST_SEL_X_SHIFT)
+#define SQ_VTX_WORD1_SET_DST_SEL_Y(sq_vtx_word1_reg, dst_sel_y) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_DST_SEL_Y_MASK) | (dst_sel_y << SQ_VTX_WORD1_DST_SEL_Y_SHIFT)
+#define SQ_VTX_WORD1_SET_DST_SEL_Z(sq_vtx_word1_reg, dst_sel_z) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_DST_SEL_Z_MASK) | (dst_sel_z << SQ_VTX_WORD1_DST_SEL_Z_SHIFT)
+#define SQ_VTX_WORD1_SET_DST_SEL_W(sq_vtx_word1_reg, dst_sel_w) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_DST_SEL_W_MASK) | (dst_sel_w << SQ_VTX_WORD1_DST_SEL_W_SHIFT)
+#define SQ_VTX_WORD1_SET_USE_CONST_FIELDS(sq_vtx_word1_reg, use_const_fields) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_USE_CONST_FIELDS_MASK) | (use_const_fields << SQ_VTX_WORD1_USE_CONST_FIELDS_SHIFT)
+#define SQ_VTX_WORD1_SET_DATA_FORMAT(sq_vtx_word1_reg, data_format) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_DATA_FORMAT_MASK) | (data_format << SQ_VTX_WORD1_DATA_FORMAT_SHIFT)
+#define SQ_VTX_WORD1_SET_NUM_FORMAT_ALL(sq_vtx_word1_reg, num_format_all) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_NUM_FORMAT_ALL_MASK) | (num_format_all << SQ_VTX_WORD1_NUM_FORMAT_ALL_SHIFT)
+#define SQ_VTX_WORD1_SET_FORMAT_COMP_ALL(sq_vtx_word1_reg, format_comp_all) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_FORMAT_COMP_ALL_MASK) | (format_comp_all << SQ_VTX_WORD1_FORMAT_COMP_ALL_SHIFT)
+#define SQ_VTX_WORD1_SET_SRF_MODE_ALL(sq_vtx_word1_reg, srf_mode_all) \
+     sq_vtx_word1_reg = (sq_vtx_word1_reg & ~SQ_VTX_WORD1_SRF_MODE_ALL_MASK) | (srf_mode_all << SQ_VTX_WORD1_SRF_MODE_ALL_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_t {
+          unsigned int                                : 9;
+          unsigned int dst_sel_x                      : SQ_VTX_WORD1_DST_SEL_X_SIZE;
+          unsigned int dst_sel_y                      : SQ_VTX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_z                      : SQ_VTX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_w                      : SQ_VTX_WORD1_DST_SEL_W_SIZE;
+          unsigned int use_const_fields               : SQ_VTX_WORD1_USE_CONST_FIELDS_SIZE;
+          unsigned int data_format                    : SQ_VTX_WORD1_DATA_FORMAT_SIZE;
+          unsigned int num_format_all                 : SQ_VTX_WORD1_NUM_FORMAT_ALL_SIZE;
+          unsigned int format_comp_all                : SQ_VTX_WORD1_FORMAT_COMP_ALL_SIZE;
+          unsigned int srf_mode_all                   : SQ_VTX_WORD1_SRF_MODE_ALL_SIZE;
+     } sq_vtx_word1_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_t {
+          unsigned int srf_mode_all                   : SQ_VTX_WORD1_SRF_MODE_ALL_SIZE;
+          unsigned int format_comp_all                : SQ_VTX_WORD1_FORMAT_COMP_ALL_SIZE;
+          unsigned int num_format_all                 : SQ_VTX_WORD1_NUM_FORMAT_ALL_SIZE;
+          unsigned int data_format                    : SQ_VTX_WORD1_DATA_FORMAT_SIZE;
+          unsigned int use_const_fields               : SQ_VTX_WORD1_USE_CONST_FIELDS_SIZE;
+          unsigned int dst_sel_w                      : SQ_VTX_WORD1_DST_SEL_W_SIZE;
+          unsigned int dst_sel_z                      : SQ_VTX_WORD1_DST_SEL_Z_SIZE;
+          unsigned int dst_sel_y                      : SQ_VTX_WORD1_DST_SEL_Y_SIZE;
+          unsigned int dst_sel_x                      : SQ_VTX_WORD1_DST_SEL_X_SIZE;
+          unsigned int                                : 9;
+     } sq_vtx_word1_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word1_t f;
+} sq_vtx_word1_u;
+
+
+/*
+ * SQ_VTX_WORD1_GPR struct
+ */
+
+#define SQ_VTX_WORD1_GPR_DST_GPR_SIZE  7
+#define SQ_VTX_WORD1_GPR_DST_REL_SIZE  1
+
+#define SQ_VTX_WORD1_GPR_DST_GPR_SHIFT 0
+#define SQ_VTX_WORD1_GPR_DST_REL_SHIFT 7
+
+#define SQ_VTX_WORD1_GPR_DST_GPR_MASK  0x0000007f
+#define SQ_VTX_WORD1_GPR_DST_REL_MASK  0x00000080
+
+#define SQ_VTX_WORD1_GPR_MASK \
+     (SQ_VTX_WORD1_GPR_DST_GPR_MASK | \
+      SQ_VTX_WORD1_GPR_DST_REL_MASK)
+
+#define SQ_VTX_WORD1_GPR_DEFAULT       0x000000cd
+
+#define SQ_VTX_WORD1_GPR_GET_DST_GPR(sq_vtx_word1_gpr) \
+     ((sq_vtx_word1_gpr & SQ_VTX_WORD1_GPR_DST_GPR_MASK) >> SQ_VTX_WORD1_GPR_DST_GPR_SHIFT)
+#define SQ_VTX_WORD1_GPR_GET_DST_REL(sq_vtx_word1_gpr) \
+     ((sq_vtx_word1_gpr & SQ_VTX_WORD1_GPR_DST_REL_MASK) >> SQ_VTX_WORD1_GPR_DST_REL_SHIFT)
+
+#define SQ_VTX_WORD1_GPR_SET_DST_GPR(sq_vtx_word1_gpr_reg, dst_gpr) \
+     sq_vtx_word1_gpr_reg = (sq_vtx_word1_gpr_reg & ~SQ_VTX_WORD1_GPR_DST_GPR_MASK) | (dst_gpr << SQ_VTX_WORD1_GPR_DST_GPR_SHIFT)
+#define SQ_VTX_WORD1_GPR_SET_DST_REL(sq_vtx_word1_gpr_reg, dst_rel) \
+     sq_vtx_word1_gpr_reg = (sq_vtx_word1_gpr_reg & ~SQ_VTX_WORD1_GPR_DST_REL_MASK) | (dst_rel << SQ_VTX_WORD1_GPR_DST_REL_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_gpr_t {
+          unsigned int dst_gpr                        : SQ_VTX_WORD1_GPR_DST_GPR_SIZE;
+          unsigned int dst_rel                        : SQ_VTX_WORD1_GPR_DST_REL_SIZE;
+          unsigned int                                : 24;
+     } sq_vtx_word1_gpr_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_gpr_t {
+          unsigned int                                : 24;
+          unsigned int dst_rel                        : SQ_VTX_WORD1_GPR_DST_REL_SIZE;
+          unsigned int dst_gpr                        : SQ_VTX_WORD1_GPR_DST_GPR_SIZE;
+     } sq_vtx_word1_gpr_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word1_gpr_t f;
+} sq_vtx_word1_gpr_u;
+
+
+/*
+ * SQ_VTX_WORD1_SEM struct
+ */
+
+#define SQ_VTX_WORD1_SEM_SEMANTIC_ID_SIZE 8
+
+#define SQ_VTX_WORD1_SEM_SEMANTIC_ID_SHIFT 0
+
+#define SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK 0x000000ff
+
+#define SQ_VTX_WORD1_SEM_MASK \
+     (SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK)
+
+#define SQ_VTX_WORD1_SEM_DEFAULT       0x000000cd
+
+#define SQ_VTX_WORD1_SEM_GET_SEMANTIC_ID(sq_vtx_word1_sem) \
+     ((sq_vtx_word1_sem & SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK) >> SQ_VTX_WORD1_SEM_SEMANTIC_ID_SHIFT)
+
+#define SQ_VTX_WORD1_SEM_SET_SEMANTIC_ID(sq_vtx_word1_sem_reg, semantic_id) \
+     sq_vtx_word1_sem_reg = (sq_vtx_word1_sem_reg & ~SQ_VTX_WORD1_SEM_SEMANTIC_ID_MASK) | (semantic_id << SQ_VTX_WORD1_SEM_SEMANTIC_ID_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_sem_t {
+          unsigned int semantic_id                    : SQ_VTX_WORD1_SEM_SEMANTIC_ID_SIZE;
+          unsigned int                                : 24;
+     } sq_vtx_word1_sem_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word1_sem_t {
+          unsigned int                                : 24;
+          unsigned int semantic_id                    : SQ_VTX_WORD1_SEM_SEMANTIC_ID_SIZE;
+     } sq_vtx_word1_sem_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word1_sem_t f;
+} sq_vtx_word1_sem_u;
+
+
+/*
+ * SQ_VTX_WORD2 struct
+ */
+
+#define SQ_VTX_WORD2_OFFSET_SIZE       16
+#define SQ_VTX_WORD2_ENDIAN_SWAP_SIZE  2
+#define SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SIZE 1
+#define SQ_VTX_WORD2_MEGA_FETCH_SIZE   1
+#define SQ_VTX_WORD2_ALT_CONST_SIZE    1
+
+#define SQ_VTX_WORD2_OFFSET_SHIFT      0
+#define SQ_VTX_WORD2_ENDIAN_SWAP_SHIFT 16
+#define SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SHIFT 18
+#define SQ_VTX_WORD2_MEGA_FETCH_SHIFT  19
+#define SQ_VTX_WORD2_ALT_CONST_SHIFT   20
+
+#define SQ_VTX_WORD2_OFFSET_MASK       0x0000ffff
+#define SQ_VTX_WORD2_ENDIAN_SWAP_MASK  0x00030000
+#define SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK 0x00040000
+#define SQ_VTX_WORD2_MEGA_FETCH_MASK   0x00080000
+#define SQ_VTX_WORD2_ALT_CONST_MASK    0x00100000
+
+#define SQ_VTX_WORD2_MASK \
+     (SQ_VTX_WORD2_OFFSET_MASK | \
+      SQ_VTX_WORD2_ENDIAN_SWAP_MASK | \
+      SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK | \
+      SQ_VTX_WORD2_MEGA_FETCH_MASK | \
+      SQ_VTX_WORD2_ALT_CONST_MASK)
+
+#define SQ_VTX_WORD2_DEFAULT           0x000dcdcd
+
+#define SQ_VTX_WORD2_GET_OFFSET(sq_vtx_word2) \
+     ((sq_vtx_word2 & SQ_VTX_WORD2_OFFSET_MASK) >> SQ_VTX_WORD2_OFFSET_SHIFT)
+#define SQ_VTX_WORD2_GET_ENDIAN_SWAP(sq_vtx_word2) \
+     ((sq_vtx_word2 & SQ_VTX_WORD2_ENDIAN_SWAP_MASK) >> SQ_VTX_WORD2_ENDIAN_SWAP_SHIFT)
+#define SQ_VTX_WORD2_GET_CONST_BUF_NO_STRIDE(sq_vtx_word2) \
+     ((sq_vtx_word2 & SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK) >> SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SHIFT)
+#define SQ_VTX_WORD2_GET_MEGA_FETCH(sq_vtx_word2) \
+     ((sq_vtx_word2 & SQ_VTX_WORD2_MEGA_FETCH_MASK) >> SQ_VTX_WORD2_MEGA_FETCH_SHIFT)
+#define SQ_VTX_WORD2_GET_ALT_CONST(sq_vtx_word2) \
+     ((sq_vtx_word2 & SQ_VTX_WORD2_ALT_CONST_MASK) >> SQ_VTX_WORD2_ALT_CONST_SHIFT)
+
+#define SQ_VTX_WORD2_SET_OFFSET(sq_vtx_word2_reg, offset) \
+     sq_vtx_word2_reg = (sq_vtx_word2_reg & ~SQ_VTX_WORD2_OFFSET_MASK) | (offset << SQ_VTX_WORD2_OFFSET_SHIFT)
+#define SQ_VTX_WORD2_SET_ENDIAN_SWAP(sq_vtx_word2_reg, endian_swap) \
+     sq_vtx_word2_reg = (sq_vtx_word2_reg & ~SQ_VTX_WORD2_ENDIAN_SWAP_MASK) | (endian_swap << SQ_VTX_WORD2_ENDIAN_SWAP_SHIFT)
+#define SQ_VTX_WORD2_SET_CONST_BUF_NO_STRIDE(sq_vtx_word2_reg, const_buf_no_stride) \
+     sq_vtx_word2_reg = (sq_vtx_word2_reg & ~SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_MASK) | (const_buf_no_stride << SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SHIFT)
+#define SQ_VTX_WORD2_SET_MEGA_FETCH(sq_vtx_word2_reg, mega_fetch) \
+     sq_vtx_word2_reg = (sq_vtx_word2_reg & ~SQ_VTX_WORD2_MEGA_FETCH_MASK) | (mega_fetch << SQ_VTX_WORD2_MEGA_FETCH_SHIFT)
+#define SQ_VTX_WORD2_SET_ALT_CONST(sq_vtx_word2_reg, alt_const) \
+     sq_vtx_word2_reg = (sq_vtx_word2_reg & ~SQ_VTX_WORD2_ALT_CONST_MASK) | (alt_const << SQ_VTX_WORD2_ALT_CONST_SHIFT)
+
+#if		defined(LITTLEENDIAN_CPU)
+
+     typedef struct _sq_vtx_word2_t {
+          unsigned int offset                         : SQ_VTX_WORD2_OFFSET_SIZE;
+          unsigned int endian_swap                    : SQ_VTX_WORD2_ENDIAN_SWAP_SIZE;
+          unsigned int const_buf_no_stride            : SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SIZE;
+          unsigned int mega_fetch                     : SQ_VTX_WORD2_MEGA_FETCH_SIZE;
+          unsigned int alt_const                      : SQ_VTX_WORD2_ALT_CONST_SIZE;
+          unsigned int                                : 11;
+     } sq_vtx_word2_t;
+
+#elif		defined(BIGENDIAN_CPU)
+
+     typedef struct _sq_vtx_word2_t {
+          unsigned int                                : 11;
+          unsigned int alt_const                      : SQ_VTX_WORD2_ALT_CONST_SIZE;
+          unsigned int mega_fetch                     : SQ_VTX_WORD2_MEGA_FETCH_SIZE;
+          unsigned int const_buf_no_stride            : SQ_VTX_WORD2_CONST_BUF_NO_STRIDE_SIZE;
+          unsigned int endian_swap                    : SQ_VTX_WORD2_ENDIAN_SWAP_SIZE;
+          unsigned int offset                         : SQ_VTX_WORD2_OFFSET_SIZE;
+     } sq_vtx_word2_t;
+
+#endif
+
+typedef union {
+     unsigned int val : 32;
+     sq_vtx_word2_t f;
+} sq_vtx_word2_u;
+
+#endif /* _SQ_MICRO_REG_H */
+
+
diff --git a/src/mesa/drivers/dri/radeon/Makefile b/src/mesa/drivers/dri/radeon/Makefile
index f223b2d9228..b59ad68f445 100644
--- a/src/mesa/drivers/dri/radeon/Makefile
+++ b/src/mesa/drivers/dri/radeon/Makefile
@@ -4,32 +4,51 @@
 TOP = ../../../../..
 include $(TOP)/configs/current
 
+CFLAGS += $(RADEON_CFLAGS)
+
 LIBNAME = radeon_dri.so
 
 MINIGLX_SOURCES = server/radeon_dri.c 
 
+ifeq ($(RADEON_LDFLAGS),)
+CS_SOURCES = radeon_cs_space_drm.c
+endif
+
+RADEON_COMMON_SOURCES = \
+	radeon_texture.c \
+	radeon_common_context.c \
+	radeon_common.c \
+	radeon_dma.c \
+	radeon_lock.c \
+	radeon_bo_legacy.c \
+	radeon_cs_legacy.c \
+	radeon_mipmap_tree.c \
+	radeon_span.c \
+	radeon_fbo.c
+
 DRIVER_SOURCES = \
 	radeon_context.c \
 	radeon_ioctl.c \
-	radeon_lock.c \
 	radeon_screen.c \
 	radeon_state.c \
 	radeon_state_init.c \
 	radeon_tex.c \
-	radeon_texmem.c \
 	radeon_texstate.c \
 	radeon_tcl.c \
 	radeon_swtcl.c \
-	radeon_span.c \
 	radeon_maos.c \
-	radeon_sanity.c 
+	radeon_sanity.c \
+	$(RADEON_COMMON_SOURCES)
 
 C_SOURCES = \
 	$(COMMON_SOURCES) \
-	$(DRIVER_SOURCES) 
+	$(DRIVER_SOURCES) \
+	$(CS_SOURCES)
 
 DRIVER_DEFINES = -DRADEON_COMMON=0
 
+DRI_LIB_DEPS += $(RADEON_LDFLAGS)
+
 X86_SOURCES = 
 
 include ../Makefile.template
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_drm.h b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
new file mode 100644
index 00000000000..8eeaea1cb20
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_drm.h
@@ -0,0 +1,217 @@
+/* 
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Jérôme Glisse <[email protected]>
+ */
+#ifndef RADEON_BO_H
+#define RADEON_BO_H
+
+#include <stdio.h>
+#include <stdint.h>
+//#include "radeon_track.h"
+
+#ifndef RADEON_DEBUG_BO
+#define RADEON_DEBUG_BO 0
+#endif
+
+/* bo object */
+#define RADEON_BO_FLAGS_MACRO_TILE  1
+#define RADEON_BO_FLAGS_MICRO_TILE  2
+
+struct radeon_bo_manager;
+
+struct radeon_bo {
+    uint32_t                    alignment;
+    uint32_t                    handle;
+    uint32_t                    size;
+    uint32_t                    domains;
+    uint32_t                    flags;
+    unsigned                    cref;
+#ifdef RADEON_BO_TRACK
+    struct radeon_track         *track;
+#endif
+    void                        *ptr;
+    struct radeon_bo_manager    *bom;
+    uint32_t                    space_accounted;
+};
+
+/* bo functions */
+struct radeon_bo_funcs {
+#ifdef RADEON_DEBUG_BO
+    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
+                                 uint32_t handle,
+                                 uint32_t size,
+                                 uint32_t alignment,
+                                 uint32_t domains,
+                                 uint32_t flags,
+                                 char * szBufUsage);
+#else
+    struct radeon_bo *(*bo_open)(struct radeon_bo_manager *bom,
+                                 uint32_t handle,
+                                 uint32_t size,
+                                 uint32_t alignment,
+                                 uint32_t domains,
+                                 uint32_t flags);
+#endif /* RADEON_DEBUG_BO */
+    void (*bo_ref)(struct radeon_bo *bo);
+    struct radeon_bo *(*bo_unref)(struct radeon_bo *bo);
+    int (*bo_map)(struct radeon_bo *bo, int write);
+    int (*bo_unmap)(struct radeon_bo *bo);
+    int (*bo_wait)(struct radeon_bo *bo);
+    int (*bo_is_static)(struct radeon_bo *bo);
+};
+
+struct radeon_bo_manager {
+    struct radeon_bo_funcs  *funcs;
+    int                     fd;
+
+#ifdef RADEON_BO_TRACK
+    struct radeon_tracker   tracker;
+#endif
+};
+    
+static inline void _radeon_bo_debug(struct radeon_bo *bo,
+                                    const char *op,
+                                    const char *file,
+                                    const char *func,
+                                    int line)
+{
+    fprintf(stderr, "%s %p 0x%08X 0x%08X 0x%08X [%s %s %d]\n",
+            op, bo, bo->handle, bo->size, bo->cref, file, func, line);
+}
+
+static inline struct radeon_bo *_radeon_bo_open(struct radeon_bo_manager *bom,
+                                                uint32_t handle,
+                                                uint32_t size,
+                                                uint32_t alignment,
+                                                uint32_t domains,
+                                                uint32_t flags,
+#ifdef RADEON_DEBUG_BO
+                                                char * szBufUsage,
+#endif /* RADEON_DEBUG_BO */
+                                                const char *file,
+                                                const char *func,
+                                                int line)
+{
+    struct radeon_bo *bo;
+
+#ifdef RADEON_DEBUG_BO
+    bo = bom->funcs->bo_open(bom, handle, size, alignment, domains, flags, szBufUsage);
+#else
+    bo = bom->funcs->bo_open(bom, handle, size, alignment, domains, flags);
+#endif /* RADEON_DEBUG_BO */
+
+#ifdef RADEON_BO_TRACK
+    if (bo) {
+        bo->track = radeon_tracker_add_track(&bom->tracker, bo->handle);
+        radeon_track_add_event(bo->track, file, func, "open", line);
+    }
+#endif
+    return bo;
+}
+
+static inline void _radeon_bo_ref(struct radeon_bo *bo,
+                                  const char *file,
+                                  const char *func,
+                                  int line)
+{
+    bo->cref++;
+#ifdef RADEON_BO_TRACK
+    radeon_track_add_event(bo->track, file, func, "ref", line); 
+#endif
+    bo->bom->funcs->bo_ref(bo);
+}
+
+static inline struct radeon_bo *_radeon_bo_unref(struct radeon_bo *bo,
+                                                 const char *file,
+                                                 const char *func,
+                                                 int line)
+{
+    bo->cref--;
+#ifdef RADEON_BO_TRACK
+    radeon_track_add_event(bo->track, file, func, "unref", line);
+    if (bo->cref <= 0) {
+        radeon_tracker_remove_track(&bo->bom->tracker, bo->track);
+        bo->track = NULL;
+    }
+#endif
+    return bo->bom->funcs->bo_unref(bo);
+}
+
+static inline int _radeon_bo_map(struct radeon_bo *bo,
+                                 int write,
+                                 const char *file,
+                                 const char *func,
+                                 int line)
+{
+    return bo->bom->funcs->bo_map(bo, write);
+}
+
+static inline int _radeon_bo_unmap(struct radeon_bo *bo,
+                                   const char *file,
+                                   const char *func,
+                                   int line)
+{
+    return bo->bom->funcs->bo_unmap(bo);
+}
+
+static inline int _radeon_bo_wait(struct radeon_bo *bo,
+                                  const char *file,
+                                  const char *func,
+                                  int line)
+{
+    return bo->bom->funcs->bo_wait(bo);
+}
+
+static inline int radeon_bo_is_static(struct radeon_bo *bo)
+{
+	if (bo->bom->funcs->bo_is_static)
+		return bo->bom->funcs->bo_is_static(bo);
+	return 0;
+}
+
+#ifdef RADEON_DEBUG_BO
+#define radeon_bo_open(bom, h, s, a, d, f, u)\
+    _radeon_bo_open(bom, h, s, a, d, f, u, __FILE__, __FUNCTION__, __LINE__)
+#else
+#define radeon_bo_open(bom, h, s, a, d, f)\
+    _radeon_bo_open(bom, h, s, a, d, f, __FILE__, __FUNCTION__, __LINE__)
+#endif /* RADEON_DEBUG_BO */
+#define radeon_bo_ref(bo)\
+    _radeon_bo_ref(bo, __FILE__, __FUNCTION__, __LINE__)
+#define radeon_bo_unref(bo)\
+    _radeon_bo_unref(bo, __FILE__, __FUNCTION__, __LINE__)
+#define radeon_bo_map(bo, w)\
+    _radeon_bo_map(bo, w, __FILE__, __FUNCTION__, __LINE__)
+#define radeon_bo_unmap(bo)\
+    _radeon_bo_unmap(bo, __FILE__, __FUNCTION__, __LINE__)
+#define radeon_bo_debug(bo, opcode)\
+    _radeon_bo_debug(bo, opcode, __FILE__, __FUNCTION__, __LINE__)
+#define radeon_bo_wait(bo) \
+    _radeon_bo_wait(bo, __FILE__, __func__, __LINE__)
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
new file mode 100644
index 00000000000..992eb4611b1
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.c
@@ -0,0 +1,906 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Dave Airlie
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <[email protected]>
+ *      Nicolai Haehnle <[email protected]>
+ *      Dave Airlie
+ *      Jérôme Glisse <[email protected]>
+ */
+#include <stdio.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include "xf86drm.h"
+#include "texmem.h"
+#include "main/simple_list.h"
+
+#include "drm.h"
+#include "radeon_drm.h"
+#include "radeon_common.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_macros.h"
+
+/* no seriously texmem.c is this screwed up */
+struct bo_legacy_texture_object {
+    driTextureObject    base;
+    struct bo_legacy *parent;
+};
+
+struct bo_legacy {
+    struct radeon_bo    base;
+    int                 map_count;
+    uint32_t            pending;
+    int                 is_pending;
+    int                 static_bo;
+    uint32_t            offset;
+    struct bo_legacy_texture_object *tobj;
+    int                 validated;
+    int                 dirty;
+    void                *ptr;
+    struct bo_legacy    *next, *prev;
+    struct bo_legacy    *pnext, *pprev;
+#ifdef RADEON_DEBUG_BO
+    char                szBufUsage[16];
+#endif /* RADEON_DEBUG_BO */
+};
+
+struct bo_manager_legacy {
+    struct radeon_bo_manager    base;
+    unsigned                    nhandle;
+    unsigned                    nfree_handles;
+    unsigned                    cfree_handles;
+    uint32_t                    current_age;
+    struct bo_legacy            bos;
+    struct bo_legacy            pending_bos;
+    uint32_t                    fb_location;
+    uint32_t                    texture_offset;
+    unsigned                    dma_alloc_size;
+    uint32_t                    dma_buf_count;
+    unsigned                    cpendings;
+    driTextureObject            texture_swapped;
+    driTexHeap                  *texture_heap;
+    struct radeon_screen        *screen;
+    unsigned                    *free_handles;
+};
+
+static void bo_legacy_tobj_destroy(void *data, driTextureObject *t)
+{
+    struct bo_legacy_texture_object *tobj = (struct bo_legacy_texture_object *)t;
+    
+    if (tobj->parent) {
+        tobj->parent->tobj = NULL;
+        tobj->parent->validated = 0;
+    }
+}
+
+static void inline clean_handles(struct bo_manager_legacy *bom)
+{
+  while (bom->cfree_handles > 0 &&
+	 !bom->free_handles[bom->cfree_handles - 1])
+    bom->cfree_handles--;
+
+}
+static int legacy_new_handle(struct bo_manager_legacy *bom, uint32_t *handle)
+{
+    uint32_t tmp;
+
+    *handle = 0;
+    if (bom->nhandle == 0xFFFFFFFF) {
+        return -EINVAL;
+    }
+    if (bom->cfree_handles > 0) {
+        tmp = bom->free_handles[--bom->cfree_handles];
+	clean_handles(bom);
+    } else {
+        bom->cfree_handles = 0;
+        tmp = bom->nhandle++;
+    }
+    assert(tmp);
+    *handle = tmp;
+    return 0;
+}
+
+static int legacy_free_handle(struct bo_manager_legacy *bom, uint32_t handle)
+{
+    uint32_t *handles;
+
+    if (!handle) {
+        return 0;
+    }
+    if (handle == (bom->nhandle - 1)) {
+        int i;
+
+        bom->nhandle--;
+        for (i = bom->cfree_handles - 1; i >= 0; i--) {
+            if (bom->free_handles[i] == (bom->nhandle - 1)) {
+                bom->nhandle--;
+                bom->free_handles[i] = 0;
+            }
+        }
+        clean_handles(bom);
+        return 0;
+    }
+    if (bom->cfree_handles < bom->nfree_handles) {
+        bom->free_handles[bom->cfree_handles++] = handle;
+        return 0;
+    }
+    bom->nfree_handles += 0x100;
+    handles = (uint32_t*)realloc(bom->free_handles, bom->nfree_handles * 4);
+    if (handles == NULL) {
+        bom->nfree_handles -= 0x100;
+        return -ENOMEM;
+    }
+    bom->free_handles = handles;
+    bom->free_handles[bom->cfree_handles++] = handle;
+    return 0;
+}
+
+static void legacy_get_current_age(struct bo_manager_legacy *boml)
+{
+    drm_radeon_getparam_t gp;
+    unsigned char *RADEONMMIO = NULL;
+    int r;
+
+    if (   IS_R300_CLASS(boml->screen) 
+        || IS_R600_CLASS(boml->screen) ) 
+    {
+    	gp.param = RADEON_PARAM_LAST_CLEAR;
+    	gp.value = (int *)&boml->current_age;
+    	r = drmCommandWriteRead(boml->base.fd, DRM_RADEON_GETPARAM,
+       	                     &gp, sizeof(gp));
+    	if (r) {
+       	 fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__, r);
+         exit(1);
+       }
+    } 
+    else {
+        RADEONMMIO = boml->screen->mmio.map;
+        boml->current_age = boml->screen->scratch[3];
+        boml->current_age = INREG(RADEON_GUI_SCRATCH_REG3);
+    }
+}
+
+static int legacy_is_pending(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    if (bo_legacy->is_pending <= 0) {
+        bo_legacy->is_pending = 0;
+        return 0;
+    }
+    if (boml->current_age >= bo_legacy->pending) {
+        if (boml->pending_bos.pprev == bo_legacy) {
+            boml->pending_bos.pprev = bo_legacy->pprev;
+        }
+        bo_legacy->pprev->pnext = bo_legacy->pnext;
+        if (bo_legacy->pnext) {
+            bo_legacy->pnext->pprev = bo_legacy->pprev;
+        }
+	assert(bo_legacy->is_pending <= bo->cref);
+        while (bo_legacy->is_pending--) {
+	    bo = radeon_bo_unref(bo);
+	    if (!bo)
+	      break;
+        }
+	if (bo)
+	  bo_legacy->is_pending = 0;
+        boml->cpendings--;
+        return 0;
+    }
+    return 1;
+}
+
+static int legacy_wait_pending(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    if (!bo_legacy->is_pending) {
+        return 0;
+    }
+    /* FIXME: lockup and userspace busy looping that's all the folks */
+    legacy_get_current_age(boml);
+    while (legacy_is_pending(bo)) {
+        usleep(10);
+        legacy_get_current_age(boml);
+    }
+    return 0;
+}
+
+static void legacy_track_pending(struct bo_manager_legacy *boml, int debug)
+{
+    struct bo_legacy *bo_legacy;
+    struct bo_legacy *next;
+
+    legacy_get_current_age(boml);
+    bo_legacy = boml->pending_bos.pnext;
+    while (bo_legacy) {
+        if (debug)
+	  fprintf(stderr,"pending %p %d %d %d\n", bo_legacy, bo_legacy->base.size,
+		  boml->current_age, bo_legacy->pending);
+        next = bo_legacy->pnext;
+        if (legacy_is_pending(&(bo_legacy->base))) {
+        }
+        bo_legacy = next;
+    } 
+}
+
+static int legacy_wait_any_pending(struct bo_manager_legacy *boml)
+{
+    struct bo_legacy *bo_legacy;
+
+    legacy_get_current_age(boml);
+    bo_legacy = boml->pending_bos.pnext;
+    if (!bo_legacy)
+      return -1;
+    legacy_wait_pending(&bo_legacy->base);
+    return 0;
+}
+
+static void legacy_kick_all_buffers(struct bo_manager_legacy *boml)
+{
+    struct bo_legacy *legacy;
+
+    legacy = boml->bos.next;
+    while (legacy != &boml->bos) {
+	if (legacy->tobj) {
+	    if (legacy->validated) {
+		driDestroyTextureObject(&legacy->tobj->base);
+		legacy->tobj = 0;
+		legacy->validated = 0;
+	    }
+	}
+	legacy = legacy->next;
+    }
+}
+
+static struct bo_legacy *bo_allocate(struct bo_manager_legacy *boml,
+                                     uint32_t size,
+                                     uint32_t alignment,
+                                     uint32_t domains,
+#ifdef RADEON_DEBUG_BO
+                                     uint32_t flags,
+                                     char * szBufUsage)
+#else
+                                     uint32_t flags)
+#endif /* RADEON_DEBUG_BO */
+{
+    struct bo_legacy *bo_legacy;
+    static int pgsize;
+
+    if (pgsize == 0)
+        pgsize = getpagesize() - 1;
+
+    size = (size + pgsize) & ~pgsize;
+
+    bo_legacy = (struct bo_legacy*)calloc(1, sizeof(struct bo_legacy));
+    if (bo_legacy == NULL) {
+        return NULL;
+    }
+    bo_legacy->base.bom = (struct radeon_bo_manager*)boml;
+    bo_legacy->base.handle = 0;
+    bo_legacy->base.size = size;
+    bo_legacy->base.alignment = alignment;
+    bo_legacy->base.domains = domains;
+    bo_legacy->base.flags = flags;
+    bo_legacy->base.ptr = NULL;
+    bo_legacy->map_count = 0;
+    bo_legacy->next = NULL;
+    bo_legacy->prev = NULL;
+    bo_legacy->pnext = NULL;
+    bo_legacy->pprev = NULL;
+    bo_legacy->next = boml->bos.next;
+    bo_legacy->prev = &boml->bos;
+    boml->bos.next = bo_legacy;
+    if (bo_legacy->next) {
+        bo_legacy->next->prev = bo_legacy;
+    }
+
+#ifdef RADEON_DEBUG_BO
+    sprintf(bo_legacy->szBufUsage, "%s", szBufUsage); 
+#endif /* RADEON_DEBUG_BO */
+
+    return bo_legacy;
+}
+
+static int bo_dma_alloc(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    drm_radeon_mem_alloc_t alloc;
+    unsigned size;
+    int base_offset;
+    int r;
+
+    /* align size on 4Kb */
+    size = (((4 * 1024) - 1) + bo->size) & ~((4 * 1024) - 1);
+    alloc.region = RADEON_MEM_REGION_GART;
+    alloc.alignment = bo_legacy->base.alignment;
+    alloc.size = size;
+    alloc.region_offset = &base_offset;
+    r = drmCommandWriteRead(bo->bom->fd,
+                            DRM_RADEON_ALLOC,
+                            &alloc,
+                            sizeof(alloc));
+    if (r) {
+        /* ptr is set to NULL if dma allocation failed */
+        bo_legacy->ptr = NULL;
+        return r;
+    }
+    bo_legacy->ptr = boml->screen->gartTextures.map + base_offset;
+    bo_legacy->offset = boml->screen->gart_texture_offset + base_offset;
+    bo->size = size;
+    boml->dma_alloc_size += size;
+    boml->dma_buf_count++;
+    return 0;
+}
+
+static int bo_dma_free(struct radeon_bo *bo)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    drm_radeon_mem_free_t memfree;
+    int r;
+
+    if (bo_legacy->ptr == NULL) {
+        /* ptr is set to NULL if dma allocation failed */
+        return 0;
+    }
+    legacy_get_current_age(boml);
+    memfree.region = RADEON_MEM_REGION_GART;
+    memfree.region_offset  = bo_legacy->offset;
+    memfree.region_offset -= boml->screen->gart_texture_offset;
+    r = drmCommandWrite(boml->base.fd,
+                        DRM_RADEON_FREE,
+                        &memfree,
+                        sizeof(memfree));
+    if (r) {
+        fprintf(stderr, "Failed to free bo[%p] at %08x\n",
+                &bo_legacy->base, memfree.region_offset);
+        fprintf(stderr, "ret = %s\n", strerror(-r));
+        return r;
+    }
+    boml->dma_alloc_size -= bo_legacy->base.size;
+    boml->dma_buf_count--;
+    return 0;
+}
+
+static void bo_free(struct bo_legacy *bo_legacy)
+{
+    struct bo_manager_legacy *boml;
+
+    if (bo_legacy == NULL) {
+        return;
+    }
+    boml = (struct bo_manager_legacy *)bo_legacy->base.bom;
+    bo_legacy->prev->next = bo_legacy->next;
+    if (bo_legacy->next) {
+        bo_legacy->next->prev = bo_legacy->prev;
+    }
+    if (!bo_legacy->static_bo) {
+        legacy_free_handle(boml, bo_legacy->base.handle);
+        if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) {
+            /* dma buffers */
+            bo_dma_free(&bo_legacy->base);
+        } else {
+  	    driDestroyTextureObject(&bo_legacy->tobj->base);
+	    bo_legacy->tobj = NULL;
+            /* free backing store */
+            free(bo_legacy->ptr);
+        }
+    }
+    memset(bo_legacy, 0 , sizeof(struct bo_legacy));
+    free(bo_legacy);
+}
+
+static struct radeon_bo *bo_open(struct radeon_bo_manager *bom,
+                                 uint32_t handle,
+                                 uint32_t size,
+                                 uint32_t alignment,
+                                 uint32_t domains,
+#ifdef RADEON_DEBUG_BO
+                                 uint32_t flags,
+                                 char *   szBufUsage)
+#else
+                                 uint32_t flags)
+#endif /* RADEON_DEBUG_BO */
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bom;
+    struct bo_legacy *bo_legacy;
+    int r;
+
+    if (handle) {
+        bo_legacy = boml->bos.next;
+        while (bo_legacy) {
+            if (bo_legacy->base.handle == handle) {
+                radeon_bo_ref(&(bo_legacy->base));
+                return (struct radeon_bo*)bo_legacy;
+            }
+            bo_legacy = bo_legacy->next;
+        }
+        return NULL;
+    }
+#ifdef RADEON_DEBUG_BO
+    bo_legacy = bo_allocate(boml, size, alignment, domains, flags, szBufUsage);
+#else
+    bo_legacy = bo_allocate(boml, size, alignment, domains, flags);
+#endif /* RADEON_DEBUG_BO */
+    bo_legacy->static_bo = 0;
+    r = legacy_new_handle(boml, &bo_legacy->base.handle);
+    if (r) {
+        bo_free(bo_legacy);
+        return NULL;
+    }
+    if (bo_legacy->base.domains & RADEON_GEM_DOMAIN_GTT) 
+    {
+retry:
+        legacy_track_pending(boml, 0);
+        /* dma buffers */
+
+        r = bo_dma_alloc(&(bo_legacy->base));
+        if (r) 
+        {
+	         if (legacy_wait_any_pending(boml) == -1) 
+             {
+                  bo_free(bo_legacy);
+	              return NULL;
+             }
+	         goto retry;
+	         return NULL;
+        }
+    } 
+    else 
+    {
+        bo_legacy->ptr = malloc(bo_legacy->base.size);
+        if (bo_legacy->ptr == NULL) {
+            bo_free(bo_legacy);
+            return NULL;
+        }
+    }
+    radeon_bo_ref(&(bo_legacy->base));
+
+    return (struct radeon_bo*)bo_legacy;
+}
+
+static void bo_ref(struct radeon_bo *bo)
+{
+}
+
+static struct radeon_bo *bo_unref(struct radeon_bo *bo)
+{
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    if (bo->cref <= 0) {
+        bo_legacy->prev->next = bo_legacy->next;
+        if (bo_legacy->next) {
+            bo_legacy->next->prev = bo_legacy->prev;
+        }
+        if (!bo_legacy->is_pending) {
+            bo_free(bo_legacy);
+        }
+        return NULL;
+    }
+    return bo;
+}
+
+static int bo_map(struct radeon_bo *bo, int write)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    legacy_wait_pending(bo);
+    bo_legacy->validated = 0;
+    bo_legacy->dirty = 1;
+    bo_legacy->map_count++;
+    bo->ptr = bo_legacy->ptr;
+    /* Read the first pixel in the frame buffer.  This should
+     * be a noop, right?  In fact without this conform fails as reading
+     * from the framebuffer sometimes produces old results -- the
+     * on-card read cache gets mixed up and doesn't notice that the
+     * framebuffer has been updated.
+     *
+     * Note that we should probably be reading some otherwise unused
+     * region of VRAM, otherwise we might get incorrect results when
+     * reading pixels from the top left of the screen.
+     *
+     * I found this problem on an R420 with glean's texCube test.
+     * Note that the R200 span code also *writes* the first pixel in the
+     * framebuffer, but I've found this to be unnecessary.
+     *  -- Nicolai Hähnle, June 2008
+     */
+    if (!(bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+        int p;
+        volatile int *buf = (int*)boml->screen->driScreen->pFB;
+        p = *buf;
+    }
+
+    return 0;
+}
+
+static int bo_unmap(struct radeon_bo *bo)
+{
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    if (--bo_legacy->map_count > 0) 
+    {
+        return 0;
+    }
+    
+    bo->ptr = NULL;
+
+    return 0;
+}
+
+
+static int bo_is_static(struct radeon_bo *bo)
+{
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    return bo_legacy->static_bo;
+}
+
+static struct radeon_bo_funcs bo_legacy_funcs = {
+    bo_open,
+    bo_ref,
+    bo_unref,
+    bo_map,
+    bo_unmap,
+    NULL,
+    bo_is_static,
+};
+
+static int bo_vram_validate(struct radeon_bo *bo,
+                            uint32_t *soffset,
+                            uint32_t *eoffset)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    int r;
+    int retry_count = 0, pending_retry = 0;
+    
+    if (!bo_legacy->tobj) {
+	bo_legacy->tobj = CALLOC(sizeof(struct bo_legacy_texture_object));
+	bo_legacy->tobj->parent = bo_legacy;
+	make_empty_list(&bo_legacy->tobj->base);
+	bo_legacy->tobj->base.totalSize = bo->size;
+    retry:
+        r = driAllocateTexture(&boml->texture_heap, 1,
+                               &bo_legacy->tobj->base);
+        if (r) {
+		pending_retry = 0;
+		while(boml->cpendings && pending_retry++ < 10000) {
+			legacy_track_pending(boml, 0);
+			retry_count++;
+			if (retry_count > 2) {
+				free(bo_legacy->tobj);
+				bo_legacy->tobj = NULL;
+				fprintf(stderr, "Ouch! vram_validate failed %d\n", r);
+				return -1;
+			}
+			goto retry;
+		}
+	}
+        bo_legacy->offset = boml->texture_offset +
+                            bo_legacy->tobj->base.memBlock->ofs;
+        bo_legacy->dirty = 1;
+    }
+
+    assert(bo_legacy->tobj->base.memBlock);
+
+    if (bo_legacy->tobj)
+	driUpdateTextureLRU(&bo_legacy->tobj->base);
+
+    if (bo_legacy->dirty || bo_legacy->tobj->base.dirty_images[0]) {
+	    if (IS_R600_CLASS(boml->screen)) {
+		    char *src = bo_legacy->ptr;
+		    char *dst = (char *) boml->screen->driScreen->pFB +
+			    (bo_legacy->offset - boml->fb_location);
+
+		    /* FIXME: alignment, pitch, etc. */
+		    memcpy(dst, src, bo->size);
+	    } else {
+		    /* Copy to VRAM using a blit.
+		     * All memory is 4K aligned. We're using 1024 pixels wide blits.
+		     */
+		    drm_radeon_texture_t tex;
+		    drm_radeon_tex_image_t tmp;
+		    int ret;
+
+		    tex.offset = bo_legacy->offset;
+		    tex.image = &tmp;
+		    assert(!(tex.offset & 1023));
+
+		    tmp.x = 0;
+		    tmp.y = 0;
+		    if (bo->size < 4096) {
+			    tmp.width = (bo->size + 3) / 4;
+			    tmp.height = 1;
+		    } else {
+			    tmp.width = 1024;
+			    tmp.height = (bo->size + 4095) / 4096;
+		    }
+		    tmp.data = bo_legacy->ptr;
+		    tex.format = RADEON_TXFORMAT_ARGB8888;
+		    tex.width = tmp.width;
+		    tex.height = tmp.height;
+		    tex.pitch = MAX2(tmp.width / 16, 1);
+		    do {
+			    ret = drmCommandWriteRead(bo->bom->fd,
+						      DRM_RADEON_TEXTURE,
+						      &tex,
+						      sizeof(drm_radeon_texture_t));
+			    if (ret) {
+				    if (RADEON_DEBUG & DEBUG_IOCTL)
+					    fprintf(stderr, "DRM_RADEON_TEXTURE:  again!\n");
+				    usleep(1);
+			    }
+		    } while (ret == -EAGAIN);
+	    }
+	    bo_legacy->dirty = 0;
+	    bo_legacy->tobj->base.dirty_images[0] = 0;
+    }
+    return 0;
+}
+
+/* 
+ *  radeon_bo_legacy_validate -
+ *  returns:
+ *  0 - all good
+ *  -EINVAL - mapped buffer can't be validated
+ *  -EAGAIN - restart validation we've kicked all the buffers out
+ */
+int radeon_bo_legacy_validate(struct radeon_bo *bo,
+                              uint32_t *soffset,
+                              uint32_t *eoffset)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+    int r;
+    int retries = 0;
+
+    if (bo_legacy->map_count) {
+#ifdef RADEON_DEBUG_BO
+        fprintf(stderr, "bo(%p, %d, %s) is mapped (%d) can't valide it.\n",
+                bo, bo->size, bo_legacy->szBufUsage, bo_legacy->map_count);
+#else
+        fprintf(stderr, "bo(%p, %d) is mapped (%d) can't valide it.\n",
+                bo, bo->size, bo_legacy->map_count);
+#endif /* RADEON_DEBUG_BO */
+
+        return -EINVAL;
+    }
+    if (bo_legacy->static_bo || bo_legacy->validated) {
+        *soffset = bo_legacy->offset;
+        *eoffset = bo_legacy->offset + bo->size;
+
+        return 0;
+    }
+    if (!(bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+
+        r = bo_vram_validate(bo, soffset, eoffset);
+        if (r) {
+	    legacy_track_pending(boml, 0);
+	    legacy_kick_all_buffers(boml);
+	    retries++;
+	    if (retries == 2) {
+		fprintf(stderr,"legacy bo: failed to get relocations into aperture\n");
+		assert(0);
+		exit(-1);
+	    }
+	    return -EAGAIN;
+        }
+    }
+    *soffset = bo_legacy->offset;
+    *eoffset = bo_legacy->offset + bo->size;
+    bo_legacy->validated = 1;
+
+    return 0;
+}
+
+void radeon_bo_legacy_pending(struct radeon_bo *bo, uint32_t pending)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bo->bom;
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    bo_legacy->pending = pending;
+    bo_legacy->is_pending++;
+    /* add to pending list */
+    radeon_bo_ref(bo);
+    if (bo_legacy->is_pending > 1) {
+        return;    
+    }
+    bo_legacy->pprev = boml->pending_bos.pprev;
+    bo_legacy->pnext = NULL;
+    bo_legacy->pprev->pnext = bo_legacy;
+    boml->pending_bos.pprev = bo_legacy;
+    boml->cpendings++;
+}
+
+void radeon_bo_manager_legacy_dtor(struct radeon_bo_manager *bom)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bom;
+    struct bo_legacy *bo_legacy;
+
+    if (bom == NULL) {
+        return;
+    }
+    bo_legacy = boml->bos.next;
+    while (bo_legacy) {
+        struct bo_legacy *next;
+
+        next = bo_legacy->next;
+        bo_free(bo_legacy);
+        bo_legacy = next;
+    }
+    driDestroyTextureHeap(boml->texture_heap);
+    free(boml->free_handles);
+    free(boml);
+}
+
+static struct bo_legacy *radeon_legacy_bo_alloc_static(struct bo_manager_legacy *bom,
+						       int size, 
+#ifdef RADEON_DEBUG_BO
+                               uint32_t offset,
+                               char * szBufUsage)
+#else
+                               uint32_t offset)
+#endif /* RADEON_DEBUG_BO */
+{
+    struct bo_legacy *bo;
+
+#ifdef RADEON_DEBUG_BO
+    bo = bo_allocate(bom, size, 0, RADEON_GEM_DOMAIN_VRAM, 0, szBufUsage);
+#else
+    bo = bo_allocate(bom, size, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+#endif /* RADEON_DEBUG_BO */
+    if (bo == NULL)
+	return NULL;
+    bo->static_bo = 1;
+    bo->offset = offset + bom->fb_location;
+    bo->base.handle = bo->offset;
+    bo->ptr = bom->screen->driScreen->pFB + offset;
+    if (bo->base.handle > bom->nhandle) {
+        bom->nhandle = bo->base.handle + 1;
+    }
+    radeon_bo_ref(&(bo->base));
+    return bo;
+}
+
+struct radeon_bo_manager *radeon_bo_manager_legacy_ctor(struct radeon_screen *scrn)
+{
+    struct bo_manager_legacy *bom;
+    struct bo_legacy *bo;
+    unsigned size;
+
+    bom = (struct bo_manager_legacy*)
+          calloc(1, sizeof(struct bo_manager_legacy));
+    if (bom == NULL) {
+        return NULL;
+    }
+
+    make_empty_list(&bom->texture_swapped);
+
+    bom->texture_heap = driCreateTextureHeap(0,
+                                             bom,
+                                             scrn->texSize[0],
+                                             12,
+                                             RADEON_NR_TEX_REGIONS,
+                                             (drmTextureRegionPtr)scrn->sarea->tex_list[0],
+                                             &scrn->sarea->tex_age[0],
+                                             &bom->texture_swapped,
+                                             sizeof(struct bo_legacy_texture_object),
+                                             &bo_legacy_tobj_destroy);
+    bom->texture_offset = scrn->texOffset[0];
+
+    bom->base.funcs = &bo_legacy_funcs;
+    bom->base.fd = scrn->driScreen->fd;
+    bom->bos.next = NULL;
+    bom->bos.prev = NULL;
+    bom->pending_bos.pprev = &bom->pending_bos;
+    bom->pending_bos.pnext = NULL;
+    bom->screen = scrn;
+    bom->fb_location = scrn->fbLocation;
+    bom->nhandle = 1;
+    bom->cfree_handles = 0;
+    bom->nfree_handles = 0x400;
+    bom->free_handles = (uint32_t*)malloc(bom->nfree_handles * 4);
+    if (bom->free_handles == NULL) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)bom);
+        return NULL;
+    }
+
+    /* biggest framebuffer size */
+    size = 4096*4096*4; 
+
+    /* allocate front */
+#ifdef RADEON_DEBUG_BO
+    bo = radeon_legacy_bo_alloc_static(bom, size, bom->screen->frontOffset, "FRONT BUF");
+#else
+    bo = radeon_legacy_bo_alloc_static(bom, size, bom->screen->frontOffset);
+#endif /* RADEON_DEBUG_BO */
+    if (!bo) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)bom);
+        return NULL;
+    }
+    if (scrn->sarea->tiling_enabled) {
+        bo->base.flags = RADEON_BO_FLAGS_MACRO_TILE;
+    }
+
+    /* allocate back */
+#ifdef RADEON_DEBUG_BO
+    bo = radeon_legacy_bo_alloc_static(bom, size, bom->screen->backOffset, "BACK BUF");
+#else
+    bo = radeon_legacy_bo_alloc_static(bom, size, bom->screen->backOffset);
+#endif /* RADEON_DEBUG_BO */
+    if (!bo) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)bom);
+        return NULL;
+    }
+    if (scrn->sarea->tiling_enabled) {
+        bo->base.flags = RADEON_BO_FLAGS_MACRO_TILE;
+    }
+
+    /* allocate depth */
+#ifdef RADEON_DEBUG_BO
+    bo = radeon_legacy_bo_alloc_static(bom, size, bom->screen->depthOffset, "Z BUF");
+#else
+    bo = radeon_legacy_bo_alloc_static(bom, size, bom->screen->depthOffset);
+#endif /* RADEON_DEBUG_BO */
+    if (!bo) {
+        radeon_bo_manager_legacy_dtor((struct radeon_bo_manager*)bom);
+        return NULL;
+    }
+    bo->base.flags = 0;
+    if (scrn->sarea->tiling_enabled) {
+        bo->base.flags |= RADEON_BO_FLAGS_MACRO_TILE;
+        bo->base.flags |= RADEON_BO_FLAGS_MICRO_TILE;
+    }
+    return (struct radeon_bo_manager*)bom;
+}
+
+void radeon_bo_legacy_texture_age(struct radeon_bo_manager *bom)
+{
+    struct bo_manager_legacy *boml = (struct bo_manager_legacy *)bom;
+    DRI_AGE_TEXTURES(boml->texture_heap);
+}
+
+unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo)
+{
+    struct bo_legacy *bo_legacy = (struct bo_legacy*)bo;
+
+    if (bo_legacy->static_bo || (bo->domains & RADEON_GEM_DOMAIN_GTT)) {
+        return 0;
+    }
+    return bo->size;
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
new file mode 100644
index 00000000000..0db817cab07
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bo_legacy.h
@@ -0,0 +1,46 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <[email protected]>
+ *      Nicolai Haehnle <[email protected]>
+ *      Jérôme Glisse <[email protected]>
+ */
+#ifndef RADEON_BO_LEGACY_H
+#define RADEON_BO_LEGACY_H
+
+#include "radeon_screen.h"
+
+void radeon_bo_legacy_pending(struct radeon_bo *bo, uint32_t pending);
+int radeon_bo_legacy_validate(struct radeon_bo *bo,
+                              uint32_t *soffset,
+                              uint32_t *eoffset);
+struct radeon_bo_manager *radeon_bo_manager_legacy_ctor(struct radeon_screen *scrn);
+void radeon_bo_manager_legacy_dtor(struct radeon_bo_manager *bom);
+void radeon_bo_legacy_texture_age(struct radeon_bo_manager *bom);
+unsigned radeon_bo_legacy_relocs_size(struct radeon_bo *bo);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
new file mode 100644
index 00000000000..e0c70dd9a11
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_bocs_wrapper.h
@@ -0,0 +1,85 @@
+#ifndef RADEON_CS_WRAPPER_H
+#define RADEON_CS_WRAPPER_H
+
+#ifdef HAVE_LIBDRM_RADEON
+
+#include "radeon_bo.h"
+#include "radeon_bo_gem.h"
+#include "radeon_cs.h"
+#include "radeon_cs_gem.h"
+
+#else
+#include <stdint.h>
+
+#define RADEON_GEM_DOMAIN_CPU 0x1   // Cached CPU domain
+#define RADEON_GEM_DOMAIN_GTT 0x2   // GTT or cache flushed
+#define RADEON_GEM_DOMAIN_VRAM 0x4  // VRAM domain
+
+/* to be used to build locally in mesa with no libdrm bits */
+#include "../radeon/radeon_bo_drm.h"
+#include "../radeon/radeon_cs_drm.h"
+
+#ifndef DRM_RADEON_GEM_INFO
+#define DRM_RADEON_GEM_INFO 0x1c
+
+struct drm_radeon_gem_info {
+        uint64_t gart_size;
+        uint64_t vram_size;
+        uint64_t vram_visible;
+};
+
+struct drm_radeon_info {
+	uint32_t request;
+	uint32_t pad;
+	uint32_t value;
+};
+#endif
+
+#ifndef RADEON_PARAM_DEVICE_ID
+#define RADEON_PARAM_DEVICE_ID 16
+#endif
+
+#ifndef RADEON_INFO_DEVICE_ID
+#define RADEON_INFO_DEVICE_ID 0
+#endif
+#ifndef RADEON_INFO_NUM_GB_PIPES
+#define RADEON_INFO_NUM_GB_PIPES 0
+#endif
+
+#ifndef DRM_RADEON_INFO
+#define DRM_RADEON_INFO 0x1
+#endif
+
+
+static inline uint32_t radeon_gem_name_bo(struct radeon_bo *dummy)
+{
+  return 0;
+}
+
+static inline void *radeon_bo_manager_gem_ctor(int fd)
+{
+  return NULL;
+}
+
+static inline void radeon_bo_manager_gem_dtor(void *dummy)
+{
+}
+
+static inline void *radeon_cs_manager_gem_ctor(int fd)
+{
+  return NULL;
+}
+
+static inline void radeon_cs_manager_gem_dtor(void *dummy)
+{
+}
+
+static inline void radeon_tracker_print(void *ptr, int io)
+{
+}
+#endif
+
+#include "radeon_bo_legacy.h"
+#include "radeon_cs_legacy.h"
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_chipset.h b/src/mesa/drivers/dri/radeon/radeon_chipset.h
index f6bd1eb83fb..0a6a2df35b8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_chipset.h
+++ b/src/mesa/drivers/dri/radeon/radeon_chipset.h
@@ -255,6 +255,131 @@
 #define PCI_CHIP_RS740_796E             0x796E
 #define PCI_CHIP_RS740_796F             0x796F
 
+#define PCI_CHIP_R600_9400              0x9400
+#define PCI_CHIP_R600_9401              0x9401
+#define PCI_CHIP_R600_9402              0x9402
+#define PCI_CHIP_R600_9403              0x9403
+#define PCI_CHIP_R600_9405              0x9405
+#define PCI_CHIP_R600_940A              0x940A
+#define PCI_CHIP_R600_940B              0x940B
+#define PCI_CHIP_R600_940F              0x940F
+
+#define PCI_CHIP_RV610_94C0             0x94C0
+#define PCI_CHIP_RV610_94C1             0x94C1
+#define PCI_CHIP_RV610_94C3             0x94C3
+#define PCI_CHIP_RV610_94C4             0x94C4
+#define PCI_CHIP_RV610_94C5             0x94C5
+#define PCI_CHIP_RV610_94C6             0x94C6
+#define PCI_CHIP_RV610_94C7             0x94C7
+#define PCI_CHIP_RV610_94C8             0x94C8
+#define PCI_CHIP_RV610_94C9             0x94C9
+#define PCI_CHIP_RV610_94CB             0x94CB
+#define PCI_CHIP_RV610_94CC             0x94CC
+#define PCI_CHIP_RV610_94CD             0x94CD
+
+#define PCI_CHIP_RV630_9580             0x9580
+#define PCI_CHIP_RV630_9581             0x9581
+#define PCI_CHIP_RV630_9583             0x9583
+#define PCI_CHIP_RV630_9586             0x9586
+#define PCI_CHIP_RV630_9587             0x9587
+#define PCI_CHIP_RV630_9588             0x9588
+#define PCI_CHIP_RV630_9589             0x9589
+#define PCI_CHIP_RV630_958A             0x958A
+#define PCI_CHIP_RV630_958B             0x958B
+#define PCI_CHIP_RV630_958C             0x958C
+#define PCI_CHIP_RV630_958D             0x958D
+#define PCI_CHIP_RV630_958E             0x958E
+#define PCI_CHIP_RV630_958F             0x958F
+
+#define PCI_CHIP_RV670_9500             0x9500
+#define PCI_CHIP_RV670_9501             0x9501
+#define PCI_CHIP_RV670_9504             0x9504
+#define PCI_CHIP_RV670_9505             0x9505
+#define PCI_CHIP_RV670_9506             0x9506
+#define PCI_CHIP_RV670_9507             0x9507
+#define PCI_CHIP_RV670_9508             0x9508
+#define PCI_CHIP_RV670_9509             0x9509
+#define PCI_CHIP_RV670_950F             0x950F
+#define PCI_CHIP_RV670_9511             0x9511
+#define PCI_CHIP_RV670_9515             0x9515
+#define PCI_CHIP_RV670_9517             0x9517
+#define PCI_CHIP_RV670_9519             0x9519
+
+#define PCI_CHIP_RV620_95C0             0x95C0
+#define PCI_CHIP_RV620_95C2             0x95C2
+#define PCI_CHIP_RV620_95C4             0x95C4
+#define PCI_CHIP_RV620_95C5             0x95C5
+#define PCI_CHIP_RV620_95C6             0x95C6
+#define PCI_CHIP_RV620_95C7             0x95C7
+#define PCI_CHIP_RV620_95C9             0x95C9
+#define PCI_CHIP_RV620_95CC             0x95CC
+#define PCI_CHIP_RV620_95CD             0x95CD
+#define PCI_CHIP_RV620_95CE             0x95CE
+#define PCI_CHIP_RV620_95CF             0x95CF
+
+#define PCI_CHIP_RV635_9590             0x9590
+#define PCI_CHIP_RV635_9591             0x9591
+#define PCI_CHIP_RV635_9593             0x9593
+#define PCI_CHIP_RV635_9595             0x9595
+#define PCI_CHIP_RV635_9596             0x9596
+#define PCI_CHIP_RV635_9597             0x9597
+#define PCI_CHIP_RV635_9598             0x9598
+#define PCI_CHIP_RV635_9599             0x9599
+#define PCI_CHIP_RV635_959B             0x959B
+
+#define PCI_CHIP_RS780_9610             0x9610
+#define PCI_CHIP_RS780_9611             0x9611
+#define PCI_CHIP_RS780_9612             0x9612
+#define PCI_CHIP_RS780_9613             0x9613
+#define PCI_CHIP_RS780_9614             0x9614
+#define PCI_CHIP_RS780_9615             0x9615
+#define PCI_CHIP_RS780_9616             0x9616
+
+#define PCI_CHIP_RV770_9440             0x9440
+#define PCI_CHIP_RV770_9441             0x9441
+#define PCI_CHIP_RV770_9442             0x9442
+#define PCI_CHIP_RV770_9444             0x9444
+#define PCI_CHIP_RV770_9446             0x9446
+#define PCI_CHIP_RV770_944A             0x944A
+#define PCI_CHIP_RV770_944B             0x944B
+#define PCI_CHIP_RV770_944C             0x944C
+#define PCI_CHIP_RV770_944E             0x944E
+#define PCI_CHIP_RV770_9450             0x9450
+#define PCI_CHIP_RV770_9452             0x9452
+#define PCI_CHIP_RV770_9456             0x9456
+#define PCI_CHIP_RV770_945A             0x945A
+#define PCI_CHIP_RV770_945B             0x945B
+#define PCI_CHIP_RV790_9460             0x9460
+#define PCI_CHIP_RV790_9462             0x9462
+#define PCI_CHIP_RV770_946A             0x946A
+#define PCI_CHIP_RV770_946B             0x946B
+#define PCI_CHIP_RV770_947A             0x947A
+#define PCI_CHIP_RV770_947B             0x947B
+
+#define PCI_CHIP_RV730_9487             0x9487
+#define PCI_CHIP_RV730_9489             0x9489
+#define PCI_CHIP_RV730_948F             0x948F
+#define PCI_CHIP_RV730_9490             0x9490
+#define PCI_CHIP_RV730_9491             0x9491
+#define PCI_CHIP_RV730_9498             0x9498
+#define PCI_CHIP_RV730_949C             0x949C
+#define PCI_CHIP_RV730_949E             0x949E
+#define PCI_CHIP_RV730_949F             0x949F
+
+#define PCI_CHIP_RV710_9540             0x9540
+#define PCI_CHIP_RV710_9541             0x9541
+#define PCI_CHIP_RV710_9542             0x9542
+#define PCI_CHIP_RV710_954E             0x954E
+#define PCI_CHIP_RV710_954F             0x954F
+#define PCI_CHIP_RV710_9552             0x9552
+#define PCI_CHIP_RV710_9553             0x9553
+#define PCI_CHIP_RV710_9555             0x9555
+
+#define PCI_CHIP_RV740_94A0             0x94A0
+#define PCI_CHIP_RV740_94A1             0x94A1
+#define PCI_CHIP_RV740_94B1             0x94B1
+#define PCI_CHIP_RV740_94B3             0x94B3
+#define PCI_CHIP_RV740_94B5             0x94B5
 
 enum {
    CHIP_FAMILY_R100,
@@ -282,6 +407,17 @@ enum {
    CHIP_FAMILY_R580,
    CHIP_FAMILY_RV560,
    CHIP_FAMILY_RV570,
+   CHIP_FAMILY_R600,
+   CHIP_FAMILY_RV610,
+   CHIP_FAMILY_RV630,
+   CHIP_FAMILY_RV670,
+   CHIP_FAMILY_RV620,
+   CHIP_FAMILY_RV635,
+   CHIP_FAMILY_RS780,
+   CHIP_FAMILY_RV770,
+   CHIP_FAMILY_RV730,
+   CHIP_FAMILY_RV710,
+   CHIP_FAMILY_RV740,
    CHIP_FAMILY_LAST
 };
 
@@ -289,6 +425,7 @@ enum {
 #define RADEON_CLASS_R100		(0 << 0)
 #define RADEON_CLASS_R200		(1 << 0)
 #define RADEON_CLASS_R300		(2 << 0)
+#define RADEON_CLASS_R600		(3 << 0)
 #define RADEON_CLASS_MASK		(3 << 0)
 
 #define RADEON_CHIPSET_TCL		(1 << 2)	/* tcl support - any radeon */
diff --git a/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h b/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
new file mode 100644
index 00000000000..abb023c7def
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cmdbuf.h
@@ -0,0 +1,123 @@
+#ifndef COMMON_CMDBUF_H
+#define COMMON_CMDBUF_H
+
+#include "radeon_bocs_wrapper.h"
+
+void rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller);
+int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller);
+int rcommonFlushCmdBufLocked(radeonContextPtr rmesa, const char *caller);
+void rcommonInitCmdBuf(radeonContextPtr rmesa);
+void rcommonDestroyCmdBuf(radeonContextPtr rmesa);
+
+void rcommonBeginBatch(radeonContextPtr rmesa,
+		       int n,
+		       int dostate,
+		       const char *file,
+		       const char *function,
+		       int line);
+
+/* +r6/r7 : code here moved */
+
+#define CP_PACKET2  (2 << 30)
+#define CP_PACKET0(reg, n)	(RADEON_CP_PACKET0 | ((n)<<16) | ((reg)>>2))
+#define CP_PACKET0_ONE(reg, n)	(RADEON_CP_PACKET0 | RADEON_CP_PACKET0_ONE_REG_WR | ((n)<<16) | ((reg)>>2))
+#define CP_PACKET3(pkt, n)	(RADEON_CP_PACKET3 | (pkt) | ((n) << 16))
+
+/**
+ * Every function writing to the command buffer needs to declare this
+ * to get the necessary local variables.
+ */
+#define BATCH_LOCALS(rmesa) \
+	const radeonContextPtr b_l_rmesa = rmesa
+
+/**
+ * Prepare writing n dwords to the command buffer,
+ * including producing any necessary state emits on buffer wraparound.
+ */
+#define BEGIN_BATCH(n) rcommonBeginBatch(b_l_rmesa, n, 1, __FILE__, __FUNCTION__, __LINE__)
+
+/**
+ * Same as BEGIN_BATCH, but do not cause automatic state emits.
+ */
+#define BEGIN_BATCH_NO_AUTOSTATE(n) rcommonBeginBatch(b_l_rmesa, n, 0, __FILE__, __FUNCTION__, __LINE__)
+
+/**
+ * Write one dword to the command buffer.
+ */
+#define OUT_BATCH(data) \
+	do { \
+        radeon_cs_write_dword(b_l_rmesa->cmdbuf.cs, data);\
+	} while(0)
+
+/**
+ * Write a relocated dword to the command buffer.
+ */
+#define OUT_BATCH_RELOC(data, bo, offset, rd, wd, flags) 	\
+	do { 							\
+        if (0 && offset) {					\
+            fprintf(stderr, "(%s:%s:%d) offset : %d\n",		\
+            __FILE__, __FUNCTION__, __LINE__, offset);		\
+        }							\
+        radeon_cs_write_dword(b_l_rmesa->cmdbuf.cs, offset);	\
+        radeon_cs_write_reloc(b_l_rmesa->cmdbuf.cs, 		\
+                              bo, rd, wd, flags);		\
+	if (!b_l_rmesa->radeonScreen->kernel_mm) 		\
+		b_l_rmesa->cmdbuf.cs->section_cdw += 2;		\
+	} while(0)
+
+
+/**
+ * Write n dwords from ptr to the command buffer.
+ */
+#define OUT_BATCH_TABLE(ptr,n) \
+	do { \
+		int _i; \
+        for (_i=0; _i < n; _i++) {\
+            radeon_cs_write_dword(b_l_rmesa->cmdbuf.cs, ptr[_i]);\
+        }\
+	} while(0)
+
+/**
+ * Finish writing dwords to the command buffer.
+ * The number of (direct or indirect) OUT_BATCH calls between the previous
+ * BEGIN_BATCH and END_BATCH must match the number specified at BEGIN_BATCH time.
+ */
+#define END_BATCH() \
+	do { \
+        radeon_cs_end(b_l_rmesa->cmdbuf.cs, __FILE__, __FUNCTION__, __LINE__);\
+	} while(0)
+
+/**
+ * After the last END_BATCH() of rendering, this indicates that flushing
+ * the command buffer now is okay.
+ */
+#define COMMIT_BATCH() \
+	do { \
+	} while(0)
+
+
+/** Single register write to command buffer; requires 2 dwords. */
+#define OUT_BATCH_REGVAL(reg, val) \
+	OUT_BATCH(cmdpacket0(b_l_rmesa->radeonScreen, (reg), 1)); \
+	OUT_BATCH((val))
+
+/** Continuous register range write to command buffer; requires 1 dword,
+ * expects count dwords afterwards for register contents. */
+#define OUT_BATCH_REGSEQ(reg, count) \
+	OUT_BATCH(cmdpacket0(b_l_rmesa->radeonScreen, (reg), (count)))
+
+/** Write a 32 bit float to the ring; requires 1 dword. */
+#define OUT_BATCH_FLOAT32(f) \
+	OUT_BATCH(radeonPackFloat32((f)))
+
+/* +r6/r7 : code here moved */
+
+/* Fire the buffered vertices no matter what.
+ */
+static INLINE void radeon_firevertices(radeonContextPtr radeon)
+{
+   if (radeon->cmdbuf.cs->cdw || radeon->dma.flush )
+      radeon->glCtx->Driver.Flush(radeon->glCtx); /* +r6/r7 */
+}
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.c b/src/mesa/drivers/dri/radeon/radeon_common.c
new file mode 100644
index 00000000000..dde615a4d9d
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common.c
@@ -0,0 +1,1292 @@
+/**************************************************************************
+
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+/*
+ * Authors:
+ *   Keith Whitwell <[email protected]>
+ */
+
+/*
+   - Scissor implementation
+   - buffer swap/copy ioctls
+   - finish/flush
+   - state emission
+   - cmdbuffer management
+*/
+
+#include <errno.h>
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/context.h"
+#include "main/arrayobj.h"
+#include "main/api_arrayelt.h"
+#include "main/enums.h"
+#include "main/colormac.h"
+#include "main/light.h"
+#include "main/framebuffer.h"
+#include "main/simple_list.h"
+#include "main/renderbuffer.h"
+#include "swrast/swrast.h"
+#include "vbo/vbo.h"
+#include "tnl/tnl.h"
+#include "tnl/t_pipeline.h"
+#include "swrast_setup/swrast_setup.h"
+
+#include "main/blend.h"
+#include "main/bufferobj.h"
+#include "main/buffers.h"
+#include "main/depth.h"
+#include "main/polygon.h"
+#include "main/shaders.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "glapi/dispatch.h"
+#include "swrast/swrast.h"
+#include "main/stencil.h"
+#include "main/matrix.h"
+#include "main/attrib.h"
+#include "main/enable.h"
+#include "main/viewport.h"
+
+#include "dri_util.h"
+#include "vblank.h"
+
+#include "radeon_common.h"
+#include "radeon_bocs_wrapper.h"
+#include "radeon_lock.h"
+#include "radeon_drm.h"
+#include "radeon_mipmap_tree.h"
+
+#define DEBUG_CMDBUF         0
+
+/* =============================================================
+ * Scissoring
+ */
+
+static GLboolean intersect_rect(drm_clip_rect_t * out,
+				drm_clip_rect_t * a, drm_clip_rect_t * b)
+{
+	*out = *a;
+	if (b->x1 > out->x1)
+		out->x1 = b->x1;
+	if (b->y1 > out->y1)
+		out->y1 = b->y1;
+	if (b->x2 < out->x2)
+		out->x2 = b->x2;
+	if (b->y2 < out->y2)
+		out->y2 = b->y2;
+	if (out->x1 >= out->x2)
+		return GL_FALSE;
+	if (out->y1 >= out->y2)
+		return GL_FALSE;
+	return GL_TRUE;
+}
+
+void radeonRecalcScissorRects(radeonContextPtr radeon)
+{
+	drm_clip_rect_t *out;
+	int i;
+
+	/* Grow cliprect store?
+	 */
+	if (radeon->state.scissor.numAllocedClipRects < radeon->numClipRects) {
+		while (radeon->state.scissor.numAllocedClipRects <
+		       radeon->numClipRects) {
+			radeon->state.scissor.numAllocedClipRects += 1;	/* zero case */
+			radeon->state.scissor.numAllocedClipRects *= 2;
+		}
+
+		if (radeon->state.scissor.pClipRects)
+			FREE(radeon->state.scissor.pClipRects);
+
+		radeon->state.scissor.pClipRects =
+			MALLOC(radeon->state.scissor.numAllocedClipRects *
+			       sizeof(drm_clip_rect_t));
+
+		if (radeon->state.scissor.pClipRects == NULL) {
+			radeon->state.scissor.numAllocedClipRects = 0;
+			return;
+		}
+	}
+
+	out = radeon->state.scissor.pClipRects;
+	radeon->state.scissor.numClipRects = 0;
+
+	for (i = 0; i < radeon->numClipRects; i++) {
+		if (intersect_rect(out,
+				   &radeon->pClipRects[i],
+				   &radeon->state.scissor.rect)) {
+			radeon->state.scissor.numClipRects++;
+			out++;
+		}
+	}
+}
+
+void radeon_get_cliprects(radeonContextPtr radeon,
+			  struct drm_clip_rect **cliprects,
+			  unsigned int *num_cliprects,
+			  int *x_off, int *y_off)
+{
+	__DRIdrawablePrivate *dPriv = radeon_get_drawable(radeon);
+	struct radeon_framebuffer *rfb = dPriv->driverPrivate;
+
+	if (radeon->constant_cliprect) {
+		radeon->fboRect.x1 = 0;
+		radeon->fboRect.y1 = 0;
+		radeon->fboRect.x2 = radeon->glCtx->DrawBuffer->Width;
+		radeon->fboRect.y2 = radeon->glCtx->DrawBuffer->Height;
+
+		*cliprects = &radeon->fboRect;
+		*num_cliprects = 1;
+		*x_off = 0;
+		*y_off = 0;
+	} else if (radeon->front_cliprects ||
+		   rfb->pf_active || dPriv->numBackClipRects == 0) {
+		*cliprects = dPriv->pClipRects;
+		*num_cliprects = dPriv->numClipRects;
+		*x_off = dPriv->x;
+		*y_off = dPriv->y;
+	} else {
+		*num_cliprects = dPriv->numBackClipRects;
+		*cliprects = dPriv->pBackClipRects;
+		*x_off = dPriv->backX;
+		*y_off = dPriv->backY;
+	}
+}
+
+/**
+ * Update cliprects and scissors.
+ */
+void radeonSetCliprects(radeonContextPtr radeon)
+{
+	__DRIdrawablePrivate *const drawable = radeon_get_drawable(radeon);
+	__DRIdrawablePrivate *const readable = radeon_get_readable(radeon);
+	struct radeon_framebuffer *const draw_rfb = drawable->driverPrivate;
+	struct radeon_framebuffer *const read_rfb = readable->driverPrivate;
+	int x_off, y_off;
+
+	radeon_get_cliprects(radeon, &radeon->pClipRects,
+			     &radeon->numClipRects, &x_off, &y_off);
+
+	if ((draw_rfb->base.Width != drawable->w) ||
+	    (draw_rfb->base.Height != drawable->h)) {
+		_mesa_resize_framebuffer(radeon->glCtx, &draw_rfb->base,
+					 drawable->w, drawable->h);
+		draw_rfb->base.Initialized = GL_TRUE;
+	}
+
+	if (drawable != readable) {
+		if ((read_rfb->base.Width != readable->w) ||
+		    (read_rfb->base.Height != readable->h)) {
+			_mesa_resize_framebuffer(radeon->glCtx, &read_rfb->base,
+						 readable->w, readable->h);
+			read_rfb->base.Initialized = GL_TRUE;
+		}
+	}
+
+	if (radeon->state.scissor.enabled)
+		radeonRecalcScissorRects(radeon);
+
+}
+
+
+
+void radeonUpdateScissor( GLcontext *ctx )
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+
+	if ( !ctx->DrawBuffer->Name ) {
+		__DRIdrawablePrivate *dPriv = radeon_get_drawable(rmesa);
+
+		int x = ctx->Scissor.X;
+		int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
+		int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
+		int h = dPriv->h - ctx->Scissor.Y - 1;
+
+		rmesa->state.scissor.rect.x1 = x + dPriv->x;
+		rmesa->state.scissor.rect.y1 = y + dPriv->y;
+		rmesa->state.scissor.rect.x2 = w + dPriv->x + 1;
+		rmesa->state.scissor.rect.y2 = h + dPriv->y + 1;
+	} else {
+		rmesa->state.scissor.rect.x1 = ctx->Scissor.X;
+		rmesa->state.scissor.rect.y1 = ctx->Scissor.Y;
+		rmesa->state.scissor.rect.x2 = ctx->Scissor.X + ctx->Scissor.Width;
+		rmesa->state.scissor.rect.y2 = ctx->Scissor.Y + ctx->Scissor.Height;
+	}
+
+	radeonRecalcScissorRects( rmesa );
+}
+
+/* =============================================================
+ * Scissoring
+ */
+
+void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	if (ctx->Scissor.Enabled) {
+		/* We don't pipeline cliprect changes */
+		radeon_firevertices(radeon);
+		radeonUpdateScissor(ctx);
+	}
+}
+
+
+/* ================================================================
+ * SwapBuffers with client-side throttling
+ */
+
+static uint32_t radeonGetLastFrame(radeonContextPtr radeon)
+{
+	drm_radeon_getparam_t gp;
+	int ret;
+	uint32_t frame = 0;
+
+	gp.param = RADEON_PARAM_LAST_FRAME;
+	gp.value = (int *)&frame;
+	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &gp, sizeof(gp));
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+
+	return frame;
+}
+
+uint32_t radeonGetAge(radeonContextPtr radeon)
+{
+	drm_radeon_getparam_t gp;
+	int ret;
+	uint32_t age;
+
+	gp.param = RADEON_PARAM_LAST_CLEAR;
+	gp.value = (int *)&age;
+	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_GETPARAM,
+				  &gp, sizeof(gp));
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonGetParam: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+
+	return age;
+}
+
+static void radeonEmitIrqLocked(radeonContextPtr radeon)
+{
+	drm_radeon_irq_emit_t ie;
+	int ret;
+
+	ie.irq_seq = &radeon->iw.irq_seq;
+	ret = drmCommandWriteRead(radeon->dri.fd, DRM_RADEON_IRQ_EMIT,
+				  &ie, sizeof(ie));
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonIrqEmit: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+}
+
+static void radeonWaitIrq(radeonContextPtr radeon)
+{
+	int ret;
+
+	do {
+		ret = drmCommandWrite(radeon->dri.fd, DRM_RADEON_IRQ_WAIT,
+				      &radeon->iw, sizeof(radeon->iw));
+	} while (ret && (errno == EINTR || errno == EBUSY));
+
+	if (ret) {
+		fprintf(stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__,
+			ret);
+		exit(1);
+	}
+}
+
+static void radeonWaitForFrameCompletion(radeonContextPtr radeon)
+{
+	drm_radeon_sarea_t *sarea = radeon->sarea;
+
+	if (radeon->do_irqs) {
+		if (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			if (!radeon->irqsEmitted) {
+				while (radeonGetLastFrame(radeon) <
+				       sarea->last_frame) ;
+			} else {
+				UNLOCK_HARDWARE(radeon);
+				radeonWaitIrq(radeon);
+				LOCK_HARDWARE(radeon);
+			}
+			radeon->irqsEmitted = 10;
+		}
+
+		if (radeon->irqsEmitted) {
+			radeonEmitIrqLocked(radeon);
+			radeon->irqsEmitted--;
+		}
+	} else {
+		while (radeonGetLastFrame(radeon) < sarea->last_frame) {
+			UNLOCK_HARDWARE(radeon);
+			if (radeon->do_usleeps)
+				DO_USLEEP(1);
+			LOCK_HARDWARE(radeon);
+		}
+	}
+}
+
+/* wait for idle */
+void radeonWaitForIdleLocked(radeonContextPtr radeon)
+{
+	int ret;
+	int i = 0;
+
+	do {
+		ret = drmCommandNone(radeon->dri.fd, DRM_RADEON_CP_IDLE);
+		if (ret)
+			DO_USLEEP(1);
+	} while (ret && ++i < 100);
+
+	if (ret < 0) {
+		UNLOCK_HARDWARE(radeon);
+		fprintf(stderr, "Error: R300 timed out... exiting\n");
+		exit(-1);
+	}
+}
+
+static void radeonWaitForIdle(radeonContextPtr radeon)
+{
+	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+        LOCK_HARDWARE(radeon);
+	    radeonWaitForIdleLocked(radeon);
+	    UNLOCK_HARDWARE(radeon);
+    }
+}
+
+static void radeon_flip_renderbuffers(struct radeon_framebuffer *rfb)
+{
+	int current_page = rfb->pf_current_page;
+	int next_page = (current_page + 1) % rfb->pf_num_pages;
+	struct gl_renderbuffer *tmp_rb;
+
+	/* Exchange renderbuffers if necessary but make sure their
+	 * reference counts are preserved.
+	 */
+	if (rfb->color_rb[current_page] &&
+	    rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer !=
+	    &rfb->color_rb[current_page]->base) {
+		tmp_rb = NULL;
+		_mesa_reference_renderbuffer(&tmp_rb,
+					     rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer);
+		tmp_rb = &rfb->color_rb[current_page]->base;
+		_mesa_reference_renderbuffer(&rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer, tmp_rb);
+		_mesa_reference_renderbuffer(&tmp_rb, NULL);
+	}
+
+	if (rfb->color_rb[next_page] &&
+	    rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer !=
+	    &rfb->color_rb[next_page]->base) {
+		tmp_rb = NULL;
+		_mesa_reference_renderbuffer(&tmp_rb,
+					     rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer);
+		tmp_rb = &rfb->color_rb[next_page]->base;
+		_mesa_reference_renderbuffer(&rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer, tmp_rb);
+		_mesa_reference_renderbuffer(&tmp_rb, NULL);
+	}
+}
+
+/* Copy the back color buffer to the front color buffer.
+ */
+void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
+		       const drm_clip_rect_t	  *rect)
+{
+	radeonContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	GLint nbox, i, ret;
+
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+
+	LOCK_HARDWARE(rmesa);
+
+	rfb = dPriv->driverPrivate;
+
+	if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+		fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *) rmesa->glCtx );
+	}
+
+	nbox = dPriv->numClipRects; /* must be in locked region */
+
+	for ( i = 0 ; i < nbox ; ) {
+		GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
+		drm_clip_rect_t *box = dPriv->pClipRects;
+		drm_clip_rect_t *b = rmesa->sarea->boxes;
+		GLint n = 0;
+
+		for ( ; i < nr ; i++ ) {
+
+			*b = box[i];
+
+			if (rect)
+			{
+				if (rect->x1 > b->x1)
+					b->x1 = rect->x1;
+				if (rect->y1 > b->y1)
+					b->y1 = rect->y1;
+				if (rect->x2 < b->x2)
+					b->x2 = rect->x2;
+				if (rect->y2 < b->y2)
+					b->y2 = rect->y2;
+
+				if (b->x1 >= b->x2 || b->y1 >= b->y2)
+					continue;
+			}
+
+			b++;
+			n++;
+		}
+		rmesa->sarea->nbox = n;
+
+		if (!n)
+			continue;
+
+		if (IS_R600_CLASS(rmesa->radeonScreen)) {
+			int cpp = rmesa->radeonScreen->cpp;
+			int src_pitch = rmesa->radeonScreen->backPitch * cpp;
+			int dst_pitch = rmesa->radeonScreen->frontPitch * cpp;
+			char *src = (char *)rmesa->radeonScreen->driScreen->pFB + rmesa->radeonScreen->backOffset;
+			char *dst = (char *)rmesa->radeonScreen->driScreen->pFB + rmesa->radeonScreen->frontOffset;
+			int j;
+			drm_clip_rect_t *pb = rmesa->sarea->boxes;
+
+			for (j = 0; j < n; j++) {
+				int x = pb[j].x1;
+				int y = pb[j].y1;
+				int w = pb[j].x2 - x;
+				int h = pb[j].y2 - y;
+
+				src += (y * src_pitch) + (x * cpp);
+				dst += (y * dst_pitch) + (x * cpp);
+
+				while (h--) {
+					memcpy(dst, src, w * cpp);
+					src += src_pitch;
+					dst += dst_pitch;
+				}
+			}
+		}
+
+		ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
+
+		if ( ret ) {
+			fprintf( stderr, "DRM_RADEON_SWAP_BUFFERS: return = %d\n", ret );
+			UNLOCK_HARDWARE( rmesa );
+			exit( 1 );
+		}
+	}
+
+	UNLOCK_HARDWARE( rmesa );
+}
+
+static int radeonScheduleSwap(__DRIdrawablePrivate *dPriv, GLboolean *missed_target)
+{
+	radeonContextPtr rmesa;
+
+	rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	radeon_firevertices(rmesa);
+
+	LOCK_HARDWARE( rmesa );
+
+	if (!dPriv->numClipRects) {
+		UNLOCK_HARDWARE(rmesa);
+		usleep(10000);	/* throttle invisible client 10ms */
+		return 0;
+	}
+
+	radeonWaitForFrameCompletion(rmesa);
+
+	UNLOCK_HARDWARE(rmesa);
+	driWaitForVBlank(dPriv, missed_target);
+
+	return 0;
+}
+
+static GLboolean radeonPageFlip( __DRIdrawablePrivate *dPriv )
+{
+	radeonContextPtr radeon;
+	GLint ret;
+	__DRIscreenPrivate *psp;
+	struct radeon_renderbuffer *rrb;
+	struct radeon_framebuffer *rfb;
+
+	assert(dPriv);
+	assert(dPriv->driContextPriv);
+	assert(dPriv->driContextPriv->driverPrivate);
+
+	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+	rfb = dPriv->driverPrivate;
+	rrb = (void *)rfb->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
+
+	psp = dPriv->driScreenPriv;
+
+	LOCK_HARDWARE(radeon);
+
+	if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+		fprintf(stderr, "%s: pfCurrentPage: %d %d\n", __FUNCTION__,
+			radeon->sarea->pfCurrentPage, radeon->sarea->pfState);
+	}
+	drm_clip_rect_t *box = dPriv->pClipRects;
+	drm_clip_rect_t *b = radeon->sarea->boxes;
+	b[0] = box[0];
+	radeon->sarea->nbox = 1;
+
+	ret = drmCommandNone( radeon->dri.fd, DRM_RADEON_FLIP );
+
+	UNLOCK_HARDWARE(radeon);
+
+	if ( ret ) {
+		fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
+		return GL_FALSE;
+	}
+
+	if (!rfb->pf_active)
+		return GL_FALSE;
+
+	rfb->pf_current_page = radeon->sarea->pfCurrentPage;
+	radeon_flip_renderbuffers(rfb);
+	radeon_draw_buffer(radeon->glCtx, &rfb->base);
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Swap front and back buffer.
+ */
+void radeonSwapBuffers(__DRIdrawablePrivate * dPriv)
+{
+	int64_t ust;
+	__DRIscreenPrivate *psp;
+
+	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+		radeonContextPtr radeon;
+		GLcontext *ctx;
+
+		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+		ctx = radeon->glCtx;
+
+		if (ctx->Visual.doubleBufferMode) {
+			GLboolean missed_target;
+			struct radeon_framebuffer *rfb = dPriv->driverPrivate;
+			_mesa_notifySwapBuffers(ctx);/* flush pending rendering comands */
+
+			radeonScheduleSwap(dPriv, &missed_target);
+
+			if (rfb->pf_active) {
+				radeonPageFlip(dPriv);
+			} else {
+				radeonCopyBuffer(dPriv, NULL);
+			}
+
+			psp = dPriv->driScreenPriv;
+
+			rfb->swap_count++;
+			(*psp->systemTime->getUST)( & ust );
+			if ( missed_target ) {
+				rfb->swap_missed_count++;
+				rfb->swap_missed_ust = ust - rfb->swap_ust;
+			}
+
+			rfb->swap_ust = ust;
+			radeon->hw.all_dirty = GL_TRUE;
+		}
+	} else {
+		/* XXX this shouldn't be an error but we can't handle it for now */
+		_mesa_problem(NULL, "%s: drawable has no context!",
+			      __FUNCTION__);
+	}
+}
+
+void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+			 int x, int y, int w, int h )
+{
+	if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
+		radeonContextPtr radeon;
+		GLcontext *ctx;
+
+		radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
+		ctx = radeon->glCtx;
+
+		if (ctx->Visual.doubleBufferMode) {
+			drm_clip_rect_t rect;
+			rect.x1 = x + dPriv->x;
+			rect.y1 = (dPriv->h - y - h) + dPriv->y;
+			rect.x2 = rect.x1 + w;
+			rect.y2 = rect.y1 + h;
+			_mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
+			radeonCopyBuffer(dPriv, &rect);
+		}
+	} else {
+		/* XXX this shouldn't be an error but we can't handle it for now */
+		_mesa_problem(NULL, "%s: drawable has no context!",
+			      __FUNCTION__);
+	}
+}
+
+void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	struct radeon_renderbuffer *rrbDepth = NULL, *rrbStencil = NULL,
+		*rrbColor = NULL;
+	uint32_t offset = 0;
+
+
+	if (!fb) {
+		/* this can happen during the initial context initialization */
+		return;
+	}
+
+	/* radeons only handle 1 color draw so far */
+	if (fb->_NumColorDrawBuffers != 1) {
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE);
+		return;
+	}
+
+	/* Do this here, note core Mesa, since this function is called from
+	 * many places within the driver.
+	 */
+	if (ctx->NewState & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
+		/* this updates the DrawBuffer->_NumColorDrawBuffers fields, etc */
+		_mesa_update_framebuffer(ctx);
+		/* this updates the DrawBuffer's Width/Height if it's a FBO */
+		_mesa_update_draw_buffer_bounds(ctx);
+	}
+
+	if (fb->_Status != GL_FRAMEBUFFER_COMPLETE_EXT) {
+		/* this may occur when we're called by glBindFrameBuffer() during
+		 * the process of someone setting up renderbuffers, etc.
+		 */
+		/*_mesa_debug(ctx, "DrawBuffer: incomplete user FBO\n");*/
+		return;
+	}
+
+	if (fb->Name)
+		;/* do something depthy/stencily TODO */
+
+
+		/* none */
+	if (fb->Name == 0) {
+		if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
+			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer);
+			radeon->front_cliprects = GL_TRUE;
+			radeon->front_buffer_dirty = GL_TRUE;
+		} else {
+			rrbColor = radeon_renderbuffer(fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer);
+			radeon->front_cliprects = GL_FALSE;
+		}
+	} else {
+		/* user FBO in theory */
+		struct radeon_renderbuffer *rrb;
+		rrb = radeon_renderbuffer(fb->_ColorDrawBuffers[0]);
+		if (rrb) {
+			offset = rrb->draw_offset;
+			rrbColor = rrb;
+		}
+		radeon->constant_cliprect = GL_TRUE;
+	}
+
+	if (rrbColor == NULL)
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE);
+	else
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DRAW_BUFFER, GL_FALSE);
+
+
+	if (fb->_DepthBuffer && fb->_DepthBuffer->Wrapped) {
+		rrbDepth = radeon_renderbuffer(fb->_DepthBuffer->Wrapped);
+		if (rrbDepth && rrbDepth->bo) {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DEPTH_BUFFER, GL_FALSE);
+		} else {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DEPTH_BUFFER, GL_TRUE);
+		}
+	} else {
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_DEPTH_BUFFER, GL_FALSE);
+		rrbDepth = NULL;
+	}
+
+	if (fb->_StencilBuffer && fb->_StencilBuffer->Wrapped) {
+		rrbStencil = radeon_renderbuffer(fb->_StencilBuffer->Wrapped);
+		if (rrbStencil && rrbStencil->bo) {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_STENCIL_BUFFER, GL_FALSE);
+			/* need to re-compute stencil hw state */
+			if (!rrbDepth)
+				rrbDepth = rrbStencil;
+		} else {
+			radeon->vtbl.fallback(ctx, RADEON_FALLBACK_STENCIL_BUFFER, GL_TRUE);
+		}
+	} else {
+		radeon->vtbl.fallback(ctx, RADEON_FALLBACK_STENCIL_BUFFER, GL_FALSE);
+		if (ctx->Driver.Enable != NULL)
+			ctx->Driver.Enable(ctx, GL_STENCIL_TEST, ctx->Stencil.Enabled);
+		else
+			ctx->NewState |= _NEW_STENCIL;
+	}
+
+	/* Update culling direction which changes depending on the
+	 * orientation of the buffer:
+	 */
+	if (ctx->Driver.FrontFace)
+		ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
+	else
+		ctx->NewState |= _NEW_POLYGON;
+
+	/*
+	 * Update depth test state
+	 */
+	if (ctx->Driver.Enable) {
+		ctx->Driver.Enable(ctx, GL_DEPTH_TEST,
+				   (ctx->Depth.Test && fb->Visual.depthBits > 0));
+		/* Need to update the derived ctx->Stencil._Enabled first */
+		ctx->Driver.Enable(ctx, GL_STENCIL_TEST,
+				   (ctx->Stencil.Enabled && fb->Visual.stencilBits > 0));
+	} else {
+		ctx->NewState |= (_NEW_DEPTH | _NEW_STENCIL);
+	}
+
+	_mesa_reference_renderbuffer(&radeon->state.depth.rb, &rrbDepth->base);
+	_mesa_reference_renderbuffer(&radeon->state.color.rb, &rrbColor->base);
+	radeon->state.color.draw_offset = offset;
+
+#if 0
+	/* update viewport since it depends on window size */
+	if (ctx->Driver.Viewport) {
+		ctx->Driver.Viewport(ctx, ctx->Viewport.X, ctx->Viewport.Y,
+				     ctx->Viewport.Width, ctx->Viewport.Height);
+	} else {
+
+	}
+#endif
+	ctx->NewState |= _NEW_VIEWPORT;
+
+	/* Set state we know depends on drawable parameters:
+	 */
+	radeonUpdateScissor(ctx);
+	radeon->NewGLState |= _NEW_SCISSOR;
+
+	if (ctx->Driver.DepthRange)
+		ctx->Driver.DepthRange(ctx,
+				       ctx->Viewport.Near,
+				       ctx->Viewport.Far);
+
+	/* Update culling direction which changes depending on the
+	 * orientation of the buffer:
+	 */
+	if (ctx->Driver.FrontFace)
+		ctx->Driver.FrontFace(ctx, ctx->Polygon.FrontFace);
+	else
+		ctx->NewState |= _NEW_POLYGON;
+}
+
+/**
+ * Called via glDrawBuffer.
+ */
+void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
+{
+	if (RADEON_DEBUG & DEBUG_DRI)
+		fprintf(stderr, "%s %s\n", __FUNCTION__,
+			_mesa_lookup_enum_by_nr( mode ));
+
+	if (ctx->DrawBuffer->Name == 0) {
+		radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+
+		const GLboolean was_front_buffer_rendering =
+			radeon->is_front_buffer_rendering;
+
+		radeon->is_front_buffer_rendering = (mode == GL_FRONT_LEFT) ||
+                                            (mode == GL_FRONT);
+
+      /* If we weren't front-buffer rendering before but we are now, make sure
+       * that the front-buffer has actually been allocated.
+       */
+		if (!was_front_buffer_rendering && radeon->is_front_buffer_rendering) {
+			radeon_update_renderbuffers(radeon->dri.context,
+				radeon->dri.context->driDrawablePriv);
+      }
+	}
+
+	radeon_draw_buffer(ctx, ctx->DrawBuffer);
+}
+
+void radeonReadBuffer( GLcontext *ctx, GLenum mode )
+{
+	if ((ctx->DrawBuffer != NULL) && (ctx->DrawBuffer->Name == 0)) {
+		struct radeon_context *const rmesa = RADEON_CONTEXT(ctx);
+		const GLboolean was_front_buffer_reading = rmesa->is_front_buffer_reading;
+		rmesa->is_front_buffer_reading = (mode == GL_FRONT_LEFT)
+					|| (mode == GL_FRONT);
+
+		if (!was_front_buffer_reading && rmesa->is_front_buffer_reading) {
+			radeon_update_renderbuffers(rmesa->dri.context,
+						    rmesa->dri.context->driReadablePriv);
+	 	}
+	}
+	/* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
+	if (ctx->ReadBuffer == ctx->DrawBuffer) {
+		/* This will update FBO completeness status.
+		 * A framebuffer will be incomplete if the GL_READ_BUFFER setting
+		 * refers to a missing renderbuffer.  Calling glReadBuffer can set
+		 * that straight and can make the drawing buffer complete.
+		 */
+		radeon_draw_buffer(ctx, ctx->DrawBuffer);
+	}
+}
+
+
+/* Turn on/off page flipping according to the flags in the sarea:
+ */
+void radeonUpdatePageFlipping(radeonContextPtr radeon)
+{
+	struct radeon_framebuffer *rfb = radeon_get_drawable(radeon)->driverPrivate;
+
+	rfb->pf_active = radeon->sarea->pfState;
+	rfb->pf_current_page = radeon->sarea->pfCurrentPage;
+	rfb->pf_num_pages = 2;
+	radeon_flip_renderbuffers(rfb);
+	radeon_draw_buffer(radeon->glCtx, radeon->glCtx->DrawBuffer);
+}
+
+void radeon_window_moved(radeonContextPtr radeon)
+{
+	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+		radeonUpdatePageFlipping(radeon);
+	}
+	radeonSetCliprects(radeon);
+}
+
+void radeon_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	__DRIcontext *driContext = radeon->dri.context;
+	void (*old_viewport)(GLcontext *ctx, GLint x, GLint y,
+			     GLsizei w, GLsizei h);
+
+	if (!driContext->driScreenPriv->dri2.enabled)
+		return;
+
+	if (!radeon->meta.internal_viewport_call && ctx->DrawBuffer->Name == 0) {
+		if (radeon->is_front_buffer_rendering) {
+			radeonFlush(ctx);
+		}
+		radeon_update_renderbuffers(driContext, driContext->driDrawablePriv);
+		if (driContext->driDrawablePriv != driContext->driReadablePriv)
+			radeon_update_renderbuffers(driContext, driContext->driReadablePriv);
+	}
+
+	old_viewport = ctx->Driver.Viewport;
+	ctx->Driver.Viewport = NULL;
+	radeon_window_moved(radeon);
+	radeon_draw_buffer(ctx, radeon->glCtx->DrawBuffer);
+	ctx->Driver.Viewport = old_viewport;
+}
+
+static void radeon_print_state_atom(radeonContextPtr radeon, struct radeon_state_atom *state)
+{
+	int i, j, reg;
+	int dwords = (*state->check) (radeon->glCtx, state);
+	drm_r300_cmd_header_t cmd;
+
+	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords, state->cmd_size);
+
+	if (RADEON_DEBUG & DEBUG_VERBOSE) {
+		for (i = 0; i < dwords;) {
+			cmd = *((drm_r300_cmd_header_t *) &state->cmd[i]);
+			reg = (cmd.packet0.reghi << 8) | cmd.packet0.reglo;
+			fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
+					state->name, i, reg, cmd.packet0.count);
+			++i;
+			for (j = 0; j < cmd.packet0.count && i < dwords; j++) {
+				fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
+						state->name, i, reg, state->cmd[i]);
+				reg += 4;
+				++i;
+			}
+		}
+	}
+}
+
+static void radeon_print_state_atom_kmm(radeonContextPtr radeon, struct radeon_state_atom *state)
+{
+	int i, j, reg, count;
+	int dwords = (*state->check) (radeon->glCtx, state);
+	uint32_t packet0;
+
+	fprintf(stderr, "  emit %s %d/%d\n", state->name, dwords, state->cmd_size);
+
+	if (RADEON_DEBUG & DEBUG_VERBOSE) {
+		for (i = 0; i < dwords;) {
+			packet0 = state->cmd[i];
+			reg = (packet0 & 0x1FFF) << 2;
+			count = ((packet0 & 0x3FFF0000) >> 16) + 1;
+			fprintf(stderr, "      %s[%d]: cmdpacket0 (first reg=0x%04x, count=%d)\n",
+					state->name, i, reg, count);
+			++i;
+			for (j = 0; j < count && i < dwords; j++) {
+				fprintf(stderr, "      %s[%d]: 0x%04x = %08x\n",
+						state->name, i, reg, state->cmd[i]);
+				reg += 4;
+				++i;
+			}
+		}
+	}
+}
+
+static INLINE void radeonEmitAtoms(radeonContextPtr radeon, GLboolean dirty)
+{
+	BATCH_LOCALS(radeon);
+	struct radeon_state_atom *atom;
+	int dwords;
+
+	if (radeon->vtbl.pre_emit_atoms)
+		radeon->vtbl.pre_emit_atoms(radeon);
+
+	/* Emit actual atoms */
+	foreach(atom, &radeon->hw.atomlist) {
+		if ((atom->dirty || radeon->hw.all_dirty) == dirty) {
+			dwords = (*atom->check) (radeon->glCtx, atom);
+			if (dwords) {
+				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
+					if (radeon->radeonScreen->kernel_mm)
+						radeon_print_state_atom_kmm(radeon, atom);
+					else
+						radeon_print_state_atom(radeon, atom);
+				}
+				if (atom->emit) {
+					(*atom->emit)(radeon->glCtx, atom);
+				} else {
+					BEGIN_BATCH_NO_AUTOSTATE(dwords);
+					OUT_BATCH_TABLE(atom->cmd, dwords);
+					END_BATCH();
+				}
+				atom->dirty = GL_FALSE;
+			} else {
+				if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_STATE) {
+					fprintf(stderr, "  skip state %s\n",
+						atom->name);
+				}
+			}
+		}
+	}
+
+	COMMIT_BATCH();
+}
+
+static GLboolean radeon_revalidate_bos(GLcontext *ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	int ret;
+
+	ret = radeon_cs_space_check(radeon->cmdbuf.cs);
+	if (ret == RADEON_CS_SPACE_FLUSH)
+		return GL_FALSE;
+	return GL_TRUE;
+}
+
+void radeonEmitState(radeonContextPtr radeon)
+{
+	if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (radeon->vtbl.pre_emit_state)
+		radeon->vtbl.pre_emit_state(radeon);
+
+	/* this code used to return here but now it emits zbs */
+	if (radeon->cmdbuf.cs->cdw && !radeon->hw.is_dirty && !radeon->hw.all_dirty)
+		return;
+
+	/* To avoid going across the entire set of states multiple times, just check
+	 * for enough space for the case of emitting all state, and inline the
+	 * radeonAllocCmdBuf code here without all the checks.
+	 */
+	rcommonEnsureCmdBufSpace(radeon, radeon->hw.max_state_size, __FUNCTION__);
+
+	if (!radeon->cmdbuf.cs->cdw) {
+		if (RADEON_DEBUG & DEBUG_STATE)
+			fprintf(stderr, "Begin reemit state\n");
+
+		radeonEmitAtoms(radeon, GL_FALSE);
+	}
+
+	if (RADEON_DEBUG & DEBUG_STATE)
+		fprintf(stderr, "Begin dirty state\n");
+
+	radeonEmitAtoms(radeon, GL_TRUE);
+	radeon->hw.is_dirty = GL_FALSE;
+	radeon->hw.all_dirty = GL_FALSE;
+
+}
+
+
+void radeonFlush(GLcontext *ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, radeon->cmdbuf.cs->cdw);
+
+	/* okay if we have no cmds in the buffer &&
+	   we have no DMA flush &&
+	   we have no DMA buffer allocated.
+	   then no point flushing anything at all.
+	*/
+	if (!radeon->dma.flush && !radeon->cmdbuf.cs->cdw && !radeon->dma.current)
+		return;
+
+	if (radeon->dma.flush)
+		radeon->dma.flush( ctx );
+
+	radeonEmitState(radeon);
+
+	if (radeon->cmdbuf.cs->cdw)
+		rcommonFlushCmdBuf(radeon, __FUNCTION__);
+
+	if ((ctx->DrawBuffer->Name == 0) && radeon->front_buffer_dirty) {
+		__DRIscreen *const screen = radeon->radeonScreen->driScreen;
+
+		if (screen->dri2.loader && (screen->dri2.loader->base.version >= 2)
+			&& (screen->dri2.loader->flushFrontBuffer != NULL)) {
+			__DRIdrawablePrivate * drawable = radeon_get_drawable(radeon);
+			(*screen->dri2.loader->flushFrontBuffer)(drawable, drawable->loaderPrivate);
+
+			/* Only clear the dirty bit if front-buffer rendering is no longer
+			 * enabled.  This is done so that the dirty bit can only be set in
+			 * glDrawBuffer.  Otherwise the dirty bit would have to be set at
+			 * each of N places that do rendering.  This has worse performances,
+			 * but it is much easier to get correct.
+			 */
+			if (radeon->is_front_buffer_rendering) {
+				radeon->front_buffer_dirty = GL_FALSE;
+			}
+		}
+	}
+}
+
+/* Make sure all commands have been sent to the hardware and have
+ * completed processing.
+ */
+void radeonFinish(GLcontext * ctx)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	struct gl_framebuffer *fb = ctx->DrawBuffer;
+	int i;
+
+	if (ctx->Driver.Flush)
+		ctx->Driver.Flush(ctx); /* +r6/r7 */
+
+	if (radeon->radeonScreen->kernel_mm) {
+		for (i = 0; i < fb->_NumColorDrawBuffers; i++) {
+			struct radeon_renderbuffer *rrb;
+			rrb = radeon_renderbuffer(fb->_ColorDrawBuffers[i]);
+			if (rrb && rrb->bo)
+				radeon_bo_wait(rrb->bo);
+		}
+		{
+			struct radeon_renderbuffer *rrb;
+			rrb = radeon_get_depthbuffer(radeon);
+			if (rrb && rrb->bo)
+				radeon_bo_wait(rrb->bo);
+		}
+	} else if (radeon->do_irqs) {
+		LOCK_HARDWARE(radeon);
+		radeonEmitIrqLocked(radeon);
+		UNLOCK_HARDWARE(radeon);
+		radeonWaitIrq(radeon);
+	} else {
+		radeonWaitForIdle(radeon);
+	}
+}
+
+/* cmdbuffer */
+/**
+ * Send the current command buffer via ioctl to the hardware.
+ */
+int rcommonFlushCmdBufLocked(radeonContextPtr rmesa, const char *caller)
+{
+	int ret = 0;
+
+	if (rmesa->cmdbuf.flushing) {
+		fprintf(stderr, "Recursive call into r300FlushCmdBufLocked!\n");
+		exit(-1);
+	}
+	rmesa->cmdbuf.flushing = 1;
+
+	if (RADEON_DEBUG & DEBUG_IOCTL) {
+		fprintf(stderr, "%s from %s - %i cliprects\n",
+			__FUNCTION__, caller, rmesa->numClipRects);
+	}
+
+	if (rmesa->cmdbuf.cs->cdw) {
+		ret = radeon_cs_emit(rmesa->cmdbuf.cs);
+		rmesa->hw.all_dirty = GL_TRUE;
+	}
+	radeon_cs_erase(rmesa->cmdbuf.cs);
+	rmesa->cmdbuf.flushing = 0;
+
+	if (radeon_revalidate_bos(rmesa->glCtx) == GL_FALSE) {
+		fprintf(stderr,"failed to revalidate buffers\n");
+	}
+
+	return ret;
+}
+
+int rcommonFlushCmdBuf(radeonContextPtr rmesa, const char *caller)
+{
+	int ret;
+
+	radeonReleaseDmaRegion(rmesa);
+
+	LOCK_HARDWARE(rmesa);
+	ret = rcommonFlushCmdBufLocked(rmesa, caller);
+	UNLOCK_HARDWARE(rmesa);
+
+	if (ret) {
+		fprintf(stderr, "drmRadeonCmdBuffer: %d\n", ret);
+		_mesa_exit(ret);
+	}
+
+	return ret;
+}
+
+/**
+ * Make sure that enough space is available in the command buffer
+ * by flushing if necessary.
+ *
+ * \param dwords The number of dwords we need to be free on the command buffer
+ */
+void rcommonEnsureCmdBufSpace(radeonContextPtr rmesa, int dwords, const char *caller)
+{
+	if ((rmesa->cmdbuf.cs->cdw + dwords + 128) > rmesa->cmdbuf.size ||
+	    radeon_cs_need_flush(rmesa->cmdbuf.cs)) {
+		rcommonFlushCmdBuf(rmesa, caller);
+	}
+}
+
+void rcommonInitCmdBuf(radeonContextPtr rmesa)
+{
+	GLuint size;
+	/* Initialize command buffer */
+	size = 256 * driQueryOptioni(&rmesa->optionCache,
+				     "command_buffer_size");
+	if (size < 2 * rmesa->hw.max_state_size) {
+		size = 2 * rmesa->hw.max_state_size + 65535;
+	}
+	if (size > 64 * 256)
+		size = 64 * 256;
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA)) {
+		fprintf(stderr, "sizeof(drm_r300_cmd_header_t)=%zd\n",
+			sizeof(drm_r300_cmd_header_t));
+		fprintf(stderr, "sizeof(drm_radeon_cmd_buffer_t)=%zd\n",
+			sizeof(drm_radeon_cmd_buffer_t));
+		fprintf(stderr,
+			"Allocating %d bytes command buffer (max state is %d bytes)\n",
+			size * 4, rmesa->hw.max_state_size * 4);
+	}
+
+	if (rmesa->radeonScreen->kernel_mm) {
+		int fd = rmesa->radeonScreen->driScreen->fd;
+		rmesa->cmdbuf.csm = radeon_cs_manager_gem_ctor(fd);
+	} else {
+		rmesa->cmdbuf.csm = radeon_cs_manager_legacy_ctor(rmesa);
+	}
+	if (rmesa->cmdbuf.csm == NULL) {
+		/* FIXME: fatal error */
+		return;
+	}
+	rmesa->cmdbuf.cs = radeon_cs_create(rmesa->cmdbuf.csm, size);
+	assert(rmesa->cmdbuf.cs != NULL);
+	rmesa->cmdbuf.size = size;
+
+	radeon_cs_space_set_flush(rmesa->cmdbuf.cs,
+				  (void (*)(void *))radeonFlush, rmesa->glCtx);
+
+	if (!rmesa->radeonScreen->kernel_mm) {
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, rmesa->radeonScreen->texSize[0]);
+		radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, rmesa->radeonScreen->gartTextures.size);
+	} else {
+		struct drm_radeon_gem_info mminfo = { 0 };
+
+		if (!drmCommandWriteRead(rmesa->dri.fd, DRM_RADEON_GEM_INFO, &mminfo, sizeof(mminfo)))
+		{
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_VRAM, mminfo.vram_visible);
+			radeon_cs_set_limit(rmesa->cmdbuf.cs, RADEON_GEM_DOMAIN_GTT, mminfo.gart_size);
+		}
+	}
+
+}
+/**
+ * Destroy the command buffer
+ */
+void rcommonDestroyCmdBuf(radeonContextPtr rmesa)
+{
+	radeon_cs_destroy(rmesa->cmdbuf.cs);
+	if (rmesa->radeonScreen->driScreen->dri2.enabled || rmesa->radeonScreen->kernel_mm) {
+		radeon_cs_manager_gem_dtor(rmesa->cmdbuf.csm);
+	} else {
+		radeon_cs_manager_legacy_dtor(rmesa->cmdbuf.csm);
+	}
+}
+
+void rcommonBeginBatch(radeonContextPtr rmesa, int n,
+		       int dostate,
+		       const char *file,
+		       const char *function,
+		       int line)
+{
+	rcommonEnsureCmdBufSpace(rmesa, n, function);
+	if (!rmesa->cmdbuf.cs->cdw && dostate) {
+		if (RADEON_DEBUG & DEBUG_IOCTL)
+			fprintf(stderr, "Reemit state after flush (from %s)\n", function);
+		radeonEmitState(rmesa);
+	}
+	radeon_cs_begin(rmesa->cmdbuf.cs, n, file, function, line);
+
+        if (DEBUG_CMDBUF && RADEON_DEBUG & DEBUG_IOCTL)
+                fprintf(stderr, "BEGIN_BATCH(%d) at %d, from %s:%i\n",
+                        n, rmesa->cmdbuf.cs->cdw, function, line);
+
+}
+
+void radeonUserClear(GLcontext *ctx, GLuint mask)
+{
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   meta_clear_tris(&rmesa->meta, mask);
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_common.h b/src/mesa/drivers/dri/radeon/radeon_common.h
new file mode 100644
index 00000000000..cebae18b2d6
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common.h
@@ -0,0 +1,85 @@
+#ifndef COMMON_MISC_H
+#define COMMON_MISC_H
+
+#include "radeon_common_context.h"
+#include "radeon_dma.h"
+#include "radeon_texture.h"
+
+void radeonUserClear(GLcontext *ctx, GLuint mask);
+void radeonRecalcScissorRects(radeonContextPtr radeon);
+void radeonSetCliprects(radeonContextPtr radeon);
+void radeonUpdateScissor( GLcontext *ctx );
+void radeonScissor(GLcontext* ctx, GLint x, GLint y, GLsizei w, GLsizei h);
+
+void radeonWaitForIdleLocked(radeonContextPtr radeon);
+extern uint32_t radeonGetAge(radeonContextPtr radeon);
+void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
+		       const drm_clip_rect_t	  *rect);
+void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
+void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
+			 int x, int y, int w, int h );
+
+void radeonUpdatePageFlipping(radeonContextPtr rmesa);
+
+void radeonFlush(GLcontext *ctx);
+void radeonFinish(GLcontext * ctx);
+void radeonEmitState(radeonContextPtr radeon);
+
+void radeon_clear_tris(GLcontext *ctx, GLbitfield mask);
+
+void radeon_window_moved(radeonContextPtr radeon);
+void radeon_draw_buffer(GLcontext *ctx, struct gl_framebuffer *fb);
+void radeonDrawBuffer( GLcontext *ctx, GLenum mode );
+void radeonReadBuffer( GLcontext *ctx, GLenum mode );
+void radeon_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei width, GLsizei height);
+void radeon_get_cliprects(radeonContextPtr radeon,
+			  struct drm_clip_rect **cliprects,
+			  unsigned int *num_cliprects,
+			  int *x_off, int *y_off);
+void radeon_fbo_init(struct radeon_context *radeon);
+void
+radeon_renderbuffer_set_bo(struct radeon_renderbuffer *rb,
+			   struct radeon_bo *bo);
+struct radeon_renderbuffer *
+radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv);
+static inline struct radeon_renderbuffer *radeon_renderbuffer(struct gl_renderbuffer *rb)
+{
+	struct radeon_renderbuffer *rrb = (struct radeon_renderbuffer *)rb;
+	if (rrb && rrb->base.ClassID == RADEON_RB_CLASS)
+		return rrb;
+	else
+		return NULL;
+}
+
+static inline struct radeon_renderbuffer *radeon_get_renderbuffer(struct gl_framebuffer *fb, int att_index)
+{
+	if (att_index >= 0)
+		return radeon_renderbuffer(fb->Attachment[att_index].Renderbuffer);
+	else
+		return NULL;
+}
+
+static inline struct radeon_renderbuffer *radeon_get_depthbuffer(radeonContextPtr rmesa)
+{
+	struct radeon_renderbuffer *rrb;
+	rrb = radeon_renderbuffer(rmesa->state.depth.rb);
+	if (!rrb)
+		return NULL;
+
+	return rrb;
+}
+
+static inline struct radeon_renderbuffer *radeon_get_colorbuffer(radeonContextPtr rmesa)
+{
+	struct radeon_renderbuffer *rrb;
+
+	rrb = radeon_renderbuffer(rmesa->state.color.rb);
+	if (!rrb)
+		return NULL;
+	return rrb;
+}
+
+#include "radeon_cmdbuf.h"
+
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.c b/src/mesa/drivers/dri/radeon/radeon_common_context.c
new file mode 100644
index 00000000000..285e015c924
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.c
@@ -0,0 +1,864 @@
+/**************************************************************************
+
+Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
+                     VA Linux Systems Inc., Fremont, California.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+#include "radeon_common.h"
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+#include "utils.h"
+#include "vblank.h"
+#include "drirenderbuffer.h"
+#include "main/context.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/state.h"
+#include "main/simple_list.h"
+#include "swrast/swrast.h"
+#include "swrast_setup/swrast_setup.h"
+#include "tnl/tnl.h"
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
+#include "r600_context.h"
+#endif
+
+#define DRIVER_DATE "20090101"
+
+#ifndef RADEON_DEBUG
+int RADEON_DEBUG = (0);
+#endif
+
+
+static const char* get_chip_family_name(int chip_family)
+{
+	switch(chip_family) {
+	case CHIP_FAMILY_R100: return "R100";
+	case CHIP_FAMILY_RV100: return "RV100";
+	case CHIP_FAMILY_RS100: return "RS100";
+	case CHIP_FAMILY_RV200: return "RV200";
+	case CHIP_FAMILY_RS200: return "RS200";
+	case CHIP_FAMILY_R200: return "R200";
+	case CHIP_FAMILY_RV250: return "RV250";
+	case CHIP_FAMILY_RS300: return "RS300";
+	case CHIP_FAMILY_RV280: return "RV280";
+	case CHIP_FAMILY_R300: return "R300";
+	case CHIP_FAMILY_R350: return "R350";
+	case CHIP_FAMILY_RV350: return "RV350";
+	case CHIP_FAMILY_RV380: return "RV380";
+	case CHIP_FAMILY_R420: return "R420";
+	case CHIP_FAMILY_RV410: return "RV410";
+	case CHIP_FAMILY_RS400: return "RS400";
+	case CHIP_FAMILY_RS600: return "RS600";
+	case CHIP_FAMILY_RS690: return "RS690";
+	case CHIP_FAMILY_RS740: return "RS740";
+	case CHIP_FAMILY_RV515: return "RV515";
+	case CHIP_FAMILY_R520: return "R520";
+	case CHIP_FAMILY_RV530: return "RV530";
+	case CHIP_FAMILY_R580: return "R580";
+	case CHIP_FAMILY_RV560: return "RV560";
+	case CHIP_FAMILY_RV570: return "RV570";
+	default: return "unknown";
+	}
+}
+
+
+/* Return various strings for glGetString().
+ */
+static const GLubyte *radeonGetString(GLcontext * ctx, GLenum name)
+{
+	radeonContextPtr radeon = RADEON_CONTEXT(ctx);
+	static char buffer[128];
+
+	switch (name) {
+	case GL_VENDOR:
+		if (IS_R600_CLASS(radeon->radeonScreen))
+			return (GLubyte *) "Advanced Micro Devices, Inc.";
+		else if (IS_R300_CLASS(radeon->radeonScreen))
+			return (GLubyte *) "DRI R300 Project";
+		else
+			return (GLubyte *) "Tungsten Graphics, Inc.";
+
+	case GL_RENDERER:
+	{
+		unsigned offset;
+		GLuint agp_mode = (radeon->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
+			radeon->radeonScreen->AGPMode;
+		const char* chipclass;
+		char hardwarename[32];
+
+		if (IS_R600_CLASS(radeon->radeonScreen))
+			chipclass = "R600";
+		else if (IS_R300_CLASS(radeon->radeonScreen))
+			chipclass = "R300";
+		else if (IS_R200_CLASS(radeon->radeonScreen))
+			chipclass = "R200";
+		else
+			chipclass = "R100";
+
+		sprintf(hardwarename, "%s (%s %04X)",
+		        chipclass,
+		        get_chip_family_name(radeon->radeonScreen->chip_family),
+		        radeon->radeonScreen->device_id);
+
+		offset = driGetRendererString(buffer, hardwarename, DRIVER_DATE,
+					      agp_mode);
+
+		if (IS_R600_CLASS(radeon->radeonScreen)) {
+			sprintf(&buffer[offset], " TCL");
+		} else if (IS_R300_CLASS(radeon->radeonScreen)) {
+			sprintf(&buffer[offset], " %sTCL",
+				(radeon->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)
+				? "" : "NO-");
+		} else {
+			sprintf(&buffer[offset], " %sTCL",
+				!(radeon->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
+				? "" : "NO-");
+		}
+
+		if (radeon->radeonScreen->driScreen->dri2.enabled)
+			strcat(buffer, " DRI2");
+
+		return (GLubyte *) buffer;
+	}
+
+	default:
+		return NULL;
+	}
+}
+
+/* Initialize the driver's misc functions.
+ */
+static void radeonInitDriverFuncs(struct dd_function_table *functions)
+{
+	functions->GetString = radeonGetString;
+}
+
+/**
+ * Create and initialize all common fields of the context,
+ * including the Mesa context itself.
+ */
+GLboolean radeonInitContext(radeonContextPtr radeon,
+			    struct dd_function_table* functions,
+			    const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate)
+{
+	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+	GLcontext* ctx;
+	GLcontext* shareCtx;
+	int fthrottle_mode;
+
+	/* Fill in additional standard functions. */
+	radeonInitDriverFuncs(functions);
+
+	radeon->radeonScreen = screen;
+	/* Allocate and initialize the Mesa context */
+	if (sharedContextPrivate)
+		shareCtx = ((radeonContextPtr)sharedContextPrivate)->glCtx;
+	else
+		shareCtx = NULL;
+	radeon->glCtx = _mesa_create_context(glVisual, shareCtx,
+					    functions, (void *)radeon);
+	if (!radeon->glCtx)
+		return GL_FALSE;
+
+	ctx = radeon->glCtx;
+	driContextPriv->driverPrivate = radeon;
+
+	meta_init_metaops(ctx, &radeon->meta);
+	/* DRI fields */
+	radeon->dri.context = driContextPriv;
+	radeon->dri.screen = sPriv;
+	radeon->dri.hwContext = driContextPriv->hHWContext;
+	radeon->dri.hwLock = &sPriv->pSAREA->lock;
+	radeon->dri.fd = sPriv->fd;
+	radeon->dri.drmMinor = sPriv->drm_version.minor;
+
+	radeon->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
+					       screen->sarea_priv_offset);
+
+	/* Setup IRQs */
+	fthrottle_mode = driQueryOptioni(&radeon->optionCache, "fthrottle_mode");
+	radeon->iw.irq_seq = -1;
+	radeon->irqsEmitted = 0;
+	if (IS_R600_CLASS(radeon->radeonScreen))
+		radeon->do_irqs = 0;
+	else
+		radeon->do_irqs = (fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS &&
+				   radeon->radeonScreen->irq);
+
+	radeon->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+
+	if (!radeon->do_irqs)
+		fprintf(stderr,
+			"IRQ's not enabled, falling back to %s: %d %d\n",
+			radeon->do_usleeps ? "usleeps" : "busy waits",
+			fthrottle_mode, radeon->radeonScreen->irq);
+
+        radeon->texture_depth = driQueryOptioni (&radeon->optionCache,
+					        "texture_depth");
+        if (radeon->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
+                radeon->texture_depth = ( glVisual->rgbBits > 16 ) ?
+	        DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
+
+	radeon->texture_row_align = 32;
+
+	return GL_TRUE;
+}
+
+
+
+/**
+ * Destroy the command buffer and state atoms.
+ */
+static void radeon_destroy_atom_list(radeonContextPtr radeon)
+{
+	struct radeon_state_atom *atom;
+
+	foreach(atom, &radeon->hw.atomlist) {
+		FREE(atom->cmd);
+		if (atom->lastcmd)
+			FREE(atom->lastcmd);
+	}
+
+}
+
+/**
+ * Cleanup common context fields.
+ * Called by r200DestroyContext/r300DestroyContext
+ */
+void radeonDestroyContext(__DRIcontextPrivate *driContextPriv )
+{
+#ifdef RADEON_BO_TRACK
+	FILE *track;
+#endif
+	GET_CURRENT_CONTEXT(ctx);
+	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+	radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
+
+    /* +r6/r7 */
+    __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
+	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+    /* --------- */
+
+	if (radeon == current) {
+		radeon_firevertices(radeon);
+		_mesa_make_current(NULL, NULL, NULL);
+	}
+
+	assert(radeon);
+	if (radeon) 
+    {
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
+	    if (IS_R600_CLASS(screen))
+        {
+		r600DestroyContext(driContextPriv);
+        }
+#endif
+
+		if (radeon->dma.current) {
+			rcommonFlushCmdBuf( radeon, __FUNCTION__ );
+		}
+
+		radeonReleaseArrays(radeon->glCtx, ~0);
+		meta_destroy_metaops(&radeon->meta);
+		if (radeon->vtbl.free_context)
+			radeon->vtbl.free_context(radeon->glCtx);
+		_swsetup_DestroyContext( radeon->glCtx );
+		_tnl_DestroyContext( radeon->glCtx );
+		_vbo_DestroyContext( radeon->glCtx );
+		_swrast_DestroyContext( radeon->glCtx );
+
+		/* free atom list */
+		/* free the Mesa context */
+		_mesa_destroy_context(radeon->glCtx);
+
+		/* _mesa_destroy_context() might result in calls to functions that
+		 * depend on the DriverCtx, so don't set it to NULL before.
+		 *
+		 * radeon->glCtx->DriverCtx = NULL;
+		 */
+		/* free the option cache */
+		driDestroyOptionCache(&radeon->optionCache);
+
+		rcommonDestroyCmdBuf(radeon);
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600) /* +r6/r7 */
+	    if (!IS_R600_CLASS(screen))
+#endif
+		radeon_destroy_atom_list(radeon);
+
+		if (radeon->state.scissor.pClipRects) {
+			FREE(radeon->state.scissor.pClipRects);
+			radeon->state.scissor.pClipRects = 0;
+		}
+	}
+#ifdef RADEON_BO_TRACK
+	track = fopen("/tmp/tracklog", "w");
+	if (track) {
+		radeon_tracker_print(&radeon->radeonScreen->bom->tracker, track);
+		fclose(track);
+	}
+#endif
+	FREE(radeon);
+}
+
+/* Force the context `c' to be unbound from its buffer.
+ */
+GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv)
+{
+	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+
+	if (RADEON_DEBUG & DEBUG_DRI)
+		fprintf(stderr, "%s ctx %p\n", __FUNCTION__,
+			radeon->glCtx);
+
+	return GL_TRUE;
+}
+
+
+static void
+radeon_make_kernel_renderbuffer_current(radeonContextPtr radeon,
+					struct radeon_framebuffer *draw)
+{
+	/* if radeon->fake */
+	struct radeon_renderbuffer *rb;
+
+	if ((rb = (void *)draw->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->frontOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Front Buf");
+#else
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->frontOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->frontPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->backOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Back Buf");
+#else
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->backOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->backPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_DEPTH].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Z Buf");
+#else
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_STENCIL].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Stencil Buf");
+#else
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+}
+
+static void
+radeon_make_renderbuffer_current(radeonContextPtr radeon,
+				 struct radeon_framebuffer *draw)
+{
+	int size = 4096*4096*4;
+	/* if radeon->fake */
+	struct radeon_renderbuffer *rb;
+
+	if (radeon->radeonScreen->kernel_mm) {
+		radeon_make_kernel_renderbuffer_current(radeon, draw);
+		return;
+	}
+
+
+	if ((rb = (void *)draw->base.Attachment[BUFFER_FRONT_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->frontOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Front Buf");
+#else
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->frontOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->frontPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->backOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Back Buf");
+#else
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->backOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->backPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_DEPTH].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Z Buf");
+#else
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+	if ((rb = (void *)draw->base.Attachment[BUFFER_STENCIL].Renderbuffer)) {
+		if (!rb->bo) {
+#ifdef RADEON_DEBUG_BO
+			rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0,
+                        "Stencil Buf");
+#else
+            rb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+						radeon->radeonScreen->depthOffset +
+						radeon->radeonScreen->fbLocation,
+						size,
+						4096,
+						RADEON_GEM_DOMAIN_VRAM,
+						0);
+#endif /* RADEON_DEBUG_BO */
+		}
+		rb->cpp = radeon->radeonScreen->cpp;
+		rb->pitch = radeon->radeonScreen->depthPitch * rb->cpp;
+	}
+}
+
+static unsigned
+radeon_bits_per_pixel(const struct radeon_renderbuffer *rb)
+{
+   switch (rb->base._ActualFormat) {
+   case GL_RGB5:
+   case GL_DEPTH_COMPONENT16:
+      return 16;
+   case GL_RGB8:
+   case GL_RGBA8:
+   case GL_DEPTH_COMPONENT24:
+   case GL_DEPTH24_STENCIL8_EXT:
+   case GL_STENCIL_INDEX8_EXT:
+      return 32;
+   default:
+      return 0;
+   }
+}
+
+void
+radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable)
+{
+	unsigned int attachments[10];
+	__DRIbuffer *buffers = NULL;
+	__DRIscreen *screen;
+	struct radeon_renderbuffer *rb;
+	int i, count;
+	struct radeon_framebuffer *draw;
+	radeonContextPtr radeon;
+	char *regname;
+	struct radeon_bo *depth_bo = NULL, *bo;
+
+	if (RADEON_DEBUG & DEBUG_DRI)
+	    fprintf(stderr, "enter %s, drawable %p\n", __func__, drawable);
+
+	draw = drawable->driverPrivate;
+	screen = context->driScreenPriv;
+	radeon = (radeonContextPtr) context->driverPrivate;
+
+	if (screen->dri2.loader
+	   && (screen->dri2.loader->base.version > 2)
+	   && (screen->dri2.loader->getBuffersWithFormat != NULL)) {
+		struct radeon_renderbuffer *depth_rb;
+		struct radeon_renderbuffer *stencil_rb;
+
+		i = 0;
+		if ((radeon->is_front_buffer_rendering ||
+		     radeon->is_front_buffer_reading ||
+		     !draw->color_rb[1])
+		    && draw->color_rb[0]) {
+			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[0]);
+		}
+
+		if (draw->color_rb[1]) {
+			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+			attachments[i++] = radeon_bits_per_pixel(draw->color_rb[1]);
+		}
+
+		depth_rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+		stencil_rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+
+		if ((depth_rb != NULL) && (stencil_rb != NULL)) {
+			attachments[i++] = __DRI_BUFFER_DEPTH_STENCIL;
+			attachments[i++] = radeon_bits_per_pixel(depth_rb);
+		} else if (depth_rb != NULL) {
+			attachments[i++] = __DRI_BUFFER_DEPTH;
+			attachments[i++] = radeon_bits_per_pixel(depth_rb);
+		} else if (stencil_rb != NULL) {
+			attachments[i++] = __DRI_BUFFER_STENCIL;
+			attachments[i++] = radeon_bits_per_pixel(stencil_rb);
+		}
+
+		buffers = (*screen->dri2.loader->getBuffersWithFormat)(drawable,
+								&drawable->w,
+								&drawable->h,
+								attachments, i / 2,
+								&count,
+								drawable->loaderPrivate);
+	} else if (screen->dri2.loader) {
+		i = 0;
+		if (draw->color_rb[0])
+			attachments[i++] = __DRI_BUFFER_FRONT_LEFT;
+		if (draw->color_rb[1])
+			attachments[i++] = __DRI_BUFFER_BACK_LEFT;
+		if (radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH))
+			attachments[i++] = __DRI_BUFFER_DEPTH;
+		if (radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL))
+			attachments[i++] = __DRI_BUFFER_STENCIL;
+
+		buffers = (*screen->dri2.loader->getBuffers)(drawable,
+								 &drawable->w,
+								 &drawable->h,
+								 attachments, i,
+								 &count,
+								 drawable->loaderPrivate);
+	}
+
+	if (buffers == NULL)
+		return;
+
+	/* set one cliprect to cover the whole drawable */
+	drawable->x = 0;
+	drawable->y = 0;
+	drawable->backX = 0;
+	drawable->backY = 0;
+	drawable->numClipRects = 1;
+	drawable->pClipRects[0].x1 = 0;
+	drawable->pClipRects[0].y1 = 0;
+	drawable->pClipRects[0].x2 = drawable->w;
+	drawable->pClipRects[0].y2 = drawable->h;
+	drawable->numBackClipRects = 1;
+	drawable->pBackClipRects[0].x1 = 0;
+	drawable->pBackClipRects[0].y1 = 0;
+	drawable->pBackClipRects[0].x2 = drawable->w;
+	drawable->pBackClipRects[0].y2 = drawable->h;
+	for (i = 0; i < count; i++) {
+		switch (buffers[i].attachment) {
+		case __DRI_BUFFER_FRONT_LEFT:
+			rb = draw->color_rb[0];
+			regname = "dri2 front buffer";
+			break;
+		case __DRI_BUFFER_FAKE_FRONT_LEFT:
+			rb = draw->color_rb[0];
+			regname = "dri2 fake front buffer";
+			break;
+		case __DRI_BUFFER_BACK_LEFT:
+			rb = draw->color_rb[1];
+			regname = "dri2 back buffer";
+			break;
+		case __DRI_BUFFER_DEPTH:
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+			regname = "dri2 depth buffer";
+			break;
+		case __DRI_BUFFER_DEPTH_STENCIL:
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_DEPTH);
+			regname = "dri2 depth / stencil buffer";
+			break;
+		case __DRI_BUFFER_STENCIL:
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+			regname = "dri2 stencil buffer";
+			break;
+		case __DRI_BUFFER_ACCUM:
+		default:
+			fprintf(stderr,
+				"unhandled buffer attach event, attacment type %d\n",
+				buffers[i].attachment);
+			return;
+		}
+
+		if (rb == NULL)
+			continue;
+
+		if (rb->bo) {
+			uint32_t name = radeon_gem_name_bo(rb->bo);
+			if (name == buffers[i].name)
+				continue;
+		}
+
+		if (RADEON_DEBUG & DEBUG_DRI)
+			fprintf(stderr,
+				"attaching buffer %s, %d, at %d, cpp %d, pitch %d\n",
+				regname, buffers[i].name, buffers[i].attachment,
+				buffers[i].cpp, buffers[i].pitch);
+
+		rb->cpp = buffers[i].cpp;
+		rb->pitch = buffers[i].pitch;
+		rb->base.Width = drawable->w;
+		rb->base.Height = drawable->h;
+		rb->has_surface = 0;
+
+		if (buffers[i].attachment == __DRI_BUFFER_STENCIL && depth_bo) {
+			if (RADEON_DEBUG & DEBUG_DRI)
+				fprintf(stderr, "(reusing depth buffer as stencil)\n");
+			bo = depth_bo;
+			radeon_bo_ref(bo);
+		} else {
+#ifdef RADEON_DEBUG_BO
+            bo = radeon_bo_open(radeon->radeonScreen->bom,
+						buffers[i].name,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						buffers[i].flags,
+                        regname);
+#else
+			bo = radeon_bo_open(radeon->radeonScreen->bom,
+						buffers[i].name,
+						0,
+						0,
+						RADEON_GEM_DOMAIN_VRAM,
+						buffers[i].flags);
+#endif /* RADEON_DEBUG_BO */
+			if (bo == NULL) {
+
+				fprintf(stderr, "failed to attach %s %d\n",
+					regname, buffers[i].name);
+
+			}
+		}
+
+		if (buffers[i].attachment == __DRI_BUFFER_DEPTH) {
+			if (draw->base.Visual.depthBits == 16)
+				rb->cpp = 2;
+			depth_bo = bo;
+		}
+
+		radeon_renderbuffer_set_bo(rb, bo);
+		radeon_bo_unref(bo);
+
+		if (buffers[i].attachment == __DRI_BUFFER_DEPTH_STENCIL) {
+			rb = radeon_get_renderbuffer(&draw->base, BUFFER_STENCIL);
+			if (rb != NULL) {
+				struct radeon_bo *stencil_bo = NULL;
+
+				if (rb->bo) {
+					uint32_t name = radeon_gem_name_bo(rb->bo);
+					if (name == buffers[i].name)
+						continue;
+				}
+
+				stencil_bo = bo;
+				radeon_bo_ref(stencil_bo);
+				radeon_renderbuffer_set_bo(rb, stencil_bo);
+				radeon_bo_unref(stencil_bo);
+			}
+		}
+	}
+
+	driUpdateFramebufferSize(radeon->glCtx, drawable);
+}
+
+/* Force the context `c' to be the current context and associate with it
+ * buffer `b'.
+ */
+GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+			    __DRIdrawablePrivate * driDrawPriv,
+			    __DRIdrawablePrivate * driReadPriv)
+{
+	radeonContextPtr radeon;
+	struct radeon_framebuffer *drfb;
+	struct gl_framebuffer *readfb;
+
+	if (!driContextPriv) {
+		if (RADEON_DEBUG & DEBUG_DRI)
+			fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
+		_mesa_make_current(NULL, NULL, NULL);
+		return GL_TRUE;
+	}
+
+	radeon = (radeonContextPtr) driContextPriv->driverPrivate;
+	drfb = driDrawPriv->driverPrivate;
+	readfb = driReadPriv->driverPrivate;
+
+	if (driContextPriv->driScreenPriv->dri2.enabled) {
+		radeon_update_renderbuffers(driContextPriv, driDrawPriv);
+		if (driDrawPriv != driReadPriv)
+			radeon_update_renderbuffers(driContextPriv, driReadPriv);
+		_mesa_reference_renderbuffer(&radeon->state.color.rb,
+			&(radeon_get_renderbuffer(&drfb->base, BUFFER_BACK_LEFT)->base));
+		_mesa_reference_renderbuffer(&radeon->state.depth.rb,
+			&(radeon_get_renderbuffer(&drfb->base, BUFFER_DEPTH)->base));
+	} else {
+		radeon_make_renderbuffer_current(radeon, drfb);
+	}
+
+	if (RADEON_DEBUG & DEBUG_DRI)
+	     fprintf(stderr, "%s ctx %p dfb %p rfb %p\n", __FUNCTION__, radeon->glCtx, drfb, readfb);
+
+	driUpdateFramebufferSize(radeon->glCtx, driDrawPriv);
+	if (driReadPriv != driDrawPriv)
+		driUpdateFramebufferSize(radeon->glCtx, driReadPriv);
+
+	_mesa_make_current(radeon->glCtx, &drfb->base, readfb);
+
+	_mesa_update_state(radeon->glCtx);
+
+	if (radeon->glCtx->DrawBuffer == &drfb->base) {
+		if (driDrawPriv->swap_interval == (unsigned)-1) {
+			int i;
+			driDrawPriv->vblFlags =
+				(radeon->radeonScreen->irq != 0)
+				? driGetDefaultVBlankFlags(&radeon->
+							   optionCache)
+				: VBLANK_FLAG_NO_IRQ;
+
+			driDrawableInitVBlank(driDrawPriv);
+			drfb->vbl_waited = driDrawPriv->vblSeq;
+
+			for (i = 0; i < 2; i++) {
+				if (drfb->color_rb[i])
+					drfb->color_rb[i]->vbl_pending = driDrawPriv->vblSeq;
+			}
+
+		}
+
+		radeon_window_moved(radeon);
+		radeon_draw_buffer(radeon->glCtx, &drfb->base);
+	}
+
+
+	if (RADEON_DEBUG & DEBUG_DRI)
+		fprintf(stderr, "End %s\n", __FUNCTION__);
+
+	return GL_TRUE;
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_common_context.h b/src/mesa/drivers/dri/radeon/radeon_common_context.h
new file mode 100644
index 00000000000..e4a8da05965
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_common_context.h
@@ -0,0 +1,596 @@
+
+#ifndef COMMON_CONTEXT_H
+#define COMMON_CONTEXT_H
+
+#include "main/mm.h"
+#include "math/m_vector.h"
+#include "texmem.h"
+#include "tnl/t_context.h"
+#include "main/colormac.h"
+
+#include "radeon_screen.h"
+#include "radeon_drm.h"
+#include "dri_util.h"
+#include "tnl/t_vertex.h"
+
+#include "dri_metaops.h"
+struct radeon_context;
+
+#include "radeon_bocs_wrapper.h"
+
+/* This union is used to avoid warnings/miscompilation
+   with float to uint32_t casts due to strict-aliasing */
+typedef union { GLfloat f; uint32_t ui32; } float_ui32_type;
+
+struct radeon_context;
+typedef struct radeon_context radeonContextRec;
+typedef struct radeon_context *radeonContextPtr;
+
+
+#define TEX_0   0x1
+#define TEX_1   0x2
+#define TEX_2   0x4
+#define TEX_3	0x8
+#define TEX_4	0x10
+#define TEX_5	0x20
+
+/* Rasterizing fallbacks */
+/* See correponding strings in r200_swtcl.c */
+#define RADEON_FALLBACK_TEXTURE		0x0001
+#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
+#define RADEON_FALLBACK_STENCIL		0x0004
+#define RADEON_FALLBACK_RENDER_MODE	0x0008
+#define RADEON_FALLBACK_BLEND_EQ	0x0010
+#define RADEON_FALLBACK_BLEND_FUNC	0x0020
+#define RADEON_FALLBACK_DISABLE 	0x0040
+#define RADEON_FALLBACK_BORDER_MODE	0x0080
+#define RADEON_FALLBACK_DEPTH_BUFFER	0x0100
+#define RADEON_FALLBACK_STENCIL_BUFFER  0x0200
+
+#define R200_FALLBACK_TEXTURE           0x01
+#define R200_FALLBACK_DRAW_BUFFER       0x02
+#define R200_FALLBACK_STENCIL           0x04
+#define R200_FALLBACK_RENDER_MODE       0x08
+#define R200_FALLBACK_DISABLE           0x10
+#define R200_FALLBACK_BORDER_MODE       0x20
+
+#define RADEON_TCL_FALLBACK_RASTER            0x1 /* rasterization */
+#define RADEON_TCL_FALLBACK_UNFILLED          0x2 /* unfilled tris */
+#define RADEON_TCL_FALLBACK_LIGHT_TWOSIDE     0x4 /* twoside tris */
+#define RADEON_TCL_FALLBACK_MATERIAL          0x8 /* material in vb */
+#define RADEON_TCL_FALLBACK_TEXGEN_0          0x10 /* texgen, unit 0 */
+#define RADEON_TCL_FALLBACK_TEXGEN_1          0x20 /* texgen, unit 1 */
+#define RADEON_TCL_FALLBACK_TEXGEN_2          0x40 /* texgen, unit 2 */
+#define RADEON_TCL_FALLBACK_TCL_DISABLE       0x80 /* user disable */
+#define RADEON_TCL_FALLBACK_FOGCOORDSPEC      0x100 /* fogcoord, sep. spec light */
+
+/* The blit width for texture uploads
+ */
+#define BLIT_WIDTH_BYTES 1024
+
+/* Use the templated vertex format:
+ */
+#define COLOR_IS_RGBA
+#define TAG(x) radeon##x
+#include "tnl_dd/t_dd_vertex.h"
+#undef TAG
+
+#define RADEON_RB_CLASS 0xdeadbeef
+
+struct radeon_renderbuffer
+{
+	struct gl_renderbuffer base;
+	struct radeon_bo *bo;
+	unsigned int cpp;
+	/* unsigned int offset; */
+	unsigned int pitch;
+
+	uint32_t draw_offset; /* FBO */
+	/* boo Xorg 6.8.2 compat */
+	int has_surface;
+
+	GLuint pf_pending;  /**< sequence number of pending flip */
+	GLuint vbl_pending;   /**< vblank sequence number of pending flip */
+	__DRIdrawablePrivate *dPriv;
+};
+
+struct radeon_framebuffer
+{
+	struct gl_framebuffer base;
+
+	struct radeon_renderbuffer *color_rb[2];
+
+	GLuint vbl_waited;
+
+	/* buffer swap */
+	int64_t swap_ust;
+	int64_t swap_missed_ust;
+
+	GLuint swap_count;
+	GLuint swap_missed_count;
+
+	/* Drawable page flipping state */
+	GLboolean pf_active;
+	GLint pf_current_page;
+	GLint pf_num_pages;
+
+};
+
+
+struct radeon_colorbuffer_state {
+	GLuint clear;
+	int roundEnable;
+	struct gl_renderbuffer *rb;
+	uint32_t draw_offset; /* offset into color renderbuffer - FBOs */
+};
+
+struct radeon_depthbuffer_state {
+	GLuint clear;
+	struct gl_renderbuffer *rb;
+};
+
+struct radeon_scissor_state {
+	drm_clip_rect_t rect;
+	GLboolean enabled;
+
+	GLuint numClipRects;	/* Cliprects active */
+	GLuint numAllocedClipRects;	/* Cliprects available */
+	drm_clip_rect_t *pClipRects;
+};
+
+struct radeon_stencilbuffer_state {
+	GLuint clear;		/* rb3d_stencilrefmask value */
+};
+
+struct radeon_stipple_state {
+	GLuint mask[32];
+};
+
+struct radeon_state_atom {
+	struct radeon_state_atom *next, *prev;
+	const char *name;	/* for debug */
+	int cmd_size;		/* size in bytes */
+        GLuint idx;
+	GLuint is_tcl;
+        GLuint *cmd;		/* one or more cmd's */
+	GLuint *lastcmd;		/* one or more cmd's */
+	GLboolean dirty;	/* dirty-mark in emit_state_list */
+        int (*check) (GLcontext *, struct radeon_state_atom *atom); /* is this state active? */
+        void (*emit) (GLcontext *, struct radeon_state_atom *atom);
+};
+
+struct radeon_hw_state {
+  	/* Head of the linked list of state atoms. */
+	struct radeon_state_atom atomlist;
+	int max_state_size;	/* Number of bytes necessary for a full state emit. */
+	GLboolean is_dirty, all_dirty;
+};
+
+
+/* Texture related */
+typedef struct _radeon_texture_image radeon_texture_image;
+
+struct _radeon_texture_image {
+	struct gl_texture_image base;
+
+	/**
+	 * If mt != 0, the image is stored in hardware format in the
+	 * given mipmap tree. In this case, base.Data may point into the
+	 * mapping of the buffer object that contains the mipmap tree.
+	 *
+	 * If mt == 0, the image is stored in normal memory pointed to
+	 * by base.Data.
+	 */
+	struct _radeon_mipmap_tree *mt;
+	struct radeon_bo *bo;
+
+	int mtlevel; /** if mt != 0, this is the image's level in the mipmap tree */
+	int mtface; /** if mt != 0, this is the image's face in the mipmap tree */
+};
+
+
+static INLINE radeon_texture_image *get_radeon_texture_image(struct gl_texture_image *image)
+{
+	return (radeon_texture_image*)image;
+}
+
+
+typedef struct radeon_tex_obj radeonTexObj, *radeonTexObjPtr;
+
+#define RADEON_TXO_MICRO_TILE               (1 << 3)
+
+/* Texture object in locally shared texture space.
+ */
+struct radeon_tex_obj {
+	struct gl_texture_object base;
+	struct _radeon_mipmap_tree *mt;
+
+	/**
+	 * This is true if we've verified that the mipmap tree above is complete
+	 * and so on.
+	 */
+	GLboolean validated;
+
+	GLuint override_offset;
+	GLboolean image_override; /* Image overridden by GLX_EXT_tfp */
+	GLuint tile_bits;	/* hw texture tile bits used on this texture */
+        struct radeon_bo *bo;
+
+	GLuint pp_txfilter;	/* hardware register values */
+	GLuint pp_txformat;
+	GLuint pp_txformat_x;
+	GLuint pp_txsize;	/* npot only */
+	GLuint pp_txpitch;	/* npot only */
+	GLuint pp_border_color;
+	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
+
+        GLuint pp_txfilter_1;	/*  r300 */
+
+	/* r700 texture states */
+	GLuint SQ_TEX_RESOURCE0;
+	GLuint SQ_TEX_RESOURCE1;
+	GLuint SQ_TEX_RESOURCE2;
+	GLuint SQ_TEX_RESOURCE3;
+	GLuint SQ_TEX_RESOURCE4;
+	GLuint SQ_TEX_RESOURCE5;
+	GLuint SQ_TEX_RESOURCE6;
+
+	GLuint SQ_TEX_SAMPLER0;
+	GLuint SQ_TEX_SAMPLER1;
+	GLuint SQ_TEX_SAMPLER2;
+
+	GLboolean border_fallback;
+
+
+};
+
+static INLINE radeonTexObj* radeon_tex_obj(struct gl_texture_object *texObj)
+{
+	return (radeonTexObj*)texObj;
+}
+
+/* Need refcounting on dma buffers:
+ */
+struct radeon_dma_buffer {
+	int refcount;		/* the number of retained regions in buf */
+	drmBufPtr buf;
+};
+
+struct radeon_aos {
+	struct radeon_bo *bo; /** Buffer object where vertex data is stored */
+	int offset; /** Offset into buffer object, in bytes */
+	int components; /** Number of components per vertex */
+	int stride; /** Stride in dwords (may be 0 for repeating) */
+	int count; /** Number of vertices */
+};
+
+struct radeon_dma {
+        /* Active dma region.  Allocations for vertices and retained
+         * regions come from here.  Also used for emitting random vertices,
+         * these may be flushed by calling flush_current();
+         */
+        struct radeon_bo *current; /** Buffer that DMA memory is allocated from */
+        int current_used; /** Number of bytes allocated and forgotten about */
+        int current_vertexptr; /** End of active vertex region */
+
+        /**
+         * If current_vertexptr != current_used then flush must be non-zero.
+         * flush must be called before non-active vertex allocations can be
+         * performed.
+         */
+        void (*flush) (GLcontext *);
+
+        /* Number of "in-flight" DMA buffers, i.e. the number of buffers
+         * for which a DISCARD command is currently queued in the command buffer
+.
+         */
+        GLuint nr_released_bufs;
+};
+
+/* radeon_swtcl.c
+ */
+struct radeon_swtcl_info {
+
+	GLuint RenderIndex;
+	GLuint vertex_size;
+	GLubyte *verts;
+
+	/* Fallback rasterization functions
+	 */
+	GLuint hw_primitive;
+	GLenum render_primitive;
+	GLuint numverts;
+
+	struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
+	GLuint vertex_attr_count;
+
+};
+
+#define RADEON_MAX_AOS_ARRAYS		16
+struct radeon_tcl_info {
+	struct radeon_aos aos[RADEON_MAX_AOS_ARRAYS];
+	GLuint aos_count;
+	struct radeon_bo *elt_dma_bo; /** Buffer object that contains element indices */
+	int elt_dma_offset; /** Offset into this buffer object, in bytes */
+};
+
+struct radeon_ioctl {
+	GLuint vertex_offset;
+        struct radeon_bo *bo;
+	GLuint vertex_size;
+};
+
+#define RADEON_MAX_PRIMS 64
+
+struct radeon_prim {
+	GLuint start;
+	GLuint end;
+	GLuint prim;
+};
+
+static INLINE GLuint radeonPackColor(GLuint cpp,
+                                     GLubyte r, GLubyte g,
+                                     GLubyte b, GLubyte a)
+{
+	switch (cpp) {
+	case 2:
+		return PACK_COLOR_565(r, g, b);
+	case 4:
+		return PACK_COLOR_8888(a, r, g, b);
+	default:
+		return 0;
+	}
+}
+
+#define MAX_CMD_BUF_SZ (16*1024)
+
+#define MAX_DMA_BUF_SZ (64*1024)
+
+struct radeon_store {
+	GLuint statenr;
+	GLuint primnr;
+	char cmd_buf[MAX_CMD_BUF_SZ];
+	int cmd_used;
+	int elts_start;
+};
+
+struct radeon_dri_mirror {
+	__DRIcontextPrivate *context;	/* DRI context */
+	__DRIscreenPrivate *screen;	/* DRI screen */
+
+	drm_context_t hwContext;
+	drm_hw_lock_t *hwLock;
+	int fd;
+	int drmMinor;
+};
+
+#define DEBUG_TEXTURE	0x001
+#define DEBUG_STATE	0x002
+#define DEBUG_IOCTL	0x004
+#define DEBUG_PRIMS	0x008
+#define DEBUG_VERTS	0x010
+#define DEBUG_FALLBACKS	0x020
+#define DEBUG_VFMT	0x040
+#define DEBUG_CODEGEN	0x080
+#define DEBUG_VERBOSE	0x100
+#define DEBUG_DRI       0x200
+#define DEBUG_DMA       0x400
+#define DEBUG_SANITY    0x800
+#define DEBUG_SYNC      0x1000
+#define DEBUG_PIXEL     0x2000
+#define DEBUG_MEMORY    0x4000
+
+
+typedef void (*radeon_tri_func) (radeonContextPtr,
+				 radeonVertex *,
+				 radeonVertex *, radeonVertex *);
+
+typedef void (*radeon_line_func) (radeonContextPtr,
+				  radeonVertex *, radeonVertex *);
+
+typedef void (*radeon_point_func) (radeonContextPtr, radeonVertex *);
+
+#define RADEON_MAX_BOS 32
+struct radeon_state {
+	struct radeon_colorbuffer_state color;
+	struct radeon_depthbuffer_state depth;
+	struct radeon_scissor_state scissor;
+	struct radeon_stencilbuffer_state stencil;
+
+	struct radeon_cs_space_check bos[RADEON_MAX_BOS];
+	int validated_bo_count;
+};
+
+/**
+ * This structure holds the command buffer while it is being constructed.
+ *
+ * The first batch of commands in the buffer is always the state that needs
+ * to be re-emitted when the context is lost. This batch can be skipped
+ * otherwise.
+ */
+struct radeon_cmdbuf {
+	struct radeon_cs_manager    *csm;
+	struct radeon_cs            *cs;
+	int size; /** # of dwords total */
+	unsigned int flushing:1; /** whether we're currently in FlushCmdBufLocked */
+};
+
+struct radeon_context {
+   GLcontext *glCtx;
+   radeonScreenPtr radeonScreen;	/* Screen private DRI data */
+
+   /* Texture object bookkeeping
+    */
+   int                   texture_depth;
+   float                 initialMaxAnisotropy;
+   uint32_t              texture_row_align;
+
+  struct radeon_dma dma;
+  struct radeon_hw_state hw;
+   /* Rasterization and vertex state:
+    */
+   GLuint TclFallback;
+   GLuint Fallback;
+   GLuint NewGLState;
+   DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+
+   /* Drawable, cliprect and scissor information */
+   GLuint numClipRects;	/* Cliprects for the draw buffer */
+   drm_clip_rect_t *pClipRects;
+   unsigned int lastStamp;
+   GLboolean lost_context;
+   drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+
+   /* Mirrors of some DRI state */
+   struct radeon_dri_mirror dri;
+
+   /* Busy waiting */
+   GLuint do_usleeps;
+   GLuint do_irqs;
+   GLuint irqsEmitted;
+   drm_radeon_irq_wait_t iw;
+
+   /* Derived state - for r300 only */
+   struct radeon_state state;
+
+   struct radeon_swtcl_info swtcl;
+   struct radeon_tcl_info tcl;
+   /* Configuration cache
+    */
+   driOptionCache optionCache;
+
+   struct radeon_cmdbuf cmdbuf;
+
+  drm_clip_rect_t fboRect;
+  GLboolean constant_cliprect; /* use for FBO or DRI2 rendering */
+  GLboolean front_cliprects;
+
+   /**
+    * Set if rendering has occured to the drawable's front buffer.
+    *
+    * This is used in the DRI2 case to detect that glFlush should also copy
+    * the contents of the fake front buffer to the real front buffer.
+    */
+   GLboolean front_buffer_dirty;
+
+   /**
+    * Track whether front-buffer rendering is currently enabled
+    *
+    * A separate flag is used to track this in order to support MRT more
+    * easily.
+    */
+   GLboolean is_front_buffer_rendering;
+
+   /**
+    * Track whether front-buffer is the current read target.
+    *
+    * This is closely associated with is_front_buffer_rendering, but may
+    * be set separately.  The DRI2 fake front buffer must be referenced
+    * either way.
+    */
+   GLboolean is_front_buffer_reading;
+
+   struct dri_metaops meta;
+
+   struct {
+	   void (*get_lock)(radeonContextPtr radeon);
+	   void (*update_viewport_offset)(GLcontext *ctx);
+	   void (*emit_cs_header)(struct radeon_cs *cs, radeonContextPtr rmesa);
+	   void (*swtcl_flush)(GLcontext *ctx, uint32_t offset);
+	   void (*pre_emit_atoms)(radeonContextPtr rmesa);
+	   void (*pre_emit_state)(radeonContextPtr rmesa);
+	   void (*fallback)(GLcontext *ctx, GLuint bit, GLboolean mode);
+	   void (*free_context)(GLcontext *ctx);
+   } vtbl;
+};
+
+#define RADEON_CONTEXT(glctx) ((radeonContextPtr)(ctx->DriverCtx))
+
+static inline __DRIdrawablePrivate* radeon_get_drawable(radeonContextPtr radeon)
+{
+	return radeon->dri.context->driDrawablePriv;
+}
+
+static inline __DRIdrawablePrivate* radeon_get_readable(radeonContextPtr radeon)
+{
+	return radeon->dri.context->driReadablePriv;
+}
+
+
+/**
+ * This function takes a float and packs it into a uint32_t
+ */
+static INLINE uint32_t radeonPackFloat32(float fl)
+{
+	union {
+		float fl;
+		uint32_t u;
+	} u;
+
+	u.fl = fl;
+	return u.u;
+}
+
+/* This is probably wrong for some values, I need to test this
+ * some more.  Range checking would be a good idea also..
+ *
+ * But it works for most things.  I'll fix it later if someone
+ * else with a better clue doesn't
+ */
+static INLINE uint32_t radeonPackFloat24(float f)
+{
+	float mantissa;
+	int exponent;
+	uint32_t float24 = 0;
+
+	if (f == 0.0)
+		return 0;
+
+	mantissa = frexpf(f, &exponent);
+
+	/* Handle -ve */
+	if (mantissa < 0) {
+		float24 |= (1 << 23);
+		mantissa = mantissa * -1.0;
+	}
+	/* Handle exponent, bias of 63 */
+	exponent += 62;
+	float24 |= (exponent << 16);
+	/* Kill 7 LSB of mantissa */
+	float24 |= (radeonPackFloat32(mantissa) & 0x7FFFFF) >> 7;
+
+	return float24;
+}
+
+GLboolean radeonInitContext(radeonContextPtr radeon,
+			    struct dd_function_table* functions,
+			    const __GLcontextModes * glVisual,
+			    __DRIcontextPrivate * driContextPriv,
+			    void *sharedContextPrivate);
+
+void radeonCleanupContext(radeonContextPtr radeon);
+GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
+void radeon_update_renderbuffers(__DRIcontext *context, __DRIdrawable *drawable);
+GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
+			    __DRIdrawablePrivate * driDrawPriv,
+			    __DRIdrawablePrivate * driReadPriv);
+extern void radeonDestroyContext(__DRIcontextPrivate * driContextPriv);
+
+/* ================================================================
+ * Debugging:
+ */
+#define DO_DEBUG		1
+
+#if DO_DEBUG
+extern int RADEON_DEBUG;
+#else
+#define RADEON_DEBUG		0
+#endif
+
+#ifndef HAVE_LIBDRM_RADEON
+#ifndef RADEON_DEBUG_BO
+#define RADEON_DEBUG_BO 1
+#endif
+#endif
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_compat.c b/src/mesa/drivers/dri/radeon/radeon_compat.c
deleted file mode 100644
index 46b490d61f7..00000000000
--- a/src/mesa/drivers/dri/radeon/radeon_compat.c
+++ /dev/null
@@ -1,301 +0,0 @@
-/**************************************************************************
-
-Copyright 2002 ATI Technologies Inc., Ontario, Canada, and
-               Tungsten Graphics Inc., Austin, Texas.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining a
-copy of this software and associated documentation files (the "Software"),
-to deal in the Software without restriction, including without limitation
-on the rights to use, copy, modify, merge, publish, distribute, sub
-license, and/or sell copies of the Software, and to permit persons to whom
-the Software is furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice (including the next
-paragraph) shall be included in all copies or substantial portions of the
-Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
-ATI, TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
-DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
-OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
-USE OR OTHER DEALINGS IN THE SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Keith Whitwell <[email protected]>
- *
- */
-
-#include "main/glheader.h"
-#include "main/imports.h"
-
-#include "radeon_context.h"
-#include "radeon_state.h"
-#include "radeon_ioctl.h"
-
-
-static struct { 
-	int start; 
-	int len; 
-	const char *name;
-} packet[RADEON_MAX_STATE_PACKETS] = {
-	{ RADEON_PP_MISC,7,"RADEON_PP_MISC" },
-	{ RADEON_PP_CNTL,3,"RADEON_PP_CNTL" },
-	{ RADEON_RB3D_COLORPITCH,1,"RADEON_RB3D_COLORPITCH" },
-	{ RADEON_RE_LINE_PATTERN,2,"RADEON_RE_LINE_PATTERN" },
-	{ RADEON_SE_LINE_WIDTH,1,"RADEON_SE_LINE_WIDTH" },
-	{ RADEON_PP_LUM_MATRIX,1,"RADEON_PP_LUM_MATRIX" },
-	{ RADEON_PP_ROT_MATRIX_0,2,"RADEON_PP_ROT_MATRIX_0" },
-	{ RADEON_RB3D_STENCILREFMASK,3,"RADEON_RB3D_STENCILREFMASK" },
-	{ RADEON_SE_VPORT_XSCALE,6,"RADEON_SE_VPORT_XSCALE" },
-	{ RADEON_SE_CNTL,2,"RADEON_SE_CNTL" },
-	{ RADEON_SE_CNTL_STATUS,1,"RADEON_SE_CNTL_STATUS" },
-	{ RADEON_RE_MISC,1,"RADEON_RE_MISC" },
-	{ RADEON_PP_TXFILTER_0,6,"RADEON_PP_TXFILTER_0" },
-	{ RADEON_PP_BORDER_COLOR_0,1,"RADEON_PP_BORDER_COLOR_0" },
-	{ RADEON_PP_TXFILTER_1,6,"RADEON_PP_TXFILTER_1" },
-	{ RADEON_PP_BORDER_COLOR_1,1,"RADEON_PP_BORDER_COLOR_1" },
-	{ RADEON_PP_TXFILTER_2,6,"RADEON_PP_TXFILTER_2" },
-	{ RADEON_PP_BORDER_COLOR_2,1,"RADEON_PP_BORDER_COLOR_2" },
-	{ RADEON_SE_ZBIAS_FACTOR,2,"RADEON_SE_ZBIAS_FACTOR" },
-	{ RADEON_SE_TCL_OUTPUT_VTX_FMT,11,"RADEON_SE_TCL_OUTPUT_VTX_FMT" },
-	{ RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED,17,"RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED" },
-};
-
-
-static void radeonCompatEmitPacket( radeonContextPtr rmesa, 
-				    struct radeon_state_atom *state )
-{
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-   drm_radeon_context_regs_t *ctx = &sarea->context_state;
-   drm_radeon_texture_regs_t *tex0 = &sarea->tex_state[0];
-   drm_radeon_texture_regs_t *tex1 = &sarea->tex_state[1];
-   int i;
-   int *buf = state->cmd;
-
-   for ( i = 0 ; i < state->cmd_size ; ) {
-      drm_radeon_cmd_header_t *header = (drm_radeon_cmd_header_t *)&buf[i++];
-
-      if (RADEON_DEBUG & DEBUG_STATE)
-	 fprintf(stderr, "%s %d: %s\n", __FUNCTION__, header->packet.packet_id,
-		 packet[(int)header->packet.packet_id].name);
-
-      switch (header->packet.packet_id) {
-      case RADEON_EMIT_PP_MISC:
-	 ctx->pp_misc = buf[i++]; 
-	 ctx->pp_fog_color = buf[i++];
-	 ctx->re_solid_color = buf[i++];
-	 ctx->rb3d_blendcntl = buf[i++];
-	 ctx->rb3d_depthoffset = buf[i++];
-	 ctx->rb3d_depthpitch = buf[i++];
-	 ctx->rb3d_zstencilcntl = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
-	 break;
-      case RADEON_EMIT_PP_CNTL:
-	 ctx->pp_cntl = buf[i++];
-	 ctx->rb3d_cntl = buf[i++];
-	 ctx->rb3d_coloroffset = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
-	 break;
-      case RADEON_EMIT_RB3D_COLORPITCH:
-	 ctx->rb3d_colorpitch = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT;
-	 break;
-      case RADEON_EMIT_RE_LINE_PATTERN:
-	 ctx->re_line_pattern = buf[i++];
-	 ctx->re_line_state = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_LINE;
-	 break;
-      case RADEON_EMIT_SE_LINE_WIDTH:
-	 ctx->se_line_width = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_LINE;
-	 break;
-      case RADEON_EMIT_PP_LUM_MATRIX:
-	 ctx->pp_lum_matrix = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
-	 break;
-      case RADEON_EMIT_PP_ROT_MATRIX_0:
-	 ctx->pp_rot_matrix_0 = buf[i++];
-	 ctx->pp_rot_matrix_1 = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_BUMPMAP;
-	 break;
-      case RADEON_EMIT_RB3D_STENCILREFMASK:
-	 ctx->rb3d_stencilrefmask = buf[i++];
-	 ctx->rb3d_ropcntl = buf[i++];
-	 ctx->rb3d_planemask = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_MASKS;
-	 break;
-      case RADEON_EMIT_SE_VPORT_XSCALE:
-	 ctx->se_vport_xscale = buf[i++];
-	 ctx->se_vport_xoffset = buf[i++];
-	 ctx->se_vport_yscale = buf[i++];
-	 ctx->se_vport_yoffset = buf[i++];
-	 ctx->se_vport_zscale = buf[i++];
-	 ctx->se_vport_zoffset = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_VIEWPORT;
-	 break;
-      case RADEON_EMIT_SE_CNTL:
-	 ctx->se_cntl = buf[i++];
-	 ctx->se_coord_fmt = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_CONTEXT | RADEON_UPLOAD_VERTFMT;
-	 break;
-      case RADEON_EMIT_SE_CNTL_STATUS:
-	 ctx->se_cntl_status = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_SETUP;
-	 break;
-      case RADEON_EMIT_RE_MISC:
-	 ctx->re_misc = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_MISC;
-	 break;
-      case RADEON_EMIT_PP_TXFILTER_0:
-	 tex0->pp_txfilter = buf[i++];
-	 tex0->pp_txformat = buf[i++];
-	 tex0->pp_txoffset = buf[i++];
-	 tex0->pp_txcblend = buf[i++];
-	 tex0->pp_txablend = buf[i++];
-	 tex0->pp_tfactor = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX0;
-	 break;
-      case RADEON_EMIT_PP_BORDER_COLOR_0:
-	 tex0->pp_border_color = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX0;
-	 break;
-      case RADEON_EMIT_PP_TXFILTER_1:
-	 tex1->pp_txfilter = buf[i++];
-	 tex1->pp_txformat = buf[i++];
-	 tex1->pp_txoffset = buf[i++];
-	 tex1->pp_txcblend = buf[i++];
-	 tex1->pp_txablend = buf[i++];
-	 tex1->pp_tfactor = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX1;
-	 break;
-      case RADEON_EMIT_PP_BORDER_COLOR_1:
-	 tex1->pp_border_color = buf[i++];
-	 sarea->dirty |= RADEON_UPLOAD_TEX1;
-	 break;
-
-      case RADEON_EMIT_SE_ZBIAS_FACTOR:
-	 i++;
-	 i++;
-	 break;
-
-      case RADEON_EMIT_PP_TXFILTER_2:
-      case RADEON_EMIT_PP_BORDER_COLOR_2:
-      case RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT:
-      case RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED:
-      default:
-	 /* These states aren't understood by radeon drm 1.1 */
-	 fprintf(stderr, "Tried to emit unsupported state\n");
-	 return;
-      }
-   }
-}
-
-
-
-static void radeonCompatEmitStateLocked( radeonContextPtr rmesa )
-{
-   struct radeon_state_atom *atom;
-
-   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   if (!rmesa->hw.is_dirty && !rmesa->hw.all_dirty)
-      return;
-
-   foreach(atom, &rmesa->hw.atomlist) {
-      if (rmesa->hw.all_dirty)
-	 atom->dirty = GL_TRUE;
-      if (atom->is_tcl)
-	 atom->dirty = GL_FALSE;
-      if (atom->dirty)
-	 radeonCompatEmitPacket(rmesa, atom);
-   }
- 
-   rmesa->hw.is_dirty = GL_FALSE;
-   rmesa->hw.all_dirty = GL_FALSE;
-}
-
-
-static void radeonCompatEmitPrimitiveLocked( radeonContextPtr rmesa,
-					     GLuint hw_primitive,
-					     GLuint nverts,
-					     drm_clip_rect_t *pbox,
-					     GLuint nbox )
-{
-   int i;
-
-   for ( i = 0 ; i < nbox ; ) {
-      int nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, nbox );
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      drm_radeon_vertex_t vtx;
-      
-      rmesa->sarea->dirty |= RADEON_UPLOAD_CLIPRECTS;
-      rmesa->sarea->nbox = nr - i;
-
-      for ( ; i < nr ; i++) 
-	 *b++ = pbox[i];
-      
-      if (RADEON_DEBUG & DEBUG_IOCTL)
-	 fprintf(stderr, 
-		 "RadeonFlushVertexBuffer: prim %x buf %d verts %d "
-		 "disc %d nbox %d\n",
-		 hw_primitive, 
-		 rmesa->dma.current.buf->buf->idx, 
-		 nverts, 
-		 nr == nbox,
-		 rmesa->sarea->nbox );
-
-      vtx.prim = hw_primitive;
-      vtx.idx = rmesa->dma.current.buf->buf->idx;
-      vtx.count = nverts;
-      vtx.discard = (nr == nbox);      
-
-      drmCommandWrite( rmesa->dri.fd, 
-		       DRM_RADEON_VERTEX,
-		       &vtx, sizeof(vtx));
-   }
-}
-
-
-
-/* No 'start' for 1.1 vertices ioctl: only one vertex prim/buffer!  
- */
-void radeonCompatEmitPrimitive( radeonContextPtr rmesa,
-				GLuint vertex_format,
-				GLuint hw_primitive,
-				GLuint nrverts )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   LOCK_HARDWARE( rmesa );
-
-   radeonCompatEmitStateLocked( rmesa );
-   rmesa->sarea->vc_format = vertex_format;
-   
-   if (rmesa->state.scissor.enabled) {
-      radeonCompatEmitPrimitiveLocked( rmesa, 
-				       hw_primitive,
-				       nrverts,
-				       rmesa->state.scissor.pClipRects,
-				       rmesa->state.scissor.numClipRects );
-   }
-   else {
-      radeonCompatEmitPrimitiveLocked( rmesa, 
-				       hw_primitive,
-				       nrverts,
-				       rmesa->pClipRects,
-				       rmesa->numClipRects );
-   }
-
-
-   UNLOCK_HARDWARE( rmesa );
-}
-
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.c b/src/mesa/drivers/dri/radeon/radeon_context.c
index b67bda7d06a..c457fb654ea 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.c
+++ b/src/mesa/drivers/dri/radeon/radeon_context.c
@@ -53,6 +53,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "drivers/common/driverfuncs.h"
 
+#include "radeon_common.h"
 #include "radeon_context.h"
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
@@ -65,6 +66,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define need_GL_EXT_blend_minmax
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_secondary_color
+#define need_GL_EXT_framebuffer_object
 #include "extension_helper.h"
 
 #define DRIVER_DATE	"20061018"
@@ -72,40 +74,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "vblank.h"
 #include "utils.h"
 #include "xmlpool.h" /* for symbolic values of enum-type options */
-#ifndef RADEON_DEBUG
-int RADEON_DEBUG = (0);
-#endif
-
-
-/* Return various strings for glGetString().
- */
-static const GLubyte *radeonGetString( GLcontext *ctx, GLenum name )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   static char buffer[128];
-   unsigned   offset;
-   GLuint agp_mode = (rmesa->radeonScreen->card_type==RADEON_CARD_PCI) ? 0 :
-      rmesa->radeonScreen->AGPMode;
-
-   switch ( name ) {
-   case GL_VENDOR:
-      return (GLubyte *)"Tungsten Graphics, Inc.";
-
-   case GL_RENDERER:
-      offset = driGetRendererString( buffer, "Radeon", DRIVER_DATE,
-				     agp_mode );
-
-      sprintf( & buffer[ offset ], " %sTCL",
-	       !(rmesa->TclFallback & RADEON_TCL_FALLBACK_TCL_DISABLE)
-	       ? "" : "NO-" );
-
-      return (GLubyte *)buffer;
-
-   default:
-      return NULL;
-   }
-}
-
 
 /* Extension strings exported by the R100 driver.
  */
@@ -121,6 +89,7 @@ const struct dri_extension card_extensions[] =
     { "GL_EXT_blend_logic_op",             NULL },
     { "GL_EXT_blend_subtract",             GL_EXT_blend_minmax_functions },
     { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+    { "GL_EXT_packed_depth_stencil",	   NULL},
     { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
     { "GL_EXT_stencil_wrap",               NULL },
     { "GL_EXT_texture_edge_clamp",         NULL },
@@ -137,6 +106,11 @@ const struct dri_extension card_extensions[] =
     { NULL,                                NULL }
 };
 
+const struct dri_extension mm_extensions[] = {
+  { "GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions },
+  { NULL, NULL }
+};
+
 extern const struct tnl_pipeline_stage _radeon_render_stage;
 extern const struct tnl_pipeline_stage _radeon_tcl_stage;
 
@@ -160,15 +134,6 @@ static const struct tnl_pipeline_stage *radeon_pipeline[] = {
    NULL,
 };
 
-
-
-/* Initialize the driver's misc functions.
- */
-static void radeonInitDriverFuncs( struct dd_function_table *functions )
-{
-    functions->GetString	= radeonGetString;
-}
-
 static const struct dri_debug_control debug_control[] =
 {
     { "fall",  DEBUG_FALLBACKS },
@@ -188,19 +153,69 @@ static const struct dri_debug_control debug_control[] =
     { NULL,    0 }
 };
 
+static void r100_get_lock(radeonContextPtr radeon)
+{
+   r100ContextPtr rmesa = (r100ContextPtr)radeon;
+   drm_radeon_sarea_t *sarea = radeon->sarea;
+
+   RADEON_STATECHANGE(rmesa, ctx);
+   if (rmesa->radeon.sarea->tiling_enabled) {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
+	 RADEON_COLOR_TILE_ENABLE;
+   } else {
+      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &=
+	 ~RADEON_COLOR_TILE_ENABLE;
+   }
+   
+   if (sarea->ctx_owner != rmesa->radeon.dri.hwContext) {
+      sarea->ctx_owner = rmesa->radeon.dri.hwContext;
+      
+      if (!radeon->radeonScreen->kernel_mm)
+         radeon_bo_legacy_texture_age(radeon->radeonScreen->bom);
+   }
+}
+
+static void r100_vtbl_emit_cs_header(struct radeon_cs *cs, radeonContextPtr rmesa)
+{
+}
+
+static void r100_vtbl_pre_emit_state(radeonContextPtr radeon)
+{
+   r100ContextPtr rmesa = (r100ContextPtr)radeon;
+   
+   /* r100 always needs to emit ZBS to avoid TCL lockups */
+   rmesa->hw.zbs.dirty = 1;
+   radeon->hw.is_dirty = 1;
+}
+
+static void r100_vtbl_free_context(GLcontext *ctx)
+{
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   _mesa_vector4f_free( &rmesa->tcl.ObjClean );
+}
+
+static void r100_init_vtbl(radeonContextPtr radeon)
+{
+   radeon->vtbl.get_lock = r100_get_lock;
+   radeon->vtbl.update_viewport_offset = radeonUpdateViewportOffset;
+   radeon->vtbl.emit_cs_header = r100_vtbl_emit_cs_header;
+   radeon->vtbl.swtcl_flush = r100_swtcl_flush;
+   radeon->vtbl.pre_emit_state = r100_vtbl_pre_emit_state;
+   radeon->vtbl.fallback = radeonFallback;
+}
 
 /* Create the device specific context.
  */
 GLboolean
-radeonCreateContext( const __GLcontextModes *glVisual,
+r100CreateContext( const __GLcontextModes *glVisual,
                      __DRIcontextPrivate *driContextPriv,
                      void *sharedContextPrivate)
 {
    __DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
    radeonScreenPtr screen = (radeonScreenPtr)(sPriv->private);
    struct dd_function_table functions;
-   radeonContextPtr rmesa;
-   GLcontext *ctx, *shareCtx;
+   r100ContextPtr rmesa;
+   GLcontext *ctx;
    int i;
    int tcl_mode, fthrottle_mode;
 
@@ -209,10 +224,12 @@ radeonCreateContext( const __GLcontextModes *glVisual,
    assert(screen);
 
    /* Allocate the Radeon context */
-   rmesa = (radeonContextPtr) CALLOC( sizeof(*rmesa) );
+   rmesa = (r100ContextPtr) CALLOC( sizeof(*rmesa) );
    if ( !rmesa )
       return GL_FALSE;
 
+   r100_init_vtbl(&rmesa->radeon);
+
    /* init exp fog table data */
    radeonInitStaticFogData();
    
@@ -220,12 +237,12 @@ radeonCreateContext( const __GLcontextModes *glVisual,
     * Do this here so that initialMaxAnisotropy is set before we create
     * the default textures.
     */
-   driParseConfigFiles (&rmesa->optionCache, &screen->optionCache,
+   driParseConfigFiles (&rmesa->radeon.optionCache, &screen->optionCache,
 			screen->driScreen->myNum, "radeon");
-   rmesa->initialMaxAnisotropy = driQueryOptionf(&rmesa->optionCache,
+   rmesa->radeon.initialMaxAnisotropy = driQueryOptionf(&rmesa->radeon.optionCache,
                                                  "def_max_anisotropy");
 
-   if ( driQueryOptionb( &rmesa->optionCache, "hyperz" ) ) {
+   if ( driQueryOptionb( &rmesa->radeon.optionCache, "hyperz" ) ) {
       if ( sPriv->drm_version.minor < 13 )
 	 fprintf( stderr, "DRM version 1.%d too old to support HyperZ, "
 			  "disabling.\n", sPriv->drm_version.minor );
@@ -240,65 +257,17 @@ radeonCreateContext( const __GLcontextModes *glVisual,
     * (the texture functions are especially important)
     */
    _mesa_init_driver_functions( &functions );
-   radeonInitDriverFuncs( &functions );
    radeonInitTextureFuncs( &functions );
 
-   /* Allocate the Mesa context */
-   if (sharedContextPrivate)
-      shareCtx = ((radeonContextPtr) sharedContextPrivate)->glCtx;
-   else
-      shareCtx = NULL;
-   rmesa->glCtx = _mesa_create_context(glVisual, shareCtx,
-                                       &functions, (void *) rmesa);
-   if (!rmesa->glCtx) {
-      FREE(rmesa);
-      return GL_FALSE;
-   }
-   driContextPriv->driverPrivate = rmesa;
-
-   /* Init radeon context data */
-   rmesa->dri.context = driContextPriv;
-   rmesa->dri.screen = sPriv;
-   rmesa->dri.drawable = NULL;
-   rmesa->dri.readable = NULL;
-   rmesa->dri.hwContext = driContextPriv->hHWContext;
-   rmesa->dri.hwLock = &sPriv->pSAREA->lock;
-   rmesa->dri.fd = sPriv->fd;
-   rmesa->dri.drmMinor = sPriv->drm_version.minor;
-
-   rmesa->radeonScreen = screen;
-   rmesa->sarea = (drm_radeon_sarea_t *)((GLubyte *)sPriv->pSAREA +
-				       screen->sarea_priv_offset);
-
-
-   rmesa->dma.buf0_address = rmesa->radeonScreen->buffers->list[0].address;
-
-   (void) memset( rmesa->texture_heaps, 0, sizeof( rmesa->texture_heaps ) );
-   make_empty_list( & rmesa->swapped );
-
-   rmesa->nr_heaps = screen->numTexHeaps;
-   for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-      rmesa->texture_heaps[i] = driCreateTextureHeap( i, rmesa,
-	    screen->texSize[i],
-	    12,
-	    RADEON_NR_TEX_REGIONS,
-	    (drmTextureRegionPtr)rmesa->sarea->tex_list[i],
-	    & rmesa->sarea->tex_age[i],
-	    & rmesa->swapped,
-	    sizeof( radeonTexObj ),
-	    (destroy_texture_object_t *) radeonDestroyTexObj );
-
-      driSetTextureSwapCounterLocation( rmesa->texture_heaps[i],
-					& rmesa->c_textureSwaps );
+   if (!radeonInitContext(&rmesa->radeon, &functions,
+			  glVisual, driContextPriv,
+			  sharedContextPrivate)) {
+     FREE(rmesa);
+     return GL_FALSE;
    }
-   rmesa->texture_depth = driQueryOptioni (&rmesa->optionCache,
-					   "texture_depth");
-   if (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FB)
-      rmesa->texture_depth = ( screen->cpp == 4 ) ?
-	 DRI_CONF_TEXTURE_DEPTH_32 : DRI_CONF_TEXTURE_DEPTH_16;
 
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->hw.all_dirty = GL_TRUE;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.hw.all_dirty = GL_TRUE;
 
    /* Set the maximum texture size small enough that we can guarentee that
     * all texture units can bind a maximal texture and have all of them in
@@ -306,26 +275,20 @@ radeonCreateContext( const __GLcontextModes *glVisual,
     * setting allow larger textures.
     */
 
-   ctx = rmesa->glCtx;
-   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->optionCache,
+   ctx = rmesa->radeon.glCtx;
+   ctx->Const.MaxTextureUnits = driQueryOptioni (&rmesa->radeon.optionCache,
 						 "texture_units");
    ctx->Const.MaxTextureImageUnits = ctx->Const.MaxTextureUnits;
    ctx->Const.MaxTextureCoordUnits = ctx->Const.MaxTextureUnits;
 
-   i = driQueryOptioni( &rmesa->optionCache, "allow_large_textures");
-
-   driCalculateMaxTextureLevels( rmesa->texture_heaps,
-				 rmesa->nr_heaps,
-				 & ctx->Const,
-				 4,
-				 11, /* max 2D texture size is 2048x2048 */
-				 8,  /* 256^3 */
-				 9,  /* \todo: max cube texture size seems to be 512x512(x6) */
-				 11, /* max rect texture size is 2048x2048. */
-				 12,
-				 GL_FALSE,
-				 i );
+   i = driQueryOptioni( &rmesa->radeon.optionCache, "allow_large_textures");
 
+   /* FIXME: When no memory manager is available we should set this 
+    * to some reasonable value based on texture memory pool size */
+   ctx->Const.MaxTextureLevels = 12;
+   ctx->Const.Max3DTextureLevels = 9;
+   ctx->Const.MaxCubeTextureLevels = 12;
+   ctx->Const.MaxTextureRectSize = 2048;
 
    ctx->Const.MaxTextureMaxAnisotropy = 16.0;
 
@@ -390,38 +353,39 @@ radeonCreateContext( const __GLcontextModes *glVisual,
    }
 
    driInitExtensions( ctx, card_extensions, GL_TRUE );
-   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     driInitExtensions(ctx, mm_extensions, GL_FALSE);
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR100)
       _mesa_enable_extension( ctx, "GL_ARB_texture_cube_map" );
-   if (rmesa->glCtx->Mesa_DXTn) {
+   if (rmesa->radeon.glCtx->Mesa_DXTn) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
       _mesa_enable_extension( ctx, "GL_S3_s3tc" );
    }
-   else if (driQueryOptionb (&rmesa->optionCache, "force_s3tc_enable")) {
+   else if (driQueryOptionb (&rmesa->radeon.optionCache, "force_s3tc_enable")) {
       _mesa_enable_extension( ctx, "GL_EXT_texture_compression_s3tc" );
    }
 
-   if (rmesa->dri.drmMinor >= 9)
+   if (rmesa->radeon.radeonScreen->kernel_mm || rmesa->radeon.dri.drmMinor >= 9)
       _mesa_enable_extension( ctx, "GL_NV_texture_rectangle");
 
    /* XXX these should really go right after _mesa_init_driver_functions() */
+   radeon_fbo_init(&rmesa->radeon);
+   radeonInitSpanFuncs( ctx );
    radeonInitIoctlFuncs( ctx );
    radeonInitStateFuncs( ctx );
-   radeonInitSpanFuncs( ctx );
    radeonInitState( rmesa );
    radeonInitSwtcl( ctx );
 
    _mesa_vector4f_alloc( &rmesa->tcl.ObjClean, 0, 
 			 ctx->Const.MaxArrayLockSize, 32 );
 
-   fthrottle_mode = driQueryOptioni(&rmesa->optionCache, "fthrottle_mode");
-   rmesa->iw.irq_seq = -1;
-   rmesa->irqsEmitted = 0;
-   rmesa->do_irqs = (rmesa->radeonScreen->irq != 0 &&
-		     fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
-
-   rmesa->do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
+   fthrottle_mode = driQueryOptioni(&rmesa->radeon.optionCache, "fthrottle_mode");
+   rmesa->radeon.iw.irq_seq = -1;
+   rmesa->radeon.irqsEmitted = 0;
+   rmesa->radeon.do_irqs = (rmesa->radeon.radeonScreen->irq != 0 &&
+			    fthrottle_mode == DRI_CONF_FTHROTTLE_IRQS);
 
-   (*sPriv->systemTime->getUST)( & rmesa->swap_ust );
+   rmesa->radeon.do_usleeps = (fthrottle_mode == DRI_CONF_FTHROTTLE_USLEEPS);
 
 
 #if DO_DEBUG
@@ -429,206 +393,21 @@ radeonCreateContext( const __GLcontextModes *glVisual,
 				       debug_control );
 #endif
 
-   tcl_mode = driQueryOptioni(&rmesa->optionCache, "tcl_mode");
-   if (driQueryOptionb(&rmesa->optionCache, "no_rast")) {
+   tcl_mode = driQueryOptioni(&rmesa->radeon.optionCache, "tcl_mode");
+   if (driQueryOptionb(&rmesa->radeon.optionCache, "no_rast")) {
       fprintf(stderr, "disabling 3D acceleration\n");
       FALLBACK(rmesa, RADEON_FALLBACK_DISABLE, 1);
    } else if (tcl_mode == DRI_CONF_TCL_SW ||
-	      !(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
-      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
-	 rmesa->radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	      !(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+	 rmesa->radeon.radeonScreen->chip_flags &= ~RADEON_CHIPSET_TCL;
 	 fprintf(stderr, "Disabling HW TCL support\n");
       }
-      TCL_FALLBACK(rmesa->glCtx, RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
+      TCL_FALLBACK(rmesa->radeon.glCtx, RADEON_TCL_FALLBACK_TCL_DISABLE, 1);
    }
 
-   if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+   if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
 /*       _tnl_need_dlist_norm_lengths( ctx, GL_FALSE ); */
    }
    return GL_TRUE;
 }
-
-
-/* Destroy the device specific context.
- */
-/* Destroy the Mesa and driver specific context data.
- */
-void radeonDestroyContext( __DRIcontextPrivate *driContextPriv )
-{
-   GET_CURRENT_CONTEXT(ctx);
-   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
-   radeonContextPtr current = ctx ? RADEON_CONTEXT(ctx) : NULL;
-
-   /* check if we're deleting the currently bound context */
-   if (rmesa == current) {
-      RADEON_FIREVERTICES( rmesa );
-      _mesa_make_current(NULL, NULL, NULL);
-   }
-
-   /* Free radeon context resources */
-   assert(rmesa); /* should never be null */
-   if ( rmesa ) {
-      GLboolean   release_texture_heaps;
-
-
-      release_texture_heaps = (rmesa->glCtx->Shared->RefCount == 1);
-      _swsetup_DestroyContext( rmesa->glCtx );
-      _tnl_DestroyContext( rmesa->glCtx );
-      _vbo_DestroyContext( rmesa->glCtx );
-      _swrast_DestroyContext( rmesa->glCtx );
-
-      radeonDestroySwtcl( rmesa->glCtx );
-      radeonReleaseArrays( rmesa->glCtx, ~0 );
-      if (rmesa->dma.current.buf) {
-	 radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-	 radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-      }
-
-      _mesa_vector4f_free( &rmesa->tcl.ObjClean );
-
-      if (rmesa->state.scissor.pClipRects) {
-	 FREE(rmesa->state.scissor.pClipRects);
-	 rmesa->state.scissor.pClipRects = NULL;
-      }
-
-      if ( release_texture_heaps ) {
-         /* This share group is about to go away, free our private
-          * texture object data.
-          */
-         int i;
-
-         for ( i = 0 ; i < rmesa->nr_heaps ; i++ ) {
-	    driDestroyTextureHeap( rmesa->texture_heaps[ i ] );
-	    rmesa->texture_heaps[ i ] = NULL;
-         }
-
-	 assert( is_empty_list( & rmesa->swapped ) );
-      }
-
-      /* free the Mesa context */
-      rmesa->glCtx->DriverCtx = NULL;
-      _mesa_destroy_context( rmesa->glCtx );
-
-      /* free the option cache */
-      driDestroyOptionCache (&rmesa->optionCache);
-
-      FREE( rmesa );
-   }
-}
-
-
-
-
-void
-radeonSwapBuffers( __DRIdrawablePrivate *dPriv )
-{
-
-   if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-      radeonContextPtr rmesa;
-      GLcontext *ctx;
-      rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-      ctx = rmesa->glCtx;
-      if (ctx->Visual.doubleBufferMode) {
-         _mesa_notifySwapBuffers( ctx );  /* flush pending rendering comands */
-
-         if ( rmesa->doPageFlip ) {
-            radeonPageFlip( dPriv );
-         }
-         else {
-	     radeonCopyBuffer( dPriv, NULL );
-         }
-      }
-   }
-   else {
-      /* XXX this shouldn't be an error but we can't handle it for now */
-      _mesa_problem(NULL, "%s: drawable has no context!", __FUNCTION__);
-   }
-}
-
-void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-			 int x, int y, int w, int h )
-{
-    if (dPriv->driContextPriv && dPriv->driContextPriv->driverPrivate) {
-	radeonContextPtr radeon;
-	GLcontext *ctx;
-
-	radeon = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-	ctx = radeon->glCtx;
-
-	if (ctx->Visual.doubleBufferMode) {
-	    drm_clip_rect_t rect;
-	    rect.x1 = x + dPriv->x;
-	    rect.y1 = (dPriv->h - y - h) + dPriv->y;
-	    rect.x2 = rect.x1 + w;
-	    rect.y2 = rect.y1 + h;
-	    _mesa_notifySwapBuffers(ctx);	/* flush pending rendering comands */
-	    radeonCopyBuffer(dPriv, &rect);
-	}
-    } else {
-	/* XXX this shouldn't be an error but we can't handle it for now */
-	_mesa_problem(NULL, "%s: drawable has no context!",
-		      __FUNCTION__);
-    }
-}
-
-/* Make context `c' the current context and bind it to the given
- * drawing and reading surfaces.
- */
-GLboolean
-radeonMakeCurrent( __DRIcontextPrivate *driContextPriv,
-                   __DRIdrawablePrivate *driDrawPriv,
-                   __DRIdrawablePrivate *driReadPriv )
-{
-   if ( driContextPriv ) {
-      radeonContextPtr newCtx = 
-	 (radeonContextPtr) driContextPriv->driverPrivate;
-
-      if (RADEON_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) newCtx->glCtx);
-
-      newCtx->dri.readable = driReadPriv;
-
-      if ( (newCtx->dri.drawable != driDrawPriv) ||
-           newCtx->lastStamp != driDrawPriv->lastStamp ) {
-	 if (driDrawPriv->swap_interval == (unsigned)-1) {
-	    driDrawPriv->vblFlags = (newCtx->radeonScreen->irq != 0)
-	       ? driGetDefaultVBlankFlags(&newCtx->optionCache)
-	       : VBLANK_FLAG_NO_IRQ;
-
-	    driDrawableInitVBlank( driDrawPriv );
-	 }
-
-	 newCtx->dri.drawable = driDrawPriv;
-
-	 radeonSetCliprects(newCtx);
-	 radeonUpdateViewportOffset( newCtx->glCtx );
-      }
-
-      _mesa_make_current( newCtx->glCtx,
-			  (GLframebuffer *) driDrawPriv->driverPrivate,
-			  (GLframebuffer *) driReadPriv->driverPrivate );
-
-      _mesa_update_state( newCtx->glCtx );
-   } else {
-      if (RADEON_DEBUG & DEBUG_DRI)
-	 fprintf(stderr, "%s ctx is null\n", __FUNCTION__);
-      _mesa_make_current( NULL, NULL, NULL );
-   }
-
-   if (RADEON_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "End %s\n", __FUNCTION__);
-   return GL_TRUE;
-}
-
-/* Force the context `c' to be unbound from its buffer.
- */
-GLboolean
-radeonUnbindContext( __DRIcontextPrivate *driContextPriv )
-{
-   radeonContextPtr rmesa = (radeonContextPtr) driContextPriv->driverPrivate;
-
-   if (RADEON_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s ctx %p\n", __FUNCTION__, (void *) rmesa->glCtx);
-
-   return GL_TRUE;
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_context.h b/src/mesa/drivers/dri/radeon/radeon_context.h
index 53df766f8cb..1795d8bdb6d 100644
--- a/src/mesa/drivers/dri/radeon/radeon_context.h
+++ b/src/mesa/drivers/dri/radeon/radeon_context.h
@@ -48,91 +48,23 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "drm.h"
 #include "radeon_drm.h"
 #include "texmem.h"
-
 #include "main/macros.h"
 #include "main/mtypes.h"
 #include "main/colormac.h"
-
-struct radeon_context;
-typedef struct radeon_context radeonContextRec;
-typedef struct radeon_context *radeonContextPtr;
-
-/* This union is used to avoid warnings/miscompilation
-   with float to uint32_t casts due to strict-aliasing */
-typedef union {
-	GLfloat f;
-	uint32_t ui32;
-} float_ui32_type;
-
-#include "radeon_lock.h"
 #include "radeon_screen.h"
-#include "main/mm.h"
-
-#include "math/m_vector.h"
-
-#define TEX_0   0x1
-#define TEX_1   0x2
-#define TEX_2   0x4
-#define TEX_ALL 0x7
-
-/* Rasterizing fallbacks */
-/* See correponding strings in r200_swtcl.c */
-#define RADEON_FALLBACK_TEXTURE		0x0001
-#define RADEON_FALLBACK_DRAW_BUFFER	0x0002
-#define RADEON_FALLBACK_STENCIL		0x0004
-#define RADEON_FALLBACK_RENDER_MODE	0x0008
-#define RADEON_FALLBACK_BLEND_EQ	0x0010
-#define RADEON_FALLBACK_BLEND_FUNC	0x0020
-#define RADEON_FALLBACK_DISABLE 	0x0040
-#define RADEON_FALLBACK_BORDER_MODE	0x0080
-
-/* The blit width for texture uploads
- */
-#define BLIT_WIDTH_BYTES 1024
 
-/* Use the templated vertex format:
- */
-#define COLOR_IS_RGBA
-#define TAG(x) radeon##x
-#include "tnl_dd/t_dd_vertex.h"
-#undef TAG
-
-typedef void (*radeon_tri_func) (radeonContextPtr,
-				 radeonVertex *,
-				 radeonVertex *, radeonVertex *);
-
-typedef void (*radeon_line_func) (radeonContextPtr,
-				  radeonVertex *, radeonVertex *);
+#include "radeon_common.h"
 
-typedef void (*radeon_point_func) (radeonContextPtr, radeonVertex *);
-
-struct radeon_colorbuffer_state {
-	GLuint clear;
-	int roundEnable;
-};
 
-struct radeon_depthbuffer_state {
-	GLuint clear;
-	GLfloat scale;
-};
+struct r100_context;
+typedef struct r100_context r100ContextRec;
+typedef struct r100_context *r100ContextPtr;
 
-struct radeon_scissor_state {
-	drm_clip_rect_t rect;
-	GLboolean enabled;
+#include "radeon_lock.h"
 
-	GLuint numClipRects;	/* Cliprects active */
-	GLuint numAllocedClipRects;	/* Cliprects available */
-	drm_clip_rect_t *pClipRects;
-};
 
-struct radeon_stencilbuffer_state {
-	GLboolean hwBuffer;
-	GLuint clear;		/* rb3d_stencilrefmask value */
-};
 
-struct radeon_stipple_state {
-	GLuint mask[32];
-};
+#define R100_TEX_ALL 0x7
 
 /* used for both tcl_vtx and vc_frmt tex bits (they are identical) */
 #define RADEON_ST_BIT(unit) \
@@ -141,42 +73,6 @@ struct radeon_stipple_state {
 #define RADEON_Q_BIT(unit) \
 (unit == 0 ? RADEON_CP_VC_FRMT_Q0 : (RADEON_CP_VC_FRMT_Q1 >> 2) << (2 * unit))
 
-typedef struct radeon_tex_obj radeonTexObj, *radeonTexObjPtr;
-
-/* Texture object in locally shared texture space.
- */
-struct radeon_tex_obj {
-	driTextureObject base;
-
-	GLuint bufAddr;		/* Offset to start of locally
-				   shared texture block */
-
-	GLuint dirty_state;	/* Flags (1 per texunit) for
-				   whether or not this texobj
-				   has dirty hardware state
-				   (pp_*) that needs to be
-				   brought into the
-				   texunit. */
-
-	drm_radeon_tex_image_t image[6][RADEON_MAX_TEXTURE_LEVELS];
-	/* Six, for the cube faces */
-
-	GLboolean image_override; /* Image overridden by GLX_EXT_tfp */
-
-	GLuint pp_txfilter;	/* hardware register values */
-	GLuint pp_txformat;
-	GLuint pp_txoffset;	/* Image location in texmem.
-				   All cube faces follow. */
-	GLuint pp_txsize;	/* npot only */
-	GLuint pp_txpitch;	/* npot only */
-	GLuint pp_border_color;
-	GLuint pp_cubic_faces;	/* cube face 1,2,3,4 log2 sizes */
-
-	GLboolean border_fallback;
-
-	GLuint tile_bits;	/* hw texture tile bits used on this texture */
-};
-
 struct radeon_texture_env_state {
 	radeonTexObjPtr texobj;
 	GLenum format;
@@ -187,17 +83,6 @@ struct radeon_texture_state {
 	struct radeon_texture_env_state unit[RADEON_MAX_TEXTURE_UNITS];
 };
 
-struct radeon_state_atom {
-	struct radeon_state_atom *next, *prev;
-	const char *name;	/* for debug */
-	int cmd_size;		/* size in bytes */
-	GLuint is_tcl;
-	int *cmd;		/* one or more cmd's */
-	int *lastcmd;		/* one or more cmd's */
-	GLboolean dirty;	/* dirty-mark in emit_state_list */
-	 GLboolean(*check) (GLcontext *);	/* is this state active? */
-};
-
 /* Trying to keep these relatively short as the variables are becoming
  * extravagently long.  Drop the driver name prefix off the front of
  * everything - I think we know which driver we're in by now, and keep the
@@ -410,10 +295,7 @@ struct radeon_state_atom {
 #define SHN_SHININESS      1
 #define SHN_STATE_SIZE     2
 
-struct radeon_hw_state {
-	/* Head of the linked list of state atoms. */
-	struct radeon_state_atom atomlist;
-
+struct r100_hw_state {
 	/* Hardware state, stored as cmdbuf commands:  
 	 *   -- Need to doublebuffer for
 	 *           - eliding noop statechange loops? (except line stipple count)
@@ -438,89 +320,19 @@ struct radeon_hw_state {
 	struct radeon_state_atom glt;
 	struct radeon_state_atom txr[3];	/* for NPOT */
 
-	int max_state_size;	/* Number of bytes necessary for a full state emit. */
-	GLboolean is_dirty, all_dirty;
 };
 
-struct radeon_state {
-	/* Derived state for internal purposes:
-	 */
-	struct radeon_colorbuffer_state color;
-	struct radeon_depthbuffer_state depth;
-	struct radeon_scissor_state scissor;
-	struct radeon_stencilbuffer_state stencil;
+
+struct r100_state {
 	struct radeon_stipple_state stipple;
 	struct radeon_texture_state texture;
 };
 
-/* Need refcounting on dma buffers:
- */
-struct radeon_dma_buffer {
-	int refcount;		/* the number of retained regions in buf */
-	drmBufPtr buf;
-};
-
-#define GET_START(rvb) (rmesa->radeonScreen->gart_buffer_offset +			\
-			(rvb)->address - rmesa->dma.buf0_address +	\
-			(rvb)->start)
-
-/* A retained region, eg vertices for indexed vertices.
- */
-struct radeon_dma_region {
-	struct radeon_dma_buffer *buf;
-	char *address;		/* == buf->address */
-	int start, end, ptr;	/* offsets from start of buf */
-	int aos_start;
-	int aos_stride;
-	int aos_size;
-};
-
-struct radeon_dma {
-	/* Active dma region.  Allocations for vertices and retained
-	 * regions come from here.  Also used for emitting random vertices,
-	 * these may be flushed by calling flush_current();
-	 */
-	struct radeon_dma_region current;
-
-	void (*flush) (radeonContextPtr);
-
-	char *buf0_address;	/* start of buf[0], for index calcs */
-	GLuint nr_released_bufs;	/* flush after so many buffers released */
-};
-
-struct radeon_dri_mirror {
-	__DRIcontextPrivate *context;	/* DRI context */
-	__DRIscreenPrivate *screen;	/* DRI screen */
-
-   /**
-    * DRI drawable bound to this context for drawing.
-    */
-	__DRIdrawablePrivate *drawable;
-
-   /**
-    * DRI drawable bound to this context for reading.
-    */
-	__DRIdrawablePrivate *readable;
-
-	drm_context_t hwContext;
-	drm_hw_lock_t *hwLock;
-	int fd;
-	int drmMinor;
-};
-
 #define RADEON_CMD_BUF_SZ  (8*1024)
-
-struct radeon_store {
-	GLuint statenr;
-	GLuint primnr;
-	char cmd_buf[RADEON_CMD_BUF_SZ];
-	int cmd_used;
-	int elts_start;
-};
-
+#define R200_ELT_BUF_SZ  (8*1024)
 /* radeon_tcl.c
  */
-struct radeon_tcl_info {
+struct r100_tcl_info {
 	GLuint vertex_format;
 	GLuint hw_primitive;
 
@@ -529,30 +341,18 @@ struct radeon_tcl_info {
 	 */
 	GLvector4f ObjClean;
 
-	struct radeon_dma_region *aos_components[8];
-	GLuint nr_aos_components;
-
 	GLuint *Elts;
 
-	struct radeon_dma_region indexed_verts;
-	struct radeon_dma_region obj;
-	struct radeon_dma_region rgba;
-	struct radeon_dma_region spec;
-	struct radeon_dma_region fog;
-	struct radeon_dma_region tex[RADEON_MAX_TEXTURE_UNITS];
-	struct radeon_dma_region norm;
+        int elt_cmd_offset;
+	int elt_cmd_start;
+        int elt_used;
 };
 
 /* radeon_swtcl.c
  */
-struct radeon_swtcl_info {
-	GLuint RenderIndex;
-	GLuint vertex_size;
+struct r100_swtcl_info {
 	GLuint vertex_format;
 
-	struct tnl_attr_map vertex_attrs[VERT_ATTRIB_MAX];
-	GLuint vertex_attr_count;
-
 	GLubyte *verts;
 
 	/* Fallback rasterization functions
@@ -561,10 +361,6 @@ struct radeon_swtcl_info {
 	radeon_line_func draw_line;
 	radeon_tri_func draw_tri;
 
-	GLuint hw_primitive;
-	GLenum render_primitive;
-	GLuint numverts;
-
    /**
     * Offset of the 4UB color data within a hardware (swtcl) vertex.
     */
@@ -576,22 +372,9 @@ struct radeon_swtcl_info {
 	GLuint specoffset;
 
 	GLboolean needproj;
-
-	struct radeon_dma_region indexed_verts;
 };
 
-struct radeon_ioctl {
-	GLuint vertex_offset;
-	GLuint vertex_size;
-};
-
-#define RADEON_MAX_PRIMS 64
 
-struct radeon_prim {
-	GLuint start;
-	GLuint end;
-	GLuint prim;
-};
 
 /* A maximum total of 20 elements per vertex:  3 floats for position, 3
  * floats for normal, 4 floats for color, 4 bytes for secondary color,
@@ -602,59 +385,18 @@ struct radeon_prim {
  */
 #define RADEON_MAX_VERTEX_SIZE 20
 
-struct radeon_context {
-	GLcontext *glCtx;	/* Mesa context */
+struct r100_context {
+        struct radeon_context radeon;
 
 	/* Driver and hardware state management
 	 */
-	struct radeon_hw_state hw;
-	struct radeon_state state;
-
-	/* Texture object bookkeeping
-	 */
-	unsigned nr_heaps;
-	driTexHeap *texture_heaps[RADEON_NR_TEX_HEAPS];
-	driTextureObject swapped;
-	int texture_depth;
-	float initialMaxAnisotropy;
-
-	/* Rasterization and vertex state:
-	 */
-	GLuint TclFallback;
-	GLuint Fallback;
-	GLuint NewGLState;
-	 DECLARE_RENDERINPUTS(tnl_index_bitset);	/* index of bits for last tnl_install_attrs */
+	struct r100_hw_state hw;
+	struct r100_state state;
 
 	/* Vertex buffers
 	 */
 	struct radeon_ioctl ioctl;
-	struct radeon_dma dma;
 	struct radeon_store store;
-	/* A full state emit as of the first state emit in the main store, in case
-	 * the context is lost.
-	 */
-	struct radeon_store backup_store;
-
-	/* Page flipping
-	 */
-	GLuint doPageFlip;
-
-	/* Busy waiting
-	 */
-	GLuint do_usleeps;
-	GLuint do_irqs;
-	GLuint irqsEmitted;
-	drm_radeon_irq_wait_t iw;
-
-	/* Drawable, cliprect and scissor information
-	 */
-	GLuint numClipRects;	/* Cliprects for the draw buffer */
-	drm_clip_rect_t *pClipRects;
-	unsigned int lastStamp;
-	GLboolean lost_context;
-	GLboolean save_on_next_emit;
-	radeonScreenPtr radeonScreen;	/* Screen private DRI data */
-	drm_radeon_sarea_t *sarea;	/* Private SAREA data */
 
 	/* TCL stuff
 	 */
@@ -667,29 +409,13 @@ struct radeon_context {
 	GLmatrix tmpmat[RADEON_MAX_TEXTURE_UNITS];
 	GLuint last_ReallyEnabled;
 
-	/* VBI
-	 */
-	int64_t swap_ust;
-	int64_t swap_missed_ust;
-
-	GLuint swap_count;
-	GLuint swap_missed_count;
-
 	/* radeon_tcl.c
 	 */
-	struct radeon_tcl_info tcl;
+	struct r100_tcl_info tcl;
 
 	/* radeon_swtcl.c
 	 */
-	struct radeon_swtcl_info swtcl;
-
-	/* Mirrors of some DRI state
-	 */
-	struct radeon_dri_mirror dri;
-
-	/* Configuration cache
-	 */
-	driOptionCache optionCache;
+	struct r100_swtcl_info swtcl;
 
 	GLboolean using_hyperz;
 	GLboolean texmicrotile;
@@ -703,61 +429,19 @@ struct radeon_context {
 	GLuint c_textureSwaps;
 	GLuint c_textureBytes;
 	GLuint c_vertexBuffers;
+
 };
 
-#define RADEON_CONTEXT(ctx)		((radeonContextPtr)(ctx->DriverCtx))
-
-static INLINE GLuint radeonPackColor(GLuint cpp,
-                                     GLubyte r, GLubyte g,
-                                     GLubyte b, GLubyte a)
-{
-	switch (cpp) {
-	case 2:
-		return PACK_COLOR_565(r, g, b);
-	case 4:
-		return PACK_COLOR_8888(a, r, g, b);
-	default:
-		return 0;
-	}
-}
+
+#define R100_CONTEXT(ctx)		((r100ContextPtr)(ctx->DriverCtx))
+
 
 #define RADEON_OLD_PACKETS 1
 
-extern void radeonDestroyContext(__DRIcontextPrivate * driContextPriv);
-extern GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
-				     __DRIcontextPrivate * driContextPriv,
-				     void *sharedContextPrivate);
-extern void radeonSwapBuffers(__DRIdrawablePrivate * dPriv);
-extern void radeonCopySubBuffer(__DRIdrawablePrivate * dPriv,
-				int x, int y, int w, int h);
-extern GLboolean radeonMakeCurrent(__DRIcontextPrivate * driContextPriv,
-				   __DRIdrawablePrivate * driDrawPriv,
-				   __DRIdrawablePrivate * driReadPriv);
-extern GLboolean radeonUnbindContext(__DRIcontextPrivate * driContextPriv);
-
-/* ================================================================
- * Debugging:
- */
-#define DO_DEBUG		1
-
-#if DO_DEBUG
-extern int RADEON_DEBUG;
-#else
-#define RADEON_DEBUG		0
-#endif
-
-#define DEBUG_TEXTURE	0x0001
-#define DEBUG_STATE	0x0002
-#define DEBUG_IOCTL	0x0004
-#define DEBUG_PRIMS	0x0008
-#define DEBUG_VERTS	0x0010
-#define DEBUG_FALLBACKS	0x0020
-#define DEBUG_VFMT	0x0040
-#define DEBUG_CODEGEN	0x0080
-#define DEBUG_VERBOSE	0x0100
-#define DEBUG_DRI       0x0200
-#define DEBUG_DMA       0x0400
-#define DEBUG_SANITY    0x0800
-#define DEBUG_SYNC      0x1000
+extern GLboolean r100CreateContext( const __GLcontextModes *glVisual,
+				    __DRIcontextPrivate *driContextPriv,
+				    void *sharedContextPrivate);
+  
+
 
 #endif				/* __RADEON_CONTEXT_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_drm.h b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
new file mode 100644
index 00000000000..ee403d173cc
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_drm.h
@@ -0,0 +1,237 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <[email protected]>
+ *      Nicolai Haehnle <[email protected]>
+ *      Jérôme Glisse <[email protected]>
+ */
+#ifndef RADEON_CS_H
+#define RADEON_CS_H
+
+#include <stdint.h>
+#include <string.h>
+#include "drm.h"
+#include "radeon_drm.h"
+
+struct radeon_cs_reloc {
+    struct radeon_bo    *bo;
+    uint32_t            read_domain;
+    uint32_t            write_domain;
+    uint32_t            flags;
+};
+
+
+#define RADEON_CS_SPACE_OK 0
+#define RADEON_CS_SPACE_OP_TO_BIG 1
+#define RADEON_CS_SPACE_FLUSH 2
+
+struct radeon_cs_space_check {
+    struct radeon_bo *bo;
+    uint32_t read_domains;
+    uint32_t write_domain;
+    uint32_t new_accounted;
+};
+
+#define MAX_SPACE_BOS (32)
+
+struct radeon_cs_manager;
+
+struct radeon_cs {
+    struct radeon_cs_manager    *csm;
+    void                        *relocs;
+    uint32_t                    *packets;
+    unsigned                    crelocs;
+    unsigned                    relocs_total_size;
+    unsigned                    cdw;
+    unsigned                    ndw;
+    int                         section;
+    unsigned                    section_ndw;
+    unsigned                    section_cdw;
+    const char                  *section_file;
+    const char                  *section_func;
+    int                         section_line;
+    struct radeon_cs_space_check bos[MAX_SPACE_BOS];
+    int                         bo_count;
+    void                        (*space_flush_fn)(void *);
+    void                        *space_flush_data;
+};
+
+/* cs functions */
+struct radeon_cs_funcs {
+    struct radeon_cs *(*cs_create)(struct radeon_cs_manager *csm,
+                                   uint32_t ndw);
+    int (*cs_write_reloc)(struct radeon_cs *cs,
+                          struct radeon_bo *bo,
+                          uint32_t read_domain,
+                          uint32_t write_domain,
+                          uint32_t flags);
+    int (*cs_begin)(struct radeon_cs *cs,
+                    uint32_t ndw,
+                    const char *file,
+                    const char *func,
+                    int line);
+    int (*cs_end)(struct radeon_cs *cs,
+                  const char *file,
+                  const char *func,
+                  int line);
+    int (*cs_emit)(struct radeon_cs *cs);
+    int (*cs_destroy)(struct radeon_cs *cs);
+    int (*cs_erase)(struct radeon_cs *cs);
+    int (*cs_need_flush)(struct radeon_cs *cs);
+    void (*cs_print)(struct radeon_cs *cs, FILE *file);
+};
+
+struct radeon_cs_manager {
+    struct radeon_cs_funcs  *funcs;
+    int                     fd;
+    int32_t vram_limit, gart_limit;
+    int32_t vram_write_used, gart_write_used;
+    int32_t read_used;
+};
+
+static inline struct radeon_cs *radeon_cs_create(struct radeon_cs_manager *csm,
+                                                 uint32_t ndw)
+{
+    return csm->funcs->cs_create(csm, ndw);
+}
+
+static inline int radeon_cs_write_reloc(struct radeon_cs *cs,
+                                        struct radeon_bo *bo,
+                                        uint32_t read_domain,
+                                        uint32_t write_domain,
+                                        uint32_t flags)
+{
+    return cs->csm->funcs->cs_write_reloc(cs,
+                                          bo,
+                                          read_domain,
+                                          write_domain,
+                                          flags);
+}
+
+static inline int radeon_cs_begin(struct radeon_cs *cs,
+                                  uint32_t ndw,
+                                  const char *file,
+                                  const char *func,
+                                  int line)
+{
+    return cs->csm->funcs->cs_begin(cs, ndw, file, func, line);
+}
+
+static inline int radeon_cs_end(struct radeon_cs *cs,
+                                const char *file,
+                                const char *func,
+                                int line)
+{
+    return cs->csm->funcs->cs_end(cs, file, func, line);
+}
+
+static inline int radeon_cs_emit(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_emit(cs);
+}
+
+static inline int radeon_cs_destroy(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_destroy(cs);
+}
+
+static inline int radeon_cs_erase(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_erase(cs);
+}
+
+static inline int radeon_cs_need_flush(struct radeon_cs *cs)
+{
+    return cs->csm->funcs->cs_need_flush(cs);
+}
+
+static inline void radeon_cs_print(struct radeon_cs *cs, FILE *file)
+{
+    cs->csm->funcs->cs_print(cs, file);
+}
+
+static inline void radeon_cs_set_limit(struct radeon_cs *cs, uint32_t domain, uint32_t limit)
+{
+    
+    if (domain == RADEON_GEM_DOMAIN_VRAM)
+	cs->csm->vram_limit = limit;
+    else
+	cs->csm->gart_limit = limit;
+}
+
+static inline void radeon_cs_write_dword(struct radeon_cs *cs, uint32_t dword)
+{
+    cs->packets[cs->cdw++] = dword;
+    if (cs->section) {
+        cs->section_cdw++;
+    }
+}
+
+static inline void radeon_cs_write_qword(struct radeon_cs *cs, uint64_t qword)
+{
+
+    memcpy(cs->packets + cs->cdw, &qword, sizeof(qword));
+    cs->cdw+=2;
+    if (cs->section) {
+        cs->section_cdw+=2;
+    }
+}
+
+static inline void radeon_cs_space_set_flush(struct radeon_cs *cs, void (*fn)(void *), void *data)
+{
+    cs->space_flush_fn = fn;
+    cs->space_flush_data = data;
+}
+
+
+/*
+ * add a persistent BO to the list
+ * a persistent BO is one that will be referenced across flushes,
+ * i.e. colorbuffer, textures etc.
+ * They get reset when a new "operation" happens, where an operation
+ * is a state emission with a color/textures etc followed by a bunch of vertices.
+ */
+void radeon_cs_space_add_persistent_bo(struct radeon_cs *cs,
+				       struct radeon_bo *bo,
+				       uint32_t read_domains,
+				       uint32_t write_domain);
+
+/* reset the persistent BO list */
+void radeon_cs_space_reset_bos(struct radeon_cs *cs);
+
+/* do a space check with the current persistent BO list */
+int radeon_cs_space_check(struct radeon_cs *cs);
+
+/* do a space check with the current persistent BO list and a temporary BO
+ * a temporary BO is like a DMA buffer, which  gets flushed with the
+ * command buffer */
+int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
+				  struct radeon_bo *bo,
+				  uint32_t read_domains,
+				  uint32_t write_domain);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
new file mode 100644
index 00000000000..4f1065ebcf3
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.c
@@ -0,0 +1,408 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <[email protected]>
+ *      Nicolai Haehnle <[email protected]>
+ *      Jérôme Glisse <[email protected]>
+ */
+#include <errno.h>
+
+#include "radeon_bocs_wrapper.h"
+
+struct cs_manager_legacy {
+    struct radeon_cs_manager    base;
+    struct radeon_context       *ctx;
+    /* hack for scratch stuff */
+    uint32_t                    pending_age;
+    uint32_t                    pending_count;
+
+
+};
+
+struct cs_reloc_legacy {
+    struct radeon_cs_reloc  base;
+    uint32_t                cindices;
+    uint32_t                *indices;
+};
+
+
+static struct radeon_cs *cs_create(struct radeon_cs_manager *csm,
+                                   uint32_t ndw)
+{
+    struct radeon_cs *cs;
+
+    cs = (struct radeon_cs*)calloc(1, sizeof(struct radeon_cs));
+    if (cs == NULL) {
+        return NULL;
+    }
+    cs->csm = csm;
+    cs->ndw = (ndw + 0x3FF) & (~0x3FF);
+    cs->packets = (uint32_t*)malloc(4*cs->ndw);
+    if (cs->packets == NULL) {
+        free(cs);
+        return NULL;
+    }
+    cs->relocs_total_size = 0;
+    return cs;
+}
+
+static int cs_write_reloc(struct radeon_cs *cs,
+                          struct radeon_bo *bo,
+                          uint32_t read_domain,
+                          uint32_t write_domain,
+                          uint32_t flags)
+{
+    struct cs_reloc_legacy *relocs;
+    int i;
+
+    relocs = (struct cs_reloc_legacy *)cs->relocs;
+    /* check domains */
+    if ((read_domain && write_domain) || (!read_domain && !write_domain)) {
+        /* in one CS a bo can only be in read or write domain but not
+         * in read & write domain at the same sime
+         */
+        return -EINVAL;
+    }
+    if (read_domain == RADEON_GEM_DOMAIN_CPU) {
+        return -EINVAL;
+    }
+    if (write_domain == RADEON_GEM_DOMAIN_CPU) {
+        return -EINVAL;
+    }
+    /* check if bo is already referenced */
+    for(i = 0; i < cs->crelocs; i++) {
+        uint32_t *indices;
+
+        if (relocs[i].base.bo->handle == bo->handle) {
+            /* Check domains must be in read or write. As we check already
+             * checked that in argument one of the read or write domain was
+             * set we only need to check that if previous reloc as the read
+             * domain set then the read_domain should also be set for this
+             * new relocation.
+             */
+            if (relocs[i].base.read_domain && !read_domain) {
+                return -EINVAL;
+            }
+            if (relocs[i].base.write_domain && !write_domain) {
+                return -EINVAL;
+            }
+            relocs[i].base.read_domain |= read_domain;
+            relocs[i].base.write_domain |= write_domain;
+            /* save indice */
+            relocs[i].cindices++;
+            indices = (uint32_t*)realloc(relocs[i].indices,
+                                         relocs[i].cindices * 4);
+            if (indices == NULL) {
+                relocs[i].cindices -= 1;
+                return -ENOMEM;
+            }
+            relocs[i].indices = indices;
+            relocs[i].indices[relocs[i].cindices - 1] = cs->cdw - 1;
+            return 0;
+        }
+    }
+    /* add bo to reloc */
+    relocs = (struct cs_reloc_legacy*)
+             realloc(cs->relocs,
+                     sizeof(struct cs_reloc_legacy) * (cs->crelocs + 1));
+    if (relocs == NULL) {
+        return -ENOMEM;
+    }
+    cs->relocs = relocs;
+    relocs[cs->crelocs].base.bo = bo;
+    relocs[cs->crelocs].base.read_domain = read_domain;
+    relocs[cs->crelocs].base.write_domain = write_domain;
+    relocs[cs->crelocs].base.flags = flags;
+    relocs[cs->crelocs].indices = (uint32_t*)malloc(4);
+    if (relocs[cs->crelocs].indices == NULL) {
+        return -ENOMEM;
+    }
+    relocs[cs->crelocs].indices[0] = cs->cdw - 1;
+    relocs[cs->crelocs].cindices = 1;
+    cs->relocs_total_size += radeon_bo_legacy_relocs_size(bo);
+    cs->crelocs++;
+    radeon_bo_ref(bo);
+    return 0;
+}
+
+static int cs_begin(struct radeon_cs *cs,
+                    uint32_t ndw,
+                    const char *file,
+                    const char *func,
+                    int line)
+{
+    if (cs->section) {
+        fprintf(stderr, "CS already in a section(%s,%s,%d)\n",
+                cs->section_file, cs->section_func, cs->section_line);
+        fprintf(stderr, "CS can't start section(%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    cs->section = 1;
+    cs->section_ndw = ndw;
+    cs->section_cdw = 0;
+    cs->section_file = file;
+    cs->section_func = func;
+    cs->section_line = line;
+
+
+    if (cs->cdw + ndw > cs->ndw) {
+        uint32_t tmp, *ptr;
+	int num = (ndw > 0x3FF) ? ndw : 0x3FF;
+
+        tmp = (cs->cdw + 1 + num) & (~num);
+        ptr = (uint32_t*)realloc(cs->packets, 4 * tmp);
+        if (ptr == NULL) {
+            return -ENOMEM;
+        }
+        cs->packets = ptr;
+        cs->ndw = tmp;
+    }
+
+    return 0;
+}
+
+static int cs_end(struct radeon_cs *cs,
+                  const char *file,
+                  const char *func,
+                  int line)
+
+{
+    if (!cs->section) {
+        fprintf(stderr, "CS no section to end at (%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    cs->section = 0;
+    if (cs->section_ndw != cs->section_cdw) {
+        fprintf(stderr, "CS section size missmatch start at (%s,%s,%d) %d vs %d\n",
+                cs->section_file, cs->section_func, cs->section_line, cs->section_ndw, cs->section_cdw);
+        fprintf(stderr, "CS section end at (%s,%s,%d)\n",
+                file, func, line);
+        return -EPIPE;
+    }
+    return 0;
+}
+
+static int cs_process_relocs(struct radeon_cs *cs)
+{
+    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
+    struct cs_reloc_legacy *relocs;
+    int i, j, r;
+
+    csm = (struct cs_manager_legacy*)cs->csm;
+    relocs = (struct cs_reloc_legacy *)cs->relocs;
+restart:
+    for (i = 0; i < cs->crelocs; i++) 
+    {
+        for (j = 0; j < relocs[i].cindices; j++) 
+        {
+            uint32_t soffset, eoffset;
+
+            r = radeon_bo_legacy_validate(relocs[i].base.bo,
+                                           &soffset, &eoffset);
+	        if (r == -EAGAIN)
+            {
+	             goto restart;
+            }
+            if (r) 
+            {
+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                        relocs[i].base.bo, soffset, eoffset);
+                return r;
+            }
+            cs->packets[relocs[i].indices[j]] += soffset;
+            if (cs->packets[relocs[i].indices[j]] >= eoffset) 
+            {
+	      /*                radeon_bo_debug(relocs[i].base.bo, 12); */
+                fprintf(stderr, "validated %p [0x%08X, 0x%08X]\n",
+                        relocs[i].base.bo, soffset, eoffset);
+                fprintf(stderr, "above end: %p 0x%08X 0x%08X\n",
+                        relocs[i].base.bo,
+                        cs->packets[relocs[i].indices[j]],
+                        eoffset);
+                exit(0);
+                return -EINVAL;
+            }
+        }
+    }
+    return 0;
+}
+
+static int cs_set_age(struct radeon_cs *cs)
+{
+    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
+    struct cs_reloc_legacy *relocs;
+    int i;
+
+    relocs = (struct cs_reloc_legacy *)cs->relocs;
+    for (i = 0; i < cs->crelocs; i++) {
+        radeon_bo_legacy_pending(relocs[i].base.bo, csm->pending_age);
+        radeon_bo_unref(relocs[i].base.bo);
+    }
+    return 0;
+}
+
+static int cs_emit(struct radeon_cs *cs)
+{
+    struct cs_manager_legacy *csm = (struct cs_manager_legacy*)cs->csm;
+    drm_radeon_cmd_buffer_t cmd;
+    drm_r300_cmd_header_t age;
+    uint64_t ull;
+    int r;
+
+    csm->ctx->vtbl.emit_cs_header(cs, csm->ctx);
+
+    /* append buffer age */
+    if ( IS_R300_CLASS(csm->ctx->radeonScreen) )
+    { 
+      age.scratch.cmd_type = R300_CMD_SCRATCH;
+      /* Scratch register 2 corresponds to what radeonGetAge polls */
+      csm->pending_age = 0;
+      csm->pending_count = 1;
+      ull = (uint64_t) (intptr_t) &csm->pending_age;
+      age.scratch.reg = 2;
+      age.scratch.n_bufs = 1;
+      age.scratch.flags = 0;
+      radeon_cs_write_dword(cs, age.u);
+      radeon_cs_write_qword(cs, ull);
+      radeon_cs_write_dword(cs, 0);
+    }
+
+    r = cs_process_relocs(cs);
+    if (r) {
+        return 0;
+    }
+
+    cmd.buf = (char *)cs->packets;
+    cmd.bufsz = cs->cdw * 4;
+    if (csm->ctx->state.scissor.enabled) {
+        cmd.nbox = csm->ctx->state.scissor.numClipRects;
+        cmd.boxes = (drm_clip_rect_t *) csm->ctx->state.scissor.pClipRects;
+    } else {
+        cmd.nbox = csm->ctx->numClipRects;
+        cmd.boxes = (drm_clip_rect_t *) csm->ctx->pClipRects;
+    }
+
+    //dump_cmdbuf(cs);
+
+    r = drmCommandWrite(cs->csm->fd, DRM_RADEON_CMDBUF, &cmd, sizeof(cmd));
+    if (r) {
+        return r;
+    }
+    if ((!IS_R300_CLASS(csm->ctx->radeonScreen)) &&
+        (!IS_R600_CLASS(csm->ctx->radeonScreen))) { /* +r6/r7 : No irq for r6/r7 yet. */
+	drm_radeon_irq_emit_t emit_cmd;
+	emit_cmd.irq_seq = &csm->pending_age;
+	r = drmCommandWrite(cs->csm->fd, DRM_RADEON_IRQ_EMIT, &emit_cmd, sizeof(emit_cmd));
+	if (r) {
+		return r;
+	}
+    }
+    cs_set_age(cs);
+
+    cs->csm->read_used = 0;
+    cs->csm->vram_write_used = 0;
+    cs->csm->gart_write_used = 0;
+    return 0;
+}
+
+static void inline cs_free_reloc(void *relocs_p, int crelocs)
+{
+    struct cs_reloc_legacy *relocs = relocs_p;
+    int i;
+    if (!relocs_p)
+      return;
+    for (i = 0; i < crelocs; i++)
+      free(relocs[i].indices);
+}
+
+static int cs_destroy(struct radeon_cs *cs)
+{
+    cs_free_reloc(cs->relocs, cs->crelocs);
+    free(cs->relocs);
+    free(cs->packets);
+    free(cs);
+    return 0;
+}
+
+static int cs_erase(struct radeon_cs *cs)
+{
+    cs_free_reloc(cs->relocs, cs->crelocs);
+    free(cs->relocs);
+    cs->relocs_total_size = 0;
+    cs->relocs = NULL;
+    cs->crelocs = 0;
+    cs->cdw = 0;
+    cs->section = 0;
+    return 0;
+}
+
+static int cs_need_flush(struct radeon_cs *cs)
+{
+    /* this function used to flush when the BO usage got to
+     * a certain size, now the higher levels handle this better */
+    return 0;
+}
+
+static void cs_print(struct radeon_cs *cs, FILE *file)
+{
+}
+
+static struct radeon_cs_funcs  radeon_cs_legacy_funcs = {
+    cs_create,
+    cs_write_reloc,
+    cs_begin,
+    cs_end,
+    cs_emit,
+    cs_destroy,
+    cs_erase,
+    cs_need_flush,
+    cs_print,
+};
+
+struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx)
+{
+    struct cs_manager_legacy *csm;
+
+    csm = (struct cs_manager_legacy*)
+          calloc(1, sizeof(struct cs_manager_legacy));
+    if (csm == NULL) {
+        return NULL;
+    }
+    csm->base.funcs = &radeon_cs_legacy_funcs;
+    csm->base.fd = ctx->dri.fd;
+    csm->ctx = ctx;
+    csm->pending_age = 1;
+    return (struct radeon_cs_manager*)csm;
+}
+
+void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm)
+{
+    free(csm);
+}
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
new file mode 100644
index 00000000000..e177b4bafe2
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_legacy.h
@@ -0,0 +1,40 @@
+/* 
+ * Copyright © 2008 Nicolai Haehnle
+ * Copyright © 2008 Jérôme Glisse
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ * Authors:
+ *      Aapo Tahkola <[email protected]>
+ *      Nicolai Haehnle <[email protected]>
+ *      Jérôme Glisse <[email protected]>
+ */
+#ifndef RADEON_CS_LEGACY_H
+#define RADEON_CS_LEGACY_H
+
+#include "radeon_common.h"
+
+struct radeon_cs_manager *radeon_cs_manager_legacy_ctor(struct radeon_context *ctx);
+void radeon_cs_manager_legacy_dtor(struct radeon_cs_manager *csm);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c b/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c
new file mode 100644
index 00000000000..5a8df7bb8cd
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_cs_space_drm.c
@@ -0,0 +1,234 @@
+/* 
+ * Copyright © 2009 Red Hat Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+ * OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NON-INFRINGEMENT. IN NO EVENT SHALL THE COPYRIGHT HOLDERS, AUTHORS
+ * AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ */
+/*
+ */
+#include <assert.h>
+#include <errno.h>
+#include <stdlib.h>
+#include "radeon_bocs_wrapper.h"
+
+struct rad_sizes {
+    int32_t op_read;
+    int32_t op_gart_write;
+    int32_t op_vram_write;
+};
+
+static inline int radeon_cs_setup_bo(struct radeon_cs_space_check *sc, struct rad_sizes *sizes)
+{
+    uint32_t read_domains, write_domain;
+    struct radeon_bo *bo;
+
+    bo = sc->bo;
+    sc->new_accounted = 0;
+    read_domains = sc->read_domains;
+    write_domain = sc->write_domain;
+
+    /* legacy needs a static check */
+    if (radeon_bo_is_static(bo)) {
+	bo->space_accounted = sc->new_accounted = (read_domains << 16) | write_domain;
+	return 0;
+    }
+
+    /* already accounted this bo */
+    if (write_domain && (write_domain == bo->space_accounted)) {
+	sc->new_accounted = bo->space_accounted;
+	return 0;
+    }
+    if (read_domains && ((read_domains << 16) == bo->space_accounted)) {
+	sc->new_accounted = bo->space_accounted;
+	return 0;
+    }
+
+    if (bo->space_accounted == 0) {
+	if (write_domain == RADEON_GEM_DOMAIN_VRAM)
+	    sizes->op_vram_write += bo->size;
+	else if (write_domain == RADEON_GEM_DOMAIN_GTT)
+	  sizes->op_gart_write += bo->size;
+	else
+	    sizes->op_read += bo->size;
+	sc->new_accounted = (read_domains << 16) | write_domain;
+    } else {
+	uint16_t old_read, old_write;
+	
+	old_read = bo->space_accounted >> 16;
+	old_write = bo->space_accounted & 0xffff;
+	
+	if (write_domain && (old_read & write_domain)) {
+	    sc->new_accounted = write_domain;
+	    /* moving from read to a write domain */
+	    if (write_domain == RADEON_GEM_DOMAIN_VRAM) {
+		sizes->op_read -= bo->size;
+		sizes->op_vram_write += bo->size;
+	    } else if (write_domain == RADEON_GEM_DOMAIN_VRAM) {
+		sizes->op_read -= bo->size;
+		sizes->op_gart_write += bo->size;
+	    }
+	} else if (read_domains & old_write) {
+	    sc->new_accounted = bo->space_accounted & 0xffff;
+	} else {
+	    /* rewrite the domains */
+	    if (write_domain != old_write)
+		fprintf(stderr,"WRITE DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, write_domain, old_write);
+	    if (read_domains != old_read)
+		fprintf(stderr,"READ DOMAIN RELOC FAILURE 0x%x %d %d\n", bo->handle, read_domains, old_read);
+	    return RADEON_CS_SPACE_FLUSH;
+	}
+    }
+    return 0;
+}
+
+static int radeon_cs_do_space_check(struct radeon_cs *cs, struct radeon_cs_space_check *new_tmp)
+{
+    struct radeon_cs_manager *csm = cs->csm;
+    int i;
+    struct radeon_bo *bo;
+    struct rad_sizes sizes;
+    int ret;
+
+    /* check the totals for this operation */
+
+    if (cs->bo_count == 0 && !new_tmp)
+	return 0;
+
+    memset(&sizes, 0, sizeof(struct rad_sizes));
+
+    /* prepare */
+    for (i = 0; i < cs->bo_count; i++) {
+	ret = radeon_cs_setup_bo(&cs->bos[i], &sizes);
+	if (ret)
+	    return ret;
+    }
+
+    if (new_tmp) {
+	ret = radeon_cs_setup_bo(new_tmp, &sizes);
+	if (ret)
+	    return ret;
+    }
+	
+    if (sizes.op_read < 0)
+	    sizes.op_read = 0;
+
+    /* check sizes - operation first */
+    if ((sizes.op_read + sizes.op_gart_write > csm->gart_limit) ||
+	(sizes.op_vram_write > csm->vram_limit)) {
+	    return RADEON_CS_SPACE_OP_TO_BIG;
+    }
+    
+    if (((csm->vram_write_used + sizes.op_vram_write) > csm->vram_limit) ||
+	((csm->read_used + csm->gart_write_used + sizes.op_gart_write + sizes.op_read) > csm->gart_limit)) {
+	    return RADEON_CS_SPACE_FLUSH;
+    }
+    
+    csm->gart_write_used += sizes.op_gart_write;
+    csm->vram_write_used += sizes.op_vram_write;
+    csm->read_used += sizes.op_read;
+    /* commit */
+    for (i = 0; i < cs->bo_count; i++) {
+	    bo = cs->bos[i].bo;
+	    bo->space_accounted = cs->bos[i].new_accounted;
+    }
+    if (new_tmp)
+	new_tmp->bo->space_accounted = new_tmp->new_accounted;
+    
+    return RADEON_CS_SPACE_OK;
+}
+
+void radeon_cs_space_add_persistent_bo(struct radeon_cs *cs, struct radeon_bo *bo, uint32_t read_domains, uint32_t write_domain)
+{
+    int i;
+    for (i = 0; i < cs->bo_count; i++) {
+	if (cs->bos[i].bo == bo &&
+	    cs->bos[i].read_domains == read_domains &&
+	    cs->bos[i].write_domain == write_domain)
+	    return;
+    }
+    radeon_bo_ref(bo);
+    i = cs->bo_count;
+    cs->bos[i].bo = bo;
+    cs->bos[i].read_domains = read_domains;
+    cs->bos[i].write_domain = write_domain;
+    cs->bos[i].new_accounted = 0;
+    cs->bo_count++;
+
+    assert(cs->bo_count < MAX_SPACE_BOS);
+}
+
+static int radeon_cs_check_space_internal(struct radeon_cs *cs, struct radeon_cs_space_check *tmp_bo)
+{
+    int ret;
+    int flushed = 0;
+
+again:
+    ret = radeon_cs_do_space_check(cs, tmp_bo);
+    if (ret == RADEON_CS_SPACE_OP_TO_BIG)
+	return -1;
+    if (ret == RADEON_CS_SPACE_FLUSH) {
+	(*cs->space_flush_fn)(cs->space_flush_data);
+	if (flushed)
+	    return -1;
+	flushed = 1;
+	goto again;
+    }
+    return 0;
+}
+
+int radeon_cs_space_check_with_bo(struct radeon_cs *cs,
+				  struct radeon_bo *bo,
+				  uint32_t read_domains, uint32_t write_domain)
+{									
+    struct radeon_cs_space_check temp_bo;
+    int ret = 0;
+
+    if (bo) {
+	temp_bo.bo = bo;
+	temp_bo.read_domains = read_domains;
+	temp_bo.write_domain = write_domain;
+	temp_bo.new_accounted = 0;
+    }
+
+    ret = radeon_cs_check_space_internal(cs, bo ? &temp_bo : NULL);
+    return ret;
+}
+
+int radeon_cs_space_check(struct radeon_cs *cs)
+{
+    return radeon_cs_check_space_internal(cs, NULL);
+}
+
+void radeon_cs_space_reset_bos(struct radeon_cs *cs)
+{
+    int i;
+    for (i = 0; i < cs->bo_count; i++) {
+	radeon_bo_unref(cs->bos[i].bo);
+	cs->bos[i].bo = NULL;
+	cs->bos[i].read_domains = 0;
+	cs->bos[i].write_domain = 0;
+	cs->bos[i].new_accounted = 0;
+    }
+    cs->bo_count = 0;
+}
+
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.c b/src/mesa/drivers/dri/radeon/radeon_dma.c
new file mode 100644
index 00000000000..48114a00126
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_dma.c
@@ -0,0 +1,347 @@
+/**************************************************************************
+
+Copyright (C) 2004 Nicolai Haehnle.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+#include "radeon_common.h"
+
+#if defined(USE_X86_ASM)
+#define COPY_DWORDS( dst, src, nr )					\
+do {									\
+	int __tmp;							\
+	__asm__ __volatile__( "rep ; movsl"				\
+			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
+			      : "0" (nr),				\
+			        "D" ((long)dst),			\
+			        "S" ((long)src) );			\
+} while (0)
+#else
+#define COPY_DWORDS( dst, src, nr )		\
+do {						\
+   int j;					\
+   for ( j = 0 ; j < nr ; j++ )			\
+      dst[j] = ((int *)src)[j];			\
+   dst += nr;					\
+} while (0)
+#endif
+
+static void radeonEmitVec4(uint32_t *out, GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 4)
+		COPY_DWORDS(out, data, count);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out++;
+			data += stride;
+		}
+}
+
+void radeonEmitVec8(uint32_t *out, GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 8)
+		COPY_DWORDS(out, data, count * 2);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)(data + 4);
+			out += 2;
+			data += stride;
+		}
+}
+
+void radeonEmitVec12(uint32_t *out, GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 12) {
+		COPY_DWORDS(out, data, count * 3);
+    }
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)(data + 4);
+			out[2] = *(int *)(data + 8);
+			out += 3;
+			data += stride;
+		}
+}
+
+static void radeonEmitVec16(uint32_t *out, GLvoid * data, int stride, int count)
+{
+	int i;
+
+	if (RADEON_DEBUG & DEBUG_VERTS)
+		fprintf(stderr, "%s count %d stride %d out %p data %p\n",
+			__FUNCTION__, count, stride, (void *)out, (void *)data);
+
+	if (stride == 16)
+		COPY_DWORDS(out, data, count * 4);
+	else
+		for (i = 0; i < count; i++) {
+			out[0] = *(int *)data;
+			out[1] = *(int *)(data + 4);
+			out[2] = *(int *)(data + 8);
+			out[3] = *(int *)(data + 12);
+			out += 4;
+			data += stride;
+		}
+}
+
+void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
+			 GLvoid * data, int size, int stride, int count)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	uint32_t *out;
+
+	if (stride == 0) {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
+		count = 1;
+		aos->stride = 0;
+	} else {
+		radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * count * 4, 32);
+		aos->stride = size;
+	}
+
+	aos->components = size;
+	aos->count = count;
+
+	out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
+	switch (size) {
+	case 1: radeonEmitVec4(out, data, stride, count); break;
+	case 2: radeonEmitVec8(out, data, stride, count); break;
+	case 3: radeonEmitVec12(out, data, stride, count); break;
+	case 4: radeonEmitVec16(out, data, stride, count); break;
+	default:
+		assert(0);
+		break;
+	}
+}
+
+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size)
+{
+
+	size = MAX2(size, MAX_DMA_BUF_SZ);
+
+	if (RADEON_DEBUG & (DEBUG_IOCTL | DEBUG_DMA))
+		fprintf(stderr, "%s\n", __FUNCTION__);
+
+	if (rmesa->dma.flush) {
+		rmesa->dma.flush(rmesa->glCtx);
+	}
+
+	if (rmesa->dma.nr_released_bufs > 4) {
+		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
+		rmesa->dma.nr_released_bufs = 0;
+	}
+
+	if (rmesa->dma.current) {
+		radeon_bo_unmap(rmesa->dma.current);
+		radeon_bo_unref(rmesa->dma.current);
+		rmesa->dma.current = 0;
+	}
+
+again_alloc:
+#ifdef RADEON_DEBUG_BO
+    rmesa->dma.current = radeon_bo_open(rmesa->radeonScreen->bom,
+					    0, size, 4, RADEON_GEM_DOMAIN_GTT,
+					    0, "dma.current");
+#else    
+	rmesa->dma.current = radeon_bo_open(rmesa->radeonScreen->bom,
+					    0, size, 4, RADEON_GEM_DOMAIN_GTT,
+					    0);
+#endif /* RADEON_DEBUG_BO */
+
+	if (!rmesa->dma.current) {
+		rcommonFlushCmdBuf(rmesa, __FUNCTION__);
+		rmesa->dma.nr_released_bufs = 0;
+		goto again_alloc;
+	}
+
+	rmesa->dma.current_used = 0;
+	rmesa->dma.current_vertexptr = 0;
+	
+	if (radeon_cs_space_check_with_bo(rmesa->cmdbuf.cs,
+					  rmesa->dma.current,
+					  RADEON_GEM_DOMAIN_GTT, 0))
+		fprintf(stderr,"failure to revalidate BOs - badness\n");
+
+	if (!rmesa->dma.current) {
+        /* Cmd buff have been flushed in radeon_revalidate_bos */
+		rmesa->dma.nr_released_bufs = 0;
+		goto again_alloc;
+	}
+
+	radeon_bo_map(rmesa->dma.current, 1);
+}
+
+/* Allocates a region from rmesa->dma.current.  If there isn't enough
+ * space in current, grab a new buffer (and discard what was left of current)
+ */
+void radeonAllocDmaRegion(radeonContextPtr rmesa,
+			  struct radeon_bo **pbo, int *poffset,
+			  int bytes, int alignment)
+{
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
+
+	if (rmesa->dma.flush)
+		rmesa->dma.flush(rmesa->glCtx);
+
+	assert(rmesa->dma.current_used == rmesa->dma.current_vertexptr);
+
+	alignment--;
+	rmesa->dma.current_used = (rmesa->dma.current_used + alignment) & ~alignment;
+
+	if (!rmesa->dma.current || rmesa->dma.current_used + bytes > rmesa->dma.current->size)
+		radeonRefillCurrentDmaRegion(rmesa, (bytes + 15) & ~15);
+
+	*poffset = rmesa->dma.current_used;
+	*pbo = rmesa->dma.current;
+	radeon_bo_ref(*pbo);
+
+	/* Always align to at least 16 bytes */
+	rmesa->dma.current_used = (rmesa->dma.current_used + bytes + 15) & ~15;
+	rmesa->dma.current_vertexptr = rmesa->dma.current_used;
+
+	assert(rmesa->dma.current_used <= rmesa->dma.current->size);
+}
+
+void radeonReleaseDmaRegion(radeonContextPtr rmesa)
+{
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %p\n", __FUNCTION__, rmesa->dma.current);
+	if (rmesa->dma.current) {
+		rmesa->dma.nr_released_bufs++;
+		radeon_bo_unmap(rmesa->dma.current);
+	        radeon_bo_unref(rmesa->dma.current);
+	}
+	rmesa->dma.current = NULL;
+}
+
+
+/* Flush vertices in the current dma region.
+ */
+void rcommon_flush_last_swtcl_prim( GLcontext *ctx  )
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	struct radeon_dma *dma = &rmesa->dma;
+		
+
+	if (RADEON_DEBUG & DEBUG_IOCTL)
+		fprintf(stderr, "%s %p\n", __FUNCTION__, dma->current);
+	dma->flush = NULL;
+
+	if (dma->current) {
+	    GLuint current_offset = dma->current_used;
+
+	    assert (dma->current_used +
+		    rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+		    dma->current_vertexptr);
+
+	    if (dma->current_used != dma->current_vertexptr) {
+		    dma->current_used = dma->current_vertexptr;
+
+		    rmesa->vtbl.swtcl_flush(ctx, current_offset);
+	    }
+	    rmesa->swtcl.numverts = 0;
+	}
+}
+/* Alloc space in the current dma region.
+ */
+void *
+rcommonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
+{
+	GLuint bytes = vsize * nverts;
+	void *head;
+restart:
+	if (!rmesa->dma.current || rmesa->dma.current_vertexptr + bytes > rmesa->dma.current->size) {
+                radeonRefillCurrentDmaRegion(rmesa, bytes);
+	}
+
+        if (!rmesa->dma.flush) {
+		/* make sure we have enough space to use this in cmdbuf */
+   		rcommonEnsureCmdBufSpace(rmesa,
+			      rmesa->hw.max_state_size + (20*sizeof(int)),
+			      __FUNCTION__);
+		/* if cmdbuf flushed DMA restart */
+		if (!rmesa->dma.current)
+			goto restart;
+                rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+                rmesa->dma.flush = rcommon_flush_last_swtcl_prim;
+        }
+
+	ASSERT( vsize == rmesa->swtcl.vertex_size * 4 );
+        ASSERT( rmesa->dma.flush == rcommon_flush_last_swtcl_prim );
+        ASSERT( rmesa->dma.current_used +
+                rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
+                rmesa->dma.current_vertexptr );
+
+	head = (rmesa->dma.current->ptr + rmesa->dma.current_vertexptr);
+	rmesa->dma.current_vertexptr += bytes;
+	rmesa->swtcl.numverts += nverts;
+	return head;
+}
+
+void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
+{
+   radeonContextPtr radeon = RADEON_CONTEXT( ctx );
+   int i;
+
+   if (radeon->dma.flush) {
+       radeon->dma.flush(radeon->glCtx);
+   }
+   if (radeon->tcl.elt_dma_bo) {
+	   radeon_bo_unref(radeon->tcl.elt_dma_bo);
+	   radeon->tcl.elt_dma_bo = NULL;
+   }
+   for (i = 0; i < radeon->tcl.aos_count; i++) {
+      if (radeon->tcl.aos[i].bo) {
+         radeon_bo_unref(radeon->tcl.aos[i].bo);
+         radeon->tcl.aos[i].bo = NULL;
+      }
+   }
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_dma.h b/src/mesa/drivers/dri/radeon/radeon_dma.h
new file mode 100644
index 00000000000..06e388fc1de
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_dma.h
@@ -0,0 +1,52 @@
+/**************************************************************************
+
+Copyright (C) 2004 Nicolai Haehnle.
+Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+
+The Weather Channel (TM) funded Tungsten Graphics to develop the
+initial release of the Radeon 8500 driver under the XFree86 license.
+This notice must be preserved.
+
+All Rights Reserved.
+
+Permission is hereby granted, free of charge, to any person obtaining a
+copy of this software and associated documentation files (the "Software"),
+to deal in the Software without restriction, including without limitation
+on the rights to use, copy, modify, merge, publish, distribute, sub
+license, and/or sell copies of the Software, and to permit persons to whom
+the Software is furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice (including the next
+paragraph) shall be included in all copies or substantial portions of the
+Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ATI, VA LINUX SYSTEMS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+USE OR OTHER DEALINGS IN THE SOFTWARE.
+
+**************************************************************************/
+
+#ifndef RADEON_DMA_H
+#define RADEON_DMA_H
+
+void radeonEmitVec8(uint32_t *out, GLvoid * data, int stride, int count);
+void radeonEmitVec12(uint32_t *out, GLvoid * data, int stride, int count);
+
+void rcommon_emit_vector(GLcontext * ctx, struct radeon_aos *aos,
+			 GLvoid * data, int size, int stride, int count);
+
+void radeonRefillCurrentDmaRegion(radeonContextPtr rmesa, int size);
+void radeonAllocDmaRegion(radeonContextPtr rmesa,
+			  struct radeon_bo **pbo, int *poffset,
+			  int bytes, int alignment);
+void radeonReleaseDmaRegion(radeonContextPtr rmesa);
+
+void rcommon_flush_last_swtcl_prim(GLcontext *ctx);
+
+void *rcommonAllocDmaLowVerts(radeonContextPtr rmesa, int nverts, int vsize);
+void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs );
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_fbo.c b/src/mesa/drivers/dri/radeon/radeon_fbo.c
new file mode 100644
index 00000000000..f28efa33e9a
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_fbo.c
@@ -0,0 +1,615 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Red Hat Inc.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "main/imports.h"
+#include "main/macros.h"
+#include "main/mtypes.h"
+#include "main/fbobject.h"
+#include "main/framebuffer.h"
+#include "main/renderbuffer.h"
+#include "main/context.h"
+#include "main/texformat.h"
+#include "main/texrender.h"
+
+#include "radeon_common.h"
+#include "radeon_mipmap_tree.h"
+
+#define FILE_DEBUG_FLAG DEBUG_TEXTURE
+#define DBG(...) do {                                           \
+        if (RADEON_DEBUG & FILE_DEBUG_FLAG)                      \
+                _mesa_printf(__VA_ARGS__);                      \
+} while(0)
+
+static struct gl_framebuffer *
+radeon_new_framebuffer(GLcontext *ctx, GLuint name)
+{
+  return _mesa_new_framebuffer(ctx, name);
+}
+
+static void
+radeon_delete_renderbuffer(struct gl_renderbuffer *rb)
+{
+  struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
+
+  ASSERT(rrb);
+
+  if (rrb && rrb->bo) {
+    radeon_bo_unref(rrb->bo);
+  }
+  _mesa_free(rrb);
+}
+
+static void *
+radeon_get_pointer(GLcontext *ctx, struct gl_renderbuffer *rb,
+		   GLint x, GLint y)
+{
+  return NULL;
+}
+
+/**
+ * Called via glRenderbufferStorageEXT() to set the format and allocate
+ * storage for a user-created renderbuffer.
+ */
+static GLboolean
+radeon_alloc_renderbuffer_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
+                                 GLenum internalFormat,
+                                 GLuint width, GLuint height)
+{
+  struct radeon_context *radeon = RADEON_CONTEXT(ctx);
+  struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
+  GLboolean software_buffer = GL_FALSE;
+  int cpp;
+
+   ASSERT(rb->Name != 0);
+  switch (internalFormat) {
+   case GL_R3_G3_B2:
+   case GL_RGB4:
+   case GL_RGB5:
+      rb->_ActualFormat = GL_RGB5;
+      rb->DataType = GL_UNSIGNED_BYTE;
+      rb->RedBits = 5;
+      rb->GreenBits = 6;
+      rb->BlueBits = 5;
+      cpp = 2;
+      break;
+   case GL_RGB:
+   case GL_RGB8:
+   case GL_RGB10:
+   case GL_RGB12:
+   case GL_RGB16:
+      rb->_ActualFormat = GL_RGB8;
+      rb->DataType = GL_UNSIGNED_BYTE;
+      rb->RedBits = 8;
+      rb->GreenBits = 8;
+      rb->BlueBits = 8;
+      rb->AlphaBits = 0;
+      cpp = 4;
+      break;
+   case GL_RGBA:
+   case GL_RGBA2:
+   case GL_RGBA4:
+   case GL_RGB5_A1:
+   case GL_RGBA8:
+   case GL_RGB10_A2:
+   case GL_RGBA12:
+   case GL_RGBA16:
+      rb->_ActualFormat = GL_RGBA8;
+      rb->DataType = GL_UNSIGNED_BYTE;
+      rb->RedBits = 8;
+      rb->GreenBits = 8;
+      rb->BlueBits = 8;
+      rb->AlphaBits = 8;
+      cpp = 4;
+      break;
+   case GL_STENCIL_INDEX:
+   case GL_STENCIL_INDEX1_EXT:
+   case GL_STENCIL_INDEX4_EXT:
+   case GL_STENCIL_INDEX8_EXT:
+   case GL_STENCIL_INDEX16_EXT:
+      /* alloc a depth+stencil buffer */
+      rb->_ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+      rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
+      rb->StencilBits = 8;
+      cpp = 4;
+      break;
+   case GL_DEPTH_COMPONENT16:
+      rb->_ActualFormat = GL_DEPTH_COMPONENT16;
+      rb->DataType = GL_UNSIGNED_SHORT;
+      rb->DepthBits = 16;
+      cpp = 2;
+      break;
+   case GL_DEPTH_COMPONENT:
+   case GL_DEPTH_COMPONENT24:
+   case GL_DEPTH_COMPONENT32:
+      rb->_ActualFormat = GL_DEPTH_COMPONENT24;
+      rb->DataType = GL_UNSIGNED_INT;
+      rb->DepthBits = 24;
+      cpp = 4;
+      break;
+   case GL_DEPTH_STENCIL_EXT:
+   case GL_DEPTH24_STENCIL8_EXT:
+      rb->_ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+      rb->DataType = GL_UNSIGNED_INT_24_8_EXT;
+      rb->DepthBits = 24;
+      rb->StencilBits = 8;
+      cpp = 4;
+      break;
+   default:
+      _mesa_problem(ctx,
+                    "Unexpected format in intel_alloc_renderbuffer_storage");
+      return GL_FALSE;
+   }
+
+  if (ctx->Driver.Flush)
+	  ctx->Driver.Flush(ctx); /* +r6/r7 */
+
+  if (rrb->bo)
+    radeon_bo_unref(rrb->bo);
+  
+    
+   if (software_buffer) {
+      return _mesa_soft_renderbuffer_storage(ctx, rb, internalFormat,
+                                             width, height);
+   }
+   else {
+     uint32_t size = width * height * cpp;
+     uint32_t pitch = ((cpp * width + 63) & ~63) / cpp;
+
+     fprintf(stderr,"Allocating %d x %d radeon RBO (pitch %d)\n", width,
+	  height, pitch);
+
+     rrb->pitch = pitch * cpp;
+     rrb->cpp = cpp;
+#ifdef RADEON_DEBUG_BO
+     rrb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+			      0,
+			      size,
+			      0,
+			      RADEON_GEM_DOMAIN_VRAM,
+			      0,
+                  "Radeon RBO");
+#else
+     rrb->bo = radeon_bo_open(radeon->radeonScreen->bom,
+			      0,
+			      size,
+			      0,
+			      RADEON_GEM_DOMAIN_VRAM,
+			      0);
+#endif /* RADEON_DEBUG_BO */
+     rb->Width = width;
+     rb->Height = height;
+       return GL_TRUE;
+   }    
+   
+}
+
+
+/**
+ * Called for each hardware renderbuffer when a _window_ is resized.
+ * Just update fields.
+ * Not used for user-created renderbuffers!
+ */
+static GLboolean
+radeon_alloc_window_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
+                           GLenum internalFormat, GLuint width, GLuint height)
+{
+   ASSERT(rb->Name == 0);
+   rb->Width = width;
+   rb->Height = height;
+   rb->_ActualFormat = internalFormat;
+
+   return GL_TRUE;
+}
+
+
+static void
+radeon_resize_buffers(GLcontext *ctx, struct gl_framebuffer *fb,
+		     GLuint width, GLuint height)
+{
+     struct radeon_framebuffer *radeon_fb = (struct radeon_framebuffer*)fb;
+   int i;
+
+   _mesa_resize_framebuffer(ctx, fb, width, height);
+
+   fb->Initialized = GL_TRUE; /* XXX remove someday */
+
+   if (fb->Name != 0) {
+      return;
+   }
+
+   /* Make sure all window system renderbuffers are up to date */
+   for (i = 0; i < 2; i++) {
+      struct gl_renderbuffer *rb = &radeon_fb->color_rb[i]->base;
+
+      /* only resize if size is changing */
+      if (rb && (rb->Width != width || rb->Height != height)) {
+	 rb->AllocStorage(ctx, rb, rb->InternalFormat, width, height);
+      }
+   }
+}
+
+
+/** Dummy function for gl_renderbuffer::AllocStorage() */
+static GLboolean
+radeon_nop_alloc_storage(GLcontext * ctx, struct gl_renderbuffer *rb,
+			 GLenum internalFormat, GLuint width, GLuint height)
+{
+   _mesa_problem(ctx, "radeon_op_alloc_storage should never be called.");
+   return GL_FALSE;
+}
+
+struct radeon_renderbuffer *
+radeon_create_renderbuffer(GLenum format, __DRIdrawablePrivate *driDrawPriv)
+{
+    struct radeon_renderbuffer *rrb;
+
+    rrb = CALLOC_STRUCT(radeon_renderbuffer);
+    if (!rrb)
+	return NULL;
+
+    _mesa_init_renderbuffer(&rrb->base, 0);
+    rrb->base.ClassID = RADEON_RB_CLASS;
+
+    /* XXX format junk */
+    switch (format) {
+	case GL_RGB5:
+	    rrb->base._ActualFormat = GL_RGB5;
+	    rrb->base._BaseFormat = GL_RGBA;
+	    rrb->base.RedBits = 5;
+	    rrb->base.GreenBits = 6;
+	    rrb->base.BlueBits = 5;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_RGB8:
+	    rrb->base._ActualFormat = GL_RGB8;
+	    rrb->base._BaseFormat = GL_RGB;
+	    rrb->base.RedBits = 8;
+	    rrb->base.GreenBits = 8;
+	    rrb->base.BlueBits = 8;
+	    rrb->base.AlphaBits = 8;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_RGBA8:
+	    rrb->base._ActualFormat = GL_RGBA8;
+	    rrb->base._BaseFormat = GL_RGBA;
+	    rrb->base.RedBits = 8;
+	    rrb->base.GreenBits = 8;
+	    rrb->base.BlueBits = 8;
+	    rrb->base.AlphaBits = 8;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_STENCIL_INDEX8_EXT:
+	    rrb->base._ActualFormat = GL_STENCIL_INDEX8_EXT;
+	    rrb->base._BaseFormat = GL_STENCIL_INDEX;
+	    rrb->base.StencilBits = 8;
+	    rrb->base.DataType = GL_UNSIGNED_BYTE;
+	    break;
+	case GL_DEPTH_COMPONENT16:
+	    rrb->base._ActualFormat = GL_DEPTH_COMPONENT16;
+	    rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
+	    rrb->base.DepthBits = 16;
+	    rrb->base.DataType = GL_UNSIGNED_SHORT;
+	    break;
+	case GL_DEPTH_COMPONENT24:
+	    rrb->base._ActualFormat = GL_DEPTH_COMPONENT24;
+	    rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
+	    rrb->base.DepthBits = 24;
+	    rrb->base.DataType = GL_UNSIGNED_INT;
+	    break;
+	case GL_DEPTH24_STENCIL8_EXT:
+	    rrb->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+	    rrb->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
+	    rrb->base.DepthBits = 24;
+	    rrb->base.StencilBits = 8;
+	    rrb->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+	    break;
+	default:
+	    fprintf(stderr, "%s: Unknown format 0x%04x\n", __FUNCTION__, format);
+	    _mesa_delete_renderbuffer(&rrb->base);
+	    return NULL;
+    }
+
+    rrb->dPriv = driDrawPriv;
+    rrb->base.InternalFormat = format;
+
+    rrb->base.Delete = radeon_delete_renderbuffer;
+    rrb->base.AllocStorage = radeon_alloc_window_storage;
+    rrb->base.GetPointer = radeon_get_pointer;
+
+    rrb->bo = NULL;
+    return rrb;
+}
+
+static struct gl_renderbuffer *
+radeon_new_renderbuffer(GLcontext * ctx, GLuint name)
+{
+  struct radeon_renderbuffer *rrb;
+
+  rrb = CALLOC_STRUCT(radeon_renderbuffer);
+  if (!rrb)
+    return NULL;
+
+  _mesa_init_renderbuffer(&rrb->base, name);
+  rrb->base.ClassID = RADEON_RB_CLASS;
+
+  rrb->base.Delete = radeon_delete_renderbuffer;
+  rrb->base.AllocStorage = radeon_alloc_renderbuffer_storage;
+  rrb->base.GetPointer = radeon_get_pointer;
+
+  return &rrb->base;
+}
+
+static void
+radeon_bind_framebuffer(GLcontext * ctx, GLenum target,
+                       struct gl_framebuffer *fb, struct gl_framebuffer *fbread)
+{
+   if (target == GL_FRAMEBUFFER_EXT || target == GL_DRAW_FRAMEBUFFER_EXT) {
+      radeon_draw_buffer(ctx, fb);
+   }
+   else {
+      /* don't need to do anything if target == GL_READ_FRAMEBUFFER_EXT */
+   }
+}
+
+static void
+radeon_framebuffer_renderbuffer(GLcontext * ctx,
+                               struct gl_framebuffer *fb,
+                               GLenum attachment, struct gl_renderbuffer *rb)
+{
+
+	if (ctx->Driver.Flush)
+		ctx->Driver.Flush(ctx); /* +r6/r7 */
+
+   _mesa_framebuffer_renderbuffer(ctx, fb, attachment, rb);
+   radeon_draw_buffer(ctx, fb);
+}
+
+
+static GLboolean
+radeon_update_wrapper(GLcontext *ctx, struct radeon_renderbuffer *rrb, 
+		     struct gl_texture_image *texImage)
+{
+	int retry = 0;
+restart:
+	if (texImage->TexFormat == &_mesa_texformat_argb8888) {
+		rrb->cpp = 4;
+		rrb->base._ActualFormat = GL_RGBA8;
+		rrb->base._BaseFormat = GL_RGBA;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
+		DBG("Render to RGBA8 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_rgb565) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_RGB5;
+		rrb->base._BaseFormat = GL_RGB;
+		rrb->base.DataType = GL_UNSIGNED_SHORT;
+		DBG("Render to RGB5 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_argb1555) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_RGB5_A1;
+		rrb->base._BaseFormat = GL_RGBA;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
+		DBG("Render to ARGB1555 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_argb4444) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_RGBA4;
+		rrb->base._BaseFormat = GL_RGBA;
+		rrb->base.DataType = GL_UNSIGNED_BYTE;
+		DBG("Render to ARGB1555 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_z16) {
+		rrb->cpp = 2;
+		rrb->base._ActualFormat = GL_DEPTH_COMPONENT16;
+		rrb->base._BaseFormat = GL_DEPTH_COMPONENT;
+		rrb->base.DataType = GL_UNSIGNED_SHORT;
+		DBG("Render to DEPTH16 texture OK\n");
+	}
+	else if (texImage->TexFormat == &_mesa_texformat_s8_z24) {
+		rrb->cpp = 4;
+		rrb->base._ActualFormat = GL_DEPTH24_STENCIL8_EXT;
+		rrb->base._BaseFormat = GL_DEPTH_STENCIL_EXT;
+		rrb->base.DataType = GL_UNSIGNED_INT_24_8_EXT;
+		DBG("Render to DEPTH_STENCIL texture OK\n");
+	}
+	else {
+		/* try redoing the FBO */
+		if (retry == 1) {
+			DBG("Render to texture BAD FORMAT %d\n",
+			    texImage->TexFormat->MesaFormat);
+			return GL_FALSE;
+		}
+		texImage->TexFormat = radeonChooseTextureFormat(ctx, texImage->InternalFormat, 0,
+								texImage->TexFormat->DataType,
+								1);
+
+		retry++;
+		goto restart;
+	}
+	
+	rrb->pitch = texImage->Width * rrb->cpp;
+	rrb->base.InternalFormat = rrb->base._ActualFormat;
+	rrb->base.Width = texImage->Width;
+	rrb->base.Height = texImage->Height;
+	rrb->base.RedBits = texImage->TexFormat->RedBits;
+	rrb->base.GreenBits = texImage->TexFormat->GreenBits;
+	rrb->base.BlueBits = texImage->TexFormat->BlueBits;
+	rrb->base.AlphaBits = texImage->TexFormat->AlphaBits;
+	rrb->base.DepthBits = texImage->TexFormat->DepthBits;
+	rrb->base.StencilBits = texImage->TexFormat->StencilBits;
+	
+	rrb->base.Delete = radeon_delete_renderbuffer;
+	rrb->base.AllocStorage = radeon_nop_alloc_storage;
+	
+	return GL_TRUE;
+}
+
+
+static struct radeon_renderbuffer *
+radeon_wrap_texture(GLcontext * ctx, struct gl_texture_image *texImage)
+{
+  const GLuint name = ~0;   /* not significant, but distinct for debugging */
+  struct radeon_renderbuffer *rrb;
+
+   /* make an radeon_renderbuffer to wrap the texture image */
+   rrb = CALLOC_STRUCT(radeon_renderbuffer);
+   if (!rrb) {
+      _mesa_error(ctx, GL_OUT_OF_MEMORY, "glFramebufferTexture");
+      return NULL;
+   }
+
+   _mesa_init_renderbuffer(&rrb->base, name);
+   rrb->base.ClassID = RADEON_RB_CLASS;
+
+   if (!radeon_update_wrapper(ctx, rrb, texImage)) {
+      _mesa_free(rrb);
+      return NULL;
+   }
+
+   return rrb;
+  
+}
+static void
+radeon_render_texture(GLcontext * ctx,
+                     struct gl_framebuffer *fb,
+                     struct gl_renderbuffer_attachment *att)
+{
+   struct gl_texture_image *newImage
+      = att->Texture->Image[att->CubeMapFace][att->TextureLevel];
+   struct radeon_renderbuffer *rrb = radeon_renderbuffer(att->Renderbuffer);
+   radeon_texture_image *radeon_image;
+   GLuint imageOffset;
+
+   (void) fb;
+
+   ASSERT(newImage);
+
+   if (newImage->Border != 0) {
+      /* Fallback on drawing to a texture with a border, which won't have a
+       * miptree.
+       */
+      _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
+      _mesa_render_texture(ctx, fb, att);
+      return;
+   }
+   else if (!rrb) {
+      rrb = radeon_wrap_texture(ctx, newImage);
+      if (rrb) {
+         /* bind the wrapper to the attachment point */
+         _mesa_reference_renderbuffer(&att->Renderbuffer, &rrb->base);
+      }
+      else {
+         /* fallback to software rendering */
+         _mesa_render_texture(ctx, fb, att);
+         return;
+      }
+   }
+
+   if (!radeon_update_wrapper(ctx, rrb, newImage)) {
+       _mesa_reference_renderbuffer(&att->Renderbuffer, NULL);
+       _mesa_render_texture(ctx, fb, att);
+       return;
+   }
+
+   DBG("Begin render texture tid %x tex=%u w=%d h=%d refcount=%d\n",
+       _glthread_GetID(),
+       att->Texture->Name, newImage->Width, newImage->Height,
+       rrb->base.RefCount);
+
+   /* point the renderbufer's region to the texture image region */
+   radeon_image = (radeon_texture_image *)newImage;
+   if (rrb->bo != radeon_image->mt->bo) {
+      if (rrb->bo)
+  	radeon_bo_unref(rrb->bo);
+      rrb->bo = radeon_image->mt->bo;
+      radeon_bo_ref(rrb->bo);
+   }
+
+   /* compute offset of the particular 2D image within the texture region */
+   imageOffset = radeon_miptree_image_offset(radeon_image->mt,
+                                            att->CubeMapFace,
+                                            att->TextureLevel);
+
+   if (att->Texture->Target == GL_TEXTURE_3D) {
+      GLuint offsets[6];
+      radeon_miptree_depth_offsets(radeon_image->mt, att->TextureLevel,
+				   offsets);
+      imageOffset += offsets[att->Zoffset];
+   }
+
+   /* store that offset in the region */
+   rrb->draw_offset = imageOffset;
+
+   /* update drawing region, etc */
+   radeon_draw_buffer(ctx, fb);
+}
+
+static void
+radeon_finish_render_texture(GLcontext * ctx,
+                            struct gl_renderbuffer_attachment *att)
+{
+
+}
+static void
+radeon_validate_framebuffer(GLcontext *ctx, struct gl_framebuffer *fb)
+{
+}
+
+static void
+radeon_blit_framebuffer(GLcontext *ctx,
+                       GLint srcX0, GLint srcY0, GLint srcX1, GLint srcY1,
+                       GLint dstX0, GLint dstY0, GLint dstX1, GLint dstY1,
+                       GLbitfield mask, GLenum filter)
+{
+}
+
+void radeon_fbo_init(struct radeon_context *radeon)
+{
+  radeon->glCtx->Driver.NewFramebuffer = radeon_new_framebuffer;
+  radeon->glCtx->Driver.NewRenderbuffer = radeon_new_renderbuffer;
+  radeon->glCtx->Driver.BindFramebuffer = radeon_bind_framebuffer;
+  radeon->glCtx->Driver.FramebufferRenderbuffer = radeon_framebuffer_renderbuffer;
+  radeon->glCtx->Driver.RenderTexture = radeon_render_texture;
+  radeon->glCtx->Driver.FinishRenderTexture = radeon_finish_render_texture;
+  radeon->glCtx->Driver.ResizeBuffers = radeon_resize_buffers;
+  radeon->glCtx->Driver.ValidateFramebuffer = radeon_validate_framebuffer;
+  radeon->glCtx->Driver.BlitFramebuffer = radeon_blit_framebuffer;
+}
+
+  
+void radeon_renderbuffer_set_bo(struct radeon_renderbuffer *rb,
+				struct radeon_bo *bo)
+{
+  struct radeon_bo *old;
+  old = rb->bo;
+  rb->bo = bo;
+  radeon_bo_ref(bo);
+  if (old)
+    radeon_bo_unref(old);
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.c b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
index 09acf6b4f85..a5e4df79415 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.c
@@ -35,7 +35,21 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  */
 
 #include <sched.h>
-#include <errno.h> 
+#include <errno.h>
+
+#include "main/attrib.h"
+#include "main/enable.h"
+#include "main/blend.h"
+#include "main/bufferobj.h"
+#include "main/buffers.h"
+#include "main/depth.h"
+#include "main/shaders.h"
+#include "main/texstate.h"
+#include "main/varray.h"
+#include "glapi/dispatch.h"
+#include "swrast/swrast.h"
+#include "main/stencil.h"
+#include "main/matrix.h"
 
 #include "main/glheader.h"
 #include "main/imports.h"
@@ -43,6 +57,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast/swrast.h"
 
 #include "radeon_context.h"
+#include "radeon_common.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
 #include "radeon_tcl.h"
@@ -58,75 +73,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define RADEON_IDLE_RETRY           16
 
 
-static void radeonWaitForIdle( radeonContextPtr rmesa );
-static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
-				    const char * caller );
-
-static void print_state_atom( struct radeon_state_atom *state )
-{
-   int i;
-
-   fprintf(stderr, "emit %s/%d\n", state->name, state->cmd_size);
-
-   if (RADEON_DEBUG & DEBUG_VERBOSE) 
-      for (i = 0 ; i < state->cmd_size ; i++) 
-	 fprintf(stderr, "\t%s[%d]: %x\n", state->name, i, state->cmd[i]);
-
-}
-
-static void radeonSaveHwState( radeonContextPtr rmesa )
-{
-   struct radeon_state_atom *atom;
-   char * dest = rmesa->backup_store.cmd_buf;
-
-   if (RADEON_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-   
-   rmesa->backup_store.cmd_used = 0;
-
-   foreach( atom, &rmesa->hw.atomlist ) {
-      if ( atom->check( rmesa->glCtx ) ) {
-	 int size = atom->cmd_size * 4;
-	 memcpy( dest, atom->cmd, size);
-	 dest += size;
-	 rmesa->backup_store.cmd_used += size;
-	 if (RADEON_DEBUG & DEBUG_STATE)
-	    print_state_atom( atom );
-      }
-   }
-
-   assert( rmesa->backup_store.cmd_used <= RADEON_CMD_BUF_SZ );
-   if (RADEON_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Returning to radeonEmitState\n");
-}
-
-/* At this point we were in FlushCmdBufLocked but we had lost our context, so
- * we need to unwire our current cmdbuf, hook the one with the saved state in
- * it, flush it, and then put the current one back.  This is so commands at the
- * start of a cmdbuf can rely on the state being kept from the previous one.
- */
-static void radeonBackUpAndEmitLostStateLocked( radeonContextPtr rmesa )
-{
-   GLuint nr_released_bufs;
-   struct radeon_store saved_store;
-
-   if (rmesa->backup_store.cmd_used == 0)
-      return;
-
-   if (RADEON_DEBUG & DEBUG_STATE)
-      fprintf(stderr, "Emitting backup state on lost context\n");
-
-   rmesa->lost_context = GL_FALSE;
-
-   nr_released_bufs = rmesa->dma.nr_released_bufs;
-   saved_store = rmesa->store;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->store = rmesa->backup_store;
-   radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
-   rmesa->dma.nr_released_bufs = nr_released_bufs;
-   rmesa->store = saved_store;
-}
-
 /* =============================================================
  * Kernel command buffer handling
  */
@@ -134,965 +80,381 @@ static void radeonBackUpAndEmitLostStateLocked( radeonContextPtr rmesa )
 /* The state atoms will be emitted in the order they appear in the atom list,
  * so this step is important.
  */
-void radeonSetUpAtomList( radeonContextPtr rmesa )
+void radeonSetUpAtomList( r100ContextPtr rmesa )
 {
-   int i, mtu = rmesa->glCtx->Const.MaxTextureUnits;
-
-   make_empty_list(&rmesa->hw.atomlist);
-   rmesa->hw.atomlist.name = "atom-list";
-
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ctx);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.set);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lin);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msk);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.vpt);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tcl);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.msc);
+   int i, mtu = rmesa->radeon.glCtx->Const.MaxTextureUnits;
+
+   make_empty_list(&rmesa->radeon.hw.atomlist);
+   rmesa->radeon.hw.atomlist.name = "atom-list";
+
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.ctx);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.set);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.lin);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.msk);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.vpt);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.tcl);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.msc);
    for (i = 0; i < mtu; ++i) {
-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.tex[i]);
-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.txr[i]);
-       insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.cube[i]);
+       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.tex[i]);
+       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.txr[i]);
+       insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.cube[i]);
    }
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.zbs);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mtl);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.zbs);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.mtl);
    for (i = 0; i < 3 + mtu; ++i)
-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.mat[i]);
+      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.mat[i]);
    for (i = 0; i < 8; ++i)
-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.lit[i]);
+      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.lit[i]);
    for (i = 0; i < 6; ++i)
-      insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.ucp[i]);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.eye);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.grd);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.fog);
-   insert_at_tail(&rmesa->hw.atomlist, &rmesa->hw.glt);
+      insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.ucp[i]);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.eye);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.grd);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.fog);
+   insert_at_tail(&rmesa->radeon.hw.atomlist, &rmesa->hw.glt);
 }
 
-void radeonEmitState( radeonContextPtr rmesa )
+void radeonEmitScissor(r100ContextPtr rmesa)
 {
-   struct radeon_state_atom *atom;
-   char *dest;
-
-   if (RADEON_DEBUG & (DEBUG_STATE|DEBUG_PRIMS))
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   if (rmesa->save_on_next_emit) {
-      radeonSaveHwState(rmesa);
-      rmesa->save_on_next_emit = GL_FALSE;
-   }
-
-   /* this code used to return here but now it emits zbs */
-
-   /* To avoid going across the entire set of states multiple times, just check
-    * for enough space for the case of emitting all state, and inline the
-    * radeonAllocCmdBuf code here without all the checks.
-    */
-   radeonEnsureCmdBufSpace(rmesa, rmesa->hw.max_state_size);
-   dest = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-
-   /* We always always emit zbs, this is due to a bug found by keithw in
-      the hardware and rediscovered after Erics changes by me.
-      if you ever touch this code make sure you emit zbs otherwise
-      you get tcl lockups on at least M7/7500 class of chips - airlied */
-   rmesa->hw.zbs.dirty=1;
-
-   if (RADEON_DEBUG & DEBUG_STATE) {
-      foreach(atom, &rmesa->hw.atomlist) {
-	 if (atom->dirty || rmesa->hw.all_dirty) {
-	    if (atom->check(rmesa->glCtx))
-	       print_state_atom(atom);
-	    else
-	       fprintf(stderr, "skip state %s\n", atom->name);
-	 }
-      }
-   }
-
-   foreach(atom, &rmesa->hw.atomlist) {
-      if (rmesa->hw.all_dirty)
-	 atom->dirty = GL_TRUE;
-      if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) &&
-	   atom->is_tcl)
-	 atom->dirty = GL_FALSE;
-      if (atom->dirty) {
-	 if (atom->check(rmesa->glCtx)) {
-	    int size = atom->cmd_size * 4;
-	    memcpy(dest, atom->cmd, size);
-	    dest += size;
-	    rmesa->store.cmd_used += size;
-	    atom->dirty = GL_FALSE;
-	 }
-      }
-   }
-
-   assert(rmesa->store.cmd_used <= RADEON_CMD_BUF_SZ);
- 
-   rmesa->hw.is_dirty = GL_FALSE;
-   rmesa->hw.all_dirty = GL_FALSE;
+    BATCH_LOCALS(&rmesa->radeon);
+    if (!rmesa->radeon.radeonScreen->kernel_mm) {
+       return;
+    }
+    if (rmesa->radeon.state.scissor.enabled) {
+        BEGIN_BATCH(6);
+        OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 0));
+        OUT_BATCH(rmesa->hw.ctx.cmd[CTX_PP_CNTL] | RADEON_SCISSOR_ENABLE);
+        OUT_BATCH(CP_PACKET0(RADEON_RE_TOP_LEFT, 0));
+        OUT_BATCH((rmesa->radeon.state.scissor.rect.y1 << 16) |
+                  rmesa->radeon.state.scissor.rect.x1);
+        OUT_BATCH(CP_PACKET0(RADEON_RE_WIDTH_HEIGHT, 0));
+        OUT_BATCH(((rmesa->radeon.state.scissor.rect.y2 - 1) << 16) |
+                  (rmesa->radeon.state.scissor.rect.x2 - 1));
+        END_BATCH();
+    } else {
+        BEGIN_BATCH(2);
+        OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 0));
+        OUT_BATCH(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & ~RADEON_SCISSOR_ENABLE);
+        END_BATCH();
+    }
 }
 
 /* Fire a section of the retained (indexed_verts) buffer as a regular
- * primtive.  
+ * primtive.
  */
-extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
+extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
 				GLuint vertex_format,
 				GLuint primitive,
 				GLuint vertex_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
-
+   BATCH_LOCALS(&rmesa->radeon);
 
    assert(!(primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
-   
-   radeonEmitState( rmesa );
 
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s cmd_used/4: %d\n", __FUNCTION__,
-	      rmesa->store.cmd_used/4);
-   
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VBUF_BUFSZ,
-						       __FUNCTION__ );
+   radeonEmitState(&rmesa->radeon);
+   radeonEmitScissor(rmesa);
+
 #if RADEON_OLD_PACKETS
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM | (3 << 16);
-   cmd[2].i = rmesa->ioctl.vertex_offset;
-   cmd[3].i = vertex_nr;
-   cmd[4].i = vertex_format;
-   cmd[5].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
-	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+   BEGIN_BATCH(8);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM, 3);
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+     OUT_BATCH_RELOC(rmesa->ioctl.vertex_offset, rmesa->ioctl.bo, rmesa->ioctl.vertex_offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   } else {
+     OUT_BATCH(rmesa->ioctl.vertex_offset);
+   }
 
-   if (RADEON_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x offt 0x%x vfmt 0x%x vfcntl %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, cmd[2].i, cmd[4].i, cmd[5].i);
-#else
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_VBUF | (1 << 16);
-   cmd[2].i = vertex_format;
-   cmd[3].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
-	       (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+   OUT_BATCH(vertex_nr);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |  RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+	     (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			   rmesa->ioctl.bo,
+			   RADEON_GEM_DOMAIN_GTT,
+			   0, 0);
+   }
 
+   END_BATCH();
 
-   if (RADEON_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x vfmt 0x%x vfcntl %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, cmd[2].i, cmd[3].i);
+#else
+   BEGIN_BATCH(4);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_DRAW_VBUF, 1);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |
+	     RADEON_CP_VC_CNTL_PRIM_WALK_LIST |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE |
+	     (vertex_nr << RADEON_CP_VC_CNTL_NUM_SHIFT));
+   END_BATCH();
 #endif
 }
 
-
-void radeonFlushElts( radeonContextPtr rmesa )
+void radeonFlushElts( GLcontext *ctx )
 {
-   int *cmd = (int *)(rmesa->store.cmd_buf + rmesa->store.elts_start);
-   int dwords;
-#if RADEON_OLD_PACKETS
-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 24)) / 2;
-#else
-   int nr = (rmesa->store.cmd_used - (rmesa->store.elts_start + 16)) / 2;
-#endif
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&rmesa->radeon);
+   int nr;
+   uint32_t *cmd = (uint32_t *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_start);
+   int dwords = (rmesa->radeon.cmdbuf.cs->section_ndw - rmesa->radeon.cmdbuf.cs->section_cdw);
 
    if (RADEON_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-   assert( rmesa->dma.flush == radeonFlushElts );
-   rmesa->dma.flush = NULL;
+   assert( rmesa->radeon.dma.flush == radeonFlushElts );
+   rmesa->radeon.dma.flush = NULL;
 
-   /* Cope with odd number of elts:
-    */
-   rmesa->store.cmd_used = (rmesa->store.cmd_used + 2) & ~2;
-   dwords = (rmesa->store.cmd_used - rmesa->store.elts_start) / 4;
+   nr = rmesa->tcl.elt_used;
+
+#if RADEON_OLD_PACKETS
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     dwords -= 2;
+   }
+#endif
 
 #if RADEON_OLD_PACKETS
-   cmd[1] |= (dwords - 3) << 16;
+   cmd[1] |= (dwords + 3) << 16;
    cmd[5] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
 #else
-   cmd[1] |= (dwords - 3) << 16;
+   cmd[1] |= (dwords + 2) << 16;
    cmd[3] |= nr << RADEON_CP_VC_CNTL_NUM_SHIFT;
 #endif
 
+   rmesa->radeon.cmdbuf.cs->cdw += dwords;
+   rmesa->radeon.cmdbuf.cs->section_cdw += dwords;
+
+#if RADEON_OLD_PACKETS
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			    rmesa->ioctl.bo,
+			    RADEON_GEM_DOMAIN_GTT,
+			    0, 0);
+   }
+#endif
+
+   END_BATCH();
+
    if (RADEON_DEBUG & DEBUG_SYNC) {
       fprintf(stderr, "%s: Syncing\n", __FUNCTION__);
-      radeonFinish( rmesa->glCtx );
+      radeonFinish( rmesa->radeon.glCtx );
    }
-}
 
+}
 
-GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
+GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
 				    GLuint vertex_format,
 				    GLuint primitive,
 				    GLuint min_nr )
 {
-   drm_radeon_cmd_header_t *cmd;
    GLushort *retval;
+   int align_min_nr;
+   BATCH_LOCALS(&rmesa->radeon);
 
    if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s %d\n", __FUNCTION__, min_nr);
+      fprintf(stderr, "%s %d prim %x\n", __FUNCTION__, min_nr, primitive);
 
    assert((primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
-   
-   radeonEmitState( rmesa );
-   
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa,
-						       ELTS_BUFSZ(min_nr),
-						       __FUNCTION__ );
+
+   radeonEmitState(&rmesa->radeon);
+   radeonEmitScissor(rmesa);
+
+   rmesa->tcl.elt_cmd_start = rmesa->radeon.cmdbuf.cs->cdw;
+
+   /* round up min_nr to align the state */
+   align_min_nr = (min_nr + 1) & ~1;
+
 #if RADEON_OLD_PACKETS
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM;
-   cmd[2].i = rmesa->ioctl.vertex_offset;
-   cmd[3].i = 0xffff;
-   cmd[4].i = vertex_format;
-   cmd[5].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
-
-   retval = (GLushort *)(cmd+6);
-#else   
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3_CLIP;
-   cmd[1].i = RADEON_CP_PACKET3_3D_DRAW_INDX;
-   cmd[2].i = vertex_format;
-   cmd[3].i = (primitive | 
-	       RADEON_CP_VC_CNTL_PRIM_WALK_IND |
-	       RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
-	       RADEON_CP_VC_CNTL_MAOS_ENABLE |
-	       RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
-
-   retval = (GLushort *)(cmd+4);
+   BEGIN_BATCH_NO_AUTOSTATE(2+ELTS_BUFSZ(align_min_nr)/4);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_3D_RNDR_GEN_INDX_PRIM, 0);
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+     OUT_BATCH_RELOC(rmesa->ioctl.vertex_offset, rmesa->ioctl.bo, rmesa->ioctl.vertex_offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   } else {
+     OUT_BATCH(rmesa->ioctl.vertex_offset);
+   }
+   OUT_BATCH(0xffff);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |
+	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
+#else
+   BEGIN_BATCH_NO_AUTOSTATE(ELTS_BUFSZ(align_min_nr)/4);
+   OUT_BATCH_PACKET3_CLIP(RADEON_CP_PACKET3_DRAW_INDX, 0);
+   OUT_BATCH(vertex_format);
+   OUT_BATCH(primitive |
+	     RADEON_CP_VC_CNTL_PRIM_WALK_IND |
+	     RADEON_CP_VC_CNTL_COLOR_ORDER_RGBA |
+	     RADEON_CP_VC_CNTL_MAOS_ENABLE |
+	     RADEON_CP_VC_CNTL_VTX_FMT_RADEON_MODE);
 #endif
 
-   if (RADEON_DEBUG & DEBUG_PRIMS)
-      fprintf(stderr, "%s: header 0x%x vfmt 0x%x prim %x \n",
-	      __FUNCTION__,
-	      cmd[1].i, vertex_format, primitive);
 
-   assert(!rmesa->dma.flush);
-   rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-   rmesa->dma.flush = radeonFlushElts;
+   rmesa->tcl.elt_cmd_offset = rmesa->radeon.cmdbuf.cs->cdw;
+   rmesa->tcl.elt_used = min_nr;
 
-   rmesa->store.elts_start = ((char *)cmd) - rmesa->store.cmd_buf;
+   retval = (GLushort *)(rmesa->radeon.cmdbuf.cs->packets + rmesa->tcl.elt_cmd_offset);
 
-   return retval;
-}
+   if (RADEON_DEBUG & DEBUG_PRIMS)
+      fprintf(stderr, "%s: header prim %x \n",
+	      __FUNCTION__, primitive);
 
+   assert(!rmesa->radeon.dma.flush);
+   rmesa->radeon.glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
+   rmesa->radeon.dma.flush = radeonFlushElts;
 
+   return retval;
+}
 
-void radeonEmitVertexAOS( radeonContextPtr rmesa,
+void radeonEmitVertexAOS( r100ContextPtr rmesa,
 			  GLuint vertex_size,
+			  struct radeon_bo *bo,
 			  GLuint offset )
 {
 #if RADEON_OLD_PACKETS
-   rmesa->ioctl.vertex_size = vertex_size;
    rmesa->ioctl.vertex_offset = offset;
+   rmesa->ioctl.bo = bo;
 #else
-   drm_radeon_cmd_header_t *cmd;
+   BATCH_LOCALS(&rmesa->radeon);
 
    if (RADEON_DEBUG & (DEBUG_PRIMS|DEBUG_IOCTL))
       fprintf(stderr, "%s:  vertex_size 0x%x offset 0x%x \n",
 	      __FUNCTION__, vertex_size, offset);
 
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, VERT_AOS_BUFSZ,
-						  __FUNCTION__ );
+   BEGIN_BATCH(7);
+   OUT_BATCH_PACKET3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, 2);
+   OUT_BATCH(1);
+   OUT_BATCH(vertex_size | (vertex_size << 8));
+   OUT_BATCH_RELOC(offset, bo, offset, RADEON_GEM_DOMAIN_GTT, 0, 0);
+   END_BATCH();
 
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (2 << 16);
-   cmd[2].i = 1;
-   cmd[3].i = vertex_size | (vertex_size << 8);
-   cmd[4].i = offset;
 #endif
 }
-		       
 
-void radeonEmitAOS( radeonContextPtr rmesa,
-		    struct radeon_dma_region **component,
+
+void radeonEmitAOS( r100ContextPtr rmesa,
 		    GLuint nr,
 		    GLuint offset )
 {
 #if RADEON_OLD_PACKETS
    assert( nr == 1 );
-   assert( component[0]->aos_size == component[0]->aos_stride );
-   rmesa->ioctl.vertex_size = component[0]->aos_size;
-   rmesa->ioctl.vertex_offset = 
-      (component[0]->aos_start + offset * component[0]->aos_stride * 4);
+   rmesa->ioctl.bo = rmesa->radeon.tcl.aos[0].bo;
+   rmesa->ioctl.vertex_offset =
+     (rmesa->radeon.tcl.aos[0].offset + offset * rmesa->radeon.tcl.aos[0].stride * 4);
 #else
-   drm_radeon_cmd_header_t *cmd;
-   int sz = AOS_BUFSZ(nr);
+   BATCH_LOCALS(&rmesa->radeon);
+   uint32_t voffset;
+   //   int sz = AOS_BUFSZ(nr);
+   int sz = 1 + (nr >> 1) * 3 + (nr & 1) * 2;
    int i;
-   int *tmp;
 
    if (RADEON_DEBUG & DEBUG_IOCTL)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sz,
-						  __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = RADEON_CP_PACKET3_3D_LOAD_VBPNTR | (((sz / sizeof(int))-3) << 16);
-   cmd[2].i = nr;
-   tmp = &cmd[0].i;
-   cmd += 3;
-
-   for (i = 0 ; i < nr ; i++) {
-      if (i & 1) {
-	 cmd[0].i |= ((component[i]->aos_stride << 24) | 
-		      (component[i]->aos_size << 16));
-	 cmd[2].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
-	 cmd += 3;
+   BEGIN_BATCH(sz+2+(nr * 2));
+   OUT_BATCH_PACKET3(RADEON_CP_PACKET3_3D_LOAD_VBPNTR, sz - 1);
+   OUT_BATCH(nr);
+
+   if (!rmesa->radeon.radeonScreen->kernel_mm) {
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[i+1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
       }
-      else {
-	 cmd[0].i = ((component[i]->aos_stride << 8) | 
-		     (component[i]->aos_size << 0));
-	 cmd[1].i = (component[i]->aos_start + 
-		     offset * component[i]->aos_stride * 4);
-      }
-   }
 
-   if (RADEON_DEBUG & DEBUG_VERTS) {
-      fprintf(stderr, "%s:\n", __FUNCTION__);
-      for (i = 0 ; i < sz ; i++)
-	 fprintf(stderr, "   %d: %x\n", i, tmp[i]);
-   }
-#endif
-}
-
-/* using already shifted color_fmt! */
-void radeonEmitBlit( radeonContextPtr rmesa, /* FIXME: which drmMinor is required? */
-		   GLuint color_fmt,
-		   GLuint src_pitch,
-		   GLuint src_offset,
-		   GLuint dst_pitch,
-		   GLuint dst_offset,
-		   GLint srcx, GLint srcy,
-		   GLint dstx, GLint dsty,
-		   GLuint w, GLuint h )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s src %x/%x %d,%d dst: %x/%x %d,%d sz: %dx%d\n",
-	      __FUNCTION__, 
-	      src_pitch, src_offset, srcx, srcy,
-	      dst_pitch, dst_offset, dstx, dsty,
-	      w, h);
-
-   assert( (src_pitch & 63) == 0 );
-   assert( (dst_pitch & 63) == 0 );
-   assert( (src_offset & 1023) == 0 ); 
-   assert( (dst_offset & 1023) == 0 ); 
-   assert( w < (1<<16) );
-   assert( h < (1<<16) );
-
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 8 * sizeof(int),
-						  __FUNCTION__ );
-
-
-   cmd[0].i = 0;
-   cmd[0].header.cmd_type = RADEON_CMD_PACKET3;
-   cmd[1].i = RADEON_CP_PACKET3_CNTL_BITBLT_MULTI | (5 << 16);
-   cmd[2].i = (RADEON_GMC_SRC_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_DST_PITCH_OFFSET_CNTL |
-	       RADEON_GMC_BRUSH_NONE |
-	       color_fmt |
-	       RADEON_GMC_SRC_DATATYPE_COLOR |
-	       RADEON_ROP3_S |
-	       RADEON_DP_SRC_SOURCE_MEMORY |
-	       RADEON_GMC_CLR_CMP_CNTL_DIS |
-	       RADEON_GMC_WR_MSK_DIS );
-
-   cmd[3].i = ((src_pitch/64)<<22) | (src_offset >> 10);
-   cmd[4].i = ((dst_pitch/64)<<22) | (dst_offset >> 10);
-   cmd[5].i = (srcx << 16) | srcy;
-   cmd[6].i = (dstx << 16) | dsty; /* dst */
-   cmd[7].i = (w << 16) | h;
-}
-
-
-void radeonEmitWait( radeonContextPtr rmesa, GLuint flags )
-{
-   drm_radeon_cmd_header_t *cmd;
-
-   assert( !(flags & ~(RADEON_WAIT_2D|RADEON_WAIT_3D)) );
-
-   cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, 1 * sizeof(int),
-					   __FUNCTION__ );
-   cmd[0].i = 0;
-   cmd[0].wait.cmd_type = RADEON_CMD_WAIT;
-   cmd[0].wait.flags = flags;
-}
-
-
-static int radeonFlushCmdBufLocked( radeonContextPtr rmesa, 
-				    const char * caller )
-{
-   int ret, i;
-   drm_radeon_cmd_buffer_t cmd;
-
-   if (rmesa->lost_context)
-      radeonBackUpAndEmitLostStateLocked(rmesa);
-
-   if (RADEON_DEBUG & DEBUG_IOCTL) {
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-
-      if (RADEON_DEBUG & DEBUG_VERBOSE) 
-	 for (i = 0 ; i < rmesa->store.cmd_used ; i += 4 )
-	    fprintf(stderr, "%d: %x\n", i/4, 
-		    *(int *)(&rmesa->store.cmd_buf[i]));
-   }
-
-   if (RADEON_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "%s: Releasing %d buffers\n", __FUNCTION__,
-	      rmesa->dma.nr_released_bufs);
-
-
-   if (RADEON_DEBUG & DEBUG_SANITY) {
-      if (rmesa->state.scissor.enabled) 
-	 ret = radeonSanityCmdBuffer( rmesa, 
-				      rmesa->state.scissor.numClipRects,
-				      rmesa->state.scissor.pClipRects);
-      else
-	 ret = radeonSanityCmdBuffer( rmesa, 
-				      rmesa->numClipRects,
-				      rmesa->pClipRects);
-      if (ret) {
-	 fprintf(stderr, "drmSanityCommandWrite: %d\n", ret);	 
-	 goto out;
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH_RELOC(voffset,
+			 rmesa->radeon.tcl.aos[nr - 1].bo,
+			 voffset,
+			 RADEON_GEM_DOMAIN_GTT,
+			 0, 0);
       }
-   }
-
-
-   cmd.bufsz = rmesa->store.cmd_used;
-   cmd.buf = rmesa->store.cmd_buf;
-
-   if (rmesa->state.scissor.enabled) {
-      cmd.nbox = rmesa->state.scissor.numClipRects;
-      cmd.boxes = rmesa->state.scissor.pClipRects;
    } else {
-      cmd.nbox = rmesa->numClipRects;
-      cmd.boxes = rmesa->pClipRects;
-   }
-
-   ret = drmCommandWrite( rmesa->dri.fd,
-			  DRM_RADEON_CMDBUF,
-			  &cmd, sizeof(cmd) );
-
-   if (ret)
-      fprintf(stderr, "drmCommandWrite: %d\n", ret);
-
-   if (RADEON_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "\nSyncing in %s\n\n", __FUNCTION__);
-      radeonWaitForIdleLocked( rmesa );
-   }
-
- out:
-   rmesa->store.primnr = 0;
-   rmesa->store.statenr = 0;
-   rmesa->store.cmd_used = 0;
-   rmesa->dma.nr_released_bufs = 0;
-   rmesa->save_on_next_emit = 1;
-
-   return ret;
-}
-
-
-/* Note: does not emit any commands to avoid recursion on
- * radeonAllocCmdBuf.
- */
-void radeonFlushCmdBuf( radeonContextPtr rmesa, const char *caller )
-{
-   int ret;
-
-	      
-   LOCK_HARDWARE( rmesa );
-
-   ret = radeonFlushCmdBufLocked( rmesa, caller );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if (ret) {
-      fprintf(stderr, "drm_radeon_cmd_buffer_t: %d (exiting)\n", ret);
-      exit(ret);
-   }
-}
-
-/* =============================================================
- * Hardware vertex buffer handling
- */
-
-
-void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa )
-{
-   struct radeon_dma_buffer *dmabuf;
-   int fd = rmesa->dri.fd;
-   int index = 0;
-   int size = 0;
-   drmDMAReq dma;
-   int ret;
-
-   if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-      fprintf(stderr, "%s\n", __FUNCTION__);  
-
-   if (rmesa->dma.flush) {
-      rmesa->dma.flush( rmesa );
-   }
-
-   if (rmesa->dma.current.buf)
-      radeonReleaseDmaRegion( rmesa, &rmesa->dma.current, __FUNCTION__ );
-
-   if (rmesa->dma.nr_released_bufs > 4)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-
-   dma.context = rmesa->dri.hwContext;
-   dma.send_count = 0;
-   dma.send_list = NULL;
-   dma.send_sizes = NULL;
-   dma.flags = 0;
-   dma.request_count = 1;
-   dma.request_size = RADEON_BUFFER_SIZE;
-   dma.request_list = &index;
-   dma.request_sizes = &size;
-   dma.granted_count = 0;
-
-   LOCK_HARDWARE(rmesa);	/* no need to validate */
-
-   ret = drmDMA( fd, &dma );
-      
-   if (ret != 0) {
-      /* Free some up this way?
-       */
-      if (rmesa->dma.nr_released_bufs) {
-	 radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
+      for (i = 0; i + 1 < nr; i += 2) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[i].components << 0) |
+		   (rmesa->radeon.tcl.aos[i].stride << 8) |
+		   (rmesa->radeon.tcl.aos[i + 1].components << 16) |
+		   (rmesa->radeon.tcl.aos[i + 1].stride << 24));
+
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 OUT_BATCH(voffset);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 OUT_BATCH(voffset);
       }
-      
-      if (RADEON_DEBUG & DEBUG_DMA)
-	 fprintf(stderr, "Waiting for buffers\n");
-
-      radeonWaitForIdleLocked( rmesa );
-      ret = drmDMA( fd, &dma );
 
-      if ( ret != 0 ) {
-	 UNLOCK_HARDWARE( rmesa );
-	 fprintf( stderr, "Error: Could not get dma buffer... exiting\n" );
-	 exit( -1 );
+      if (nr & 1) {
+	 OUT_BATCH((rmesa->radeon.tcl.aos[nr - 1].components << 0) |
+		   (rmesa->radeon.tcl.aos[nr - 1].stride << 8));
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 OUT_BATCH(voffset);
       }
-   }
-
-   UNLOCK_HARDWARE(rmesa);
-
-   if (RADEON_DEBUG & DEBUG_DMA)
-      fprintf(stderr, "Allocated buffer %d\n", index);
-
-   dmabuf = CALLOC_STRUCT( radeon_dma_buffer );
-   dmabuf->buf = &rmesa->radeonScreen->buffers->list[index];
-   dmabuf->refcount = 1;
-
-   rmesa->dma.current.buf = dmabuf;
-   rmesa->dma.current.address = dmabuf->buf->address;
-   rmesa->dma.current.end = dmabuf->buf->total;
-   rmesa->dma.current.start = 0;
-   rmesa->dma.current.ptr = 0;
-
-   rmesa->c_vertexBuffers++;
-}
-
-void radeonReleaseDmaRegion( radeonContextPtr rmesa,
-			     struct radeon_dma_region *region,
-			     const char *caller )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s from %s\n", __FUNCTION__, caller); 
-   
-   if (!region->buf)
-      return;
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (--region->buf->refcount == 0) {
-      drm_radeon_cmd_header_t *cmd;
-
-      if (RADEON_DEBUG & (DEBUG_IOCTL|DEBUG_DMA))
-	 fprintf(stderr, "%s -- DISCARD BUF %d\n", __FUNCTION__,
-		 region->buf->buf->idx);  
-      
-      cmd = (drm_radeon_cmd_header_t *)radeonAllocCmdBuf( rmesa, sizeof(*cmd), 
-						     __FUNCTION__ );
-      cmd->dma.cmd_type = RADEON_CMD_DMA_DISCARD;
-      cmd->dma.buf_idx = region->buf->buf->idx;
-      FREE(region->buf);
-      rmesa->dma.nr_released_bufs++;
-   }
-
-   region->buf = NULL;
-   region->start = 0;
-}
-
-/* Allocates a region from rmesa->dma.current.  If there isn't enough
- * space in current, grab a new buffer (and discard what was left of current)
- */
-void radeonAllocDmaRegion( radeonContextPtr rmesa, 
-			   struct radeon_dma_region *region,
-			   int bytes,
-			   int alignment )
-{
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s %d\n", __FUNCTION__, bytes);
-
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
-
-   if (region->buf)
-      radeonReleaseDmaRegion( rmesa, region, __FUNCTION__ );
-
-   alignment--;
-   rmesa->dma.current.start = rmesa->dma.current.ptr = 
-      (rmesa->dma.current.ptr + alignment) & ~alignment;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      radeonRefillCurrentDmaRegion( rmesa );
-
-   region->start = rmesa->dma.current.start;
-   region->ptr = rmesa->dma.current.start;
-   region->end = rmesa->dma.current.start + bytes;
-   region->address = rmesa->dma.current.address;
-   region->buf = rmesa->dma.current.buf;
-   region->buf->refcount++;
-
-   rmesa->dma.current.ptr += bytes; /* bug - if alignment > 7 */
-   rmesa->dma.current.start = 
-      rmesa->dma.current.ptr = (rmesa->dma.current.ptr + 0x7) & ~0x7;  
-}
-
-/* ================================================================
- * SwapBuffers with client-side throttling
- */
-
-static uint32_t radeonGetLastFrame (radeonContextPtr rmesa) 
-{
-   drm_radeon_getparam_t gp;
-   int ret;
-   uint32_t frame;
-
-   gp.param = RADEON_PARAM_LAST_FRAME;
-   gp.value = (int *)&frame;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_GETPARAM,
-			      &gp, sizeof(gp) );
-
-   if ( ret ) {
-      fprintf( stderr, "%s: drm_radeon_getparam_t: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-
-   return frame;
-}
-
-static void radeonEmitIrqLocked( radeonContextPtr rmesa )
-{
-   drm_radeon_irq_emit_t ie;
-   int ret;
-
-   ie.irq_seq = &rmesa->iw.irq_seq;
-   ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_IRQ_EMIT, 
-			      &ie, sizeof(ie) );
-   if ( ret ) {
-      fprintf( stderr, "%s: drm_radeon_irq_emit_t: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void radeonWaitIrq( radeonContextPtr rmesa )
-{
-   int ret;
-
-   do {
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_IRQ_WAIT,
-			     &rmesa->iw, sizeof(rmesa->iw) );
-   } while (ret && (errno == EINTR || errno == EBUSY));
-
-   if ( ret ) {
-      fprintf( stderr, "%s: drmRadeonIrqWait: %d\n", __FUNCTION__, ret );
-      exit(1);
-   }
-}
-
-
-static void radeonWaitForFrameCompletion( radeonContextPtr rmesa )
-{
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
-
-   if (rmesa->do_irqs) {
-      if (radeonGetLastFrame(rmesa) < sarea->last_frame) {
-	 if (!rmesa->irqsEmitted) {
-	    while (radeonGetLastFrame (rmesa) < sarea->last_frame)
-	       ;
-	 }
-	 else {
-	    UNLOCK_HARDWARE( rmesa ); 
-	    radeonWaitIrq( rmesa );	
-	    LOCK_HARDWARE( rmesa ); 
-	 }
-	 rmesa->irqsEmitted = 10;
-      }
-
-      if (rmesa->irqsEmitted) {
-	 radeonEmitIrqLocked( rmesa );
-	 rmesa->irqsEmitted--;
+      for (i = 0; i + 1 < nr; i += 2) {
+	 voffset =  rmesa->radeon.tcl.aos[i + 0].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 0].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+0].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
+	 voffset =  rmesa->radeon.tcl.aos[i + 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[i + 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[i+1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
       }
-   } 
-   else {
-      while (radeonGetLastFrame (rmesa) < sarea->last_frame) {
-	 UNLOCK_HARDWARE( rmesa ); 
-	 if (rmesa->do_usleeps) 
-	    DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa ); 
+      if (nr & 1) {
+	 voffset =  rmesa->radeon.tcl.aos[nr - 1].offset +
+	    offset * 4 * rmesa->radeon.tcl.aos[nr - 1].stride;
+	 radeon_cs_write_reloc(rmesa->radeon.cmdbuf.cs,
+			       rmesa->radeon.tcl.aos[nr-1].bo,
+			       RADEON_GEM_DOMAIN_GTT,
+			       0, 0);
       }
    }
-}
+   END_BATCH();
 
-/* Copy the back color buffer to the front color buffer.
- */
-void radeonCopyBuffer( __DRIdrawablePrivate *dPriv,
-		       const drm_clip_rect_t	  *rect)
-{
-   radeonContextPtr rmesa;
-   GLint nbox, i, ret;
-   GLboolean   missed_target;
-   int64_t ust;
-   __DRIscreenPrivate *psp;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-
-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "\n%s( %p )\n\n", __FUNCTION__, (void *) rmesa->glCtx );
-   }
-
-   RADEON_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   /* Throttle the frame rate -- only allow one pending swap buffers
-    * request at a time.
-    */
-   radeonWaitForFrameCompletion( rmesa );
-   if (!rect)
-   {
-       UNLOCK_HARDWARE( rmesa );
-       driWaitForVBlank( dPriv, & missed_target );
-       LOCK_HARDWARE( rmesa );
-   }
-
-   nbox = dPriv->numClipRects; /* must be in locked region */
-
-   for ( i = 0 ; i < nbox ; ) {
-      GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS , nbox );
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      GLint n = 0;
-
-      for ( ; i < nr ; i++ ) {
-
-	  *b = box[i];
-
-	  if (rect)
-	  {
-	      if (rect->x1 > b->x1)
-		  b->x1 = rect->x1;
-	      if (rect->y1 > b->y1)
-		  b->y1 = rect->y1;
-	      if (rect->x2 < b->x2)
-		  b->x2 = rect->x2;
-	      if (rect->y2 < b->y2)
-		  b->y2 = rect->y2;
-
-	      if (b->x1 >= b->x2 || b->y1 >= b->y2)
-		  continue;
-	  }
-
-	  b++;
-	  n++;
-      }
-      rmesa->sarea->nbox = n;
-
-      if (!n)
-	 continue;
-
-      ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_SWAP );
-
-      if ( ret ) {
-	 fprintf( stderr, "DRM_RADEON_SWAP_BUFFERS: return = %d\n", ret );
-	 UNLOCK_HARDWARE( rmesa );
-	 exit( 1 );
-      }
-   }
-
-   UNLOCK_HARDWARE( rmesa );
-   if (!rect)
-   {
-       psp = dPriv->driScreenPriv;
-       rmesa->swap_count++;
-       (*psp->systemTime->getUST)( & ust );
-       if ( missed_target ) {
-	   rmesa->swap_missed_count++;
-	   rmesa->swap_missed_ust = ust - rmesa->swap_ust;
-       }
-
-       rmesa->swap_ust = ust;
-       rmesa->hw.all_dirty = GL_TRUE;
-   }
-}
-
-void radeonPageFlip( __DRIdrawablePrivate *dPriv )
-{
-   radeonContextPtr rmesa;
-   GLint ret;
-   GLboolean   missed_target;
-   __DRIscreenPrivate *psp;
-
-   assert(dPriv);
-   assert(dPriv->driContextPriv);
-   assert(dPriv->driContextPriv->driverPrivate);
-
-   rmesa = (radeonContextPtr) dPriv->driContextPriv->driverPrivate;
-   psp = dPriv->driScreenPriv;
-
-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
-      fprintf(stderr, "%s: pfCurrentPage: %d\n", __FUNCTION__,
-	      rmesa->sarea->pfCurrentPage);
-   }
-
-   RADEON_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
-
-   /* Need to do this for the perf box placement:
-    */
-   if (dPriv->numClipRects)
-   {
-      drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
-      b[0] = box[0];
-      rmesa->sarea->nbox = 1;
-   }
-
-   /* Throttle the frame rate -- only allow a few pending swap buffers
-    * request at a time.
-    */
-   radeonWaitForFrameCompletion( rmesa );
-   UNLOCK_HARDWARE( rmesa );
-   driWaitForVBlank( dPriv, & missed_target );
-   if ( missed_target ) {
-      rmesa->swap_missed_count++;
-      (void) (*psp->systemTime->getUST)( & rmesa->swap_missed_ust );
-   }
-   LOCK_HARDWARE( rmesa );
-
-   ret = drmCommandNone( rmesa->dri.fd, DRM_RADEON_FLIP );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_FLIP: return = %d\n", ret );
-      exit( 1 );
-   }
-
-   rmesa->swap_count++;
-   (void) (*psp->systemTime->getUST)( & rmesa->swap_ust );
-
-   /* Get ready for drawing next frame.  Update the renderbuffers'
-    * flippedOffset/Pitch fields so we draw into the right place.
-    */
-   driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-                        rmesa->sarea->pfCurrentPage);
-
-   radeonUpdateDrawBuffer(rmesa->glCtx);
+#endif
 }
 
-
 /* ================================================================
  * Buffer clear
  */
 #define RADEON_MAX_CLEARS	256
 
-static void radeonClear( GLcontext *ctx, GLbitfield mask )
+static void radeonKernelClear(GLcontext *ctx, GLuint flags)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   drm_radeon_sarea_t *sarea = rmesa->sarea;
+     r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   drm_radeon_sarea_t *sarea = rmesa->radeon.sarea;
    uint32_t clear;
-   GLuint flags = 0;
-   GLuint color_mask = 0;
    GLint ret, i;
    GLint cx, cy, cw, ch;
 
-   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
-      fprintf( stderr, "radeonClear\n");
-   }
-
-   {
-      LOCK_HARDWARE( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      if ( dPriv->numClipRects == 0 ) 
-	 return;
-   }
-   
-   radeonFlush( ctx ); 
-
-   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
-      flags |= RADEON_FRONT;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_FRONT_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_BACK_LEFT ) {
-      flags |= RADEON_BACK;
-      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      mask &= ~BUFFER_BIT_BACK_LEFT;
-   }
-
-   if ( mask & BUFFER_BIT_DEPTH ) {
-      flags |= RADEON_DEPTH;
-      mask &= ~BUFFER_BIT_DEPTH;
-   }
-
-   if ( (mask & BUFFER_BIT_STENCIL) && rmesa->state.stencil.hwBuffer ) {
-      flags |= RADEON_STENCIL;
-      mask &= ~BUFFER_BIT_STENCIL;
-   }
-
-   if ( mask ) {
-      if (RADEON_DEBUG & DEBUG_FALLBACKS)
-	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
-      _swrast_Clear( ctx, mask );
-   }
-
-   if ( !flags ) 
-      return;
-
-   if (rmesa->using_hyperz) {
-      flags |= RADEON_USE_COMP_ZBUF;
-/*      if (rmesa->radeonScreen->chipset & RADEON_CHIPSET_TCL) 
-         flags |= RADEON_USE_HIERZ; */
-      if (!(rmesa->state.stencil.hwBuffer) ||
-	 ((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
-	    ((rmesa->state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
-	  flags |= RADEON_CLEAR_FASTZ;
-      }
-   }
-
-   LOCK_HARDWARE( rmesa );
+   LOCK_HARDWARE( &rmesa->radeon );
 
    /* compute region after locking: */
    cx = ctx->DrawBuffer->_Xmin;
@@ -1112,7 +474,7 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
 
       gp.param = RADEON_PARAM_LAST_CLEAR;
       gp.value = (int *)&clear;
-      ret = drmCommandWriteRead( rmesa->dri.fd,
+      ret = drmCommandWriteRead( rmesa->radeon.dri.fd,
 				 DRM_RADEON_GETPARAM, &gp, sizeof(gp) );
 
       if ( ret ) {
@@ -1124,20 +486,20 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
 	 break;
       }
 
-      if ( rmesa->do_usleeps ) {
-	 UNLOCK_HARDWARE( rmesa );
+      if ( rmesa->radeon.do_usleeps ) {
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 DO_USLEEP( 1 );
-	 LOCK_HARDWARE( rmesa );
+	 LOCK_HARDWARE( &rmesa->radeon );
       }
    }
 
    /* Send current state to the hardware */
-   radeonFlushCmdBufLocked( rmesa, __FUNCTION__ );
+   rcommonFlushCmdBufLocked( &rmesa->radeon, __FUNCTION__ );
 
    for ( i = 0 ; i < dPriv->numClipRects ; ) {
       GLint nr = MIN2( i + RADEON_NR_SAREA_CLIPRECTS, dPriv->numClipRects );
       drm_clip_rect_t *box = dPriv->pClipRects;
-      drm_clip_rect_t *b = rmesa->sarea->boxes;
+      drm_clip_rect_t *b = rmesa->radeon.sarea->boxes;
       drm_radeon_clear_t clear;
       drm_radeon_clear_rect_t depth_boxes[RADEON_NR_SAREA_CLIPRECTS];
       GLint n = 0;
@@ -1172,105 +534,107 @@ static void radeonClear( GLcontext *ctx, GLbitfield mask )
 	 }
       }
 
-      rmesa->sarea->nbox = n;
+      rmesa->radeon.sarea->nbox = n;
 
       clear.flags       = flags;
-      clear.clear_color = rmesa->state.color.clear;
-      clear.clear_depth = rmesa->state.depth.clear;
+      clear.clear_color = rmesa->radeon.state.color.clear;
+      clear.clear_depth = rmesa->radeon.state.depth.clear;
       clear.color_mask  = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
-      clear.depth_mask  = rmesa->state.stencil.clear;
+      clear.depth_mask  = rmesa->radeon.state.stencil.clear;
       clear.depth_boxes = depth_boxes;
 
       n--;
-      b = rmesa->sarea->boxes;
+      b = rmesa->radeon.sarea->boxes;
       for ( ; n >= 0 ; n-- ) {
 	 depth_boxes[n].f[CLEAR_X1] = (float)b[n].x1;
 	 depth_boxes[n].f[CLEAR_Y1] = (float)b[n].y1;
 	 depth_boxes[n].f[CLEAR_X2] = (float)b[n].x2;
 	 depth_boxes[n].f[CLEAR_Y2] = (float)b[n].y2;
-	 depth_boxes[n].f[CLEAR_DEPTH] = 
-	    (float)rmesa->state.depth.clear;
+	 depth_boxes[n].f[CLEAR_DEPTH] =
+	    (float)rmesa->radeon.state.depth.clear;
       }
 
-      ret = drmCommandWrite( rmesa->dri.fd, DRM_RADEON_CLEAR,
+      ret = drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_CLEAR,
 			     &clear, sizeof(drm_radeon_clear_t));
 
       if ( ret ) {
-	 UNLOCK_HARDWARE( rmesa );
+	 UNLOCK_HARDWARE( &rmesa->radeon );
 	 fprintf( stderr, "DRM_RADEON_CLEAR: return = %d\n", ret );
 	 exit( 1 );
       }
    }
-
-   UNLOCK_HARDWARE( rmesa );
-   rmesa->hw.all_dirty = GL_TRUE;
+   UNLOCK_HARDWARE( &rmesa->radeon );
 }
 
-
-void radeonWaitForIdleLocked( radeonContextPtr rmesa )
+static void radeonClear( GLcontext *ctx, GLbitfield mask )
 {
-    int fd = rmesa->dri.fd;
-    int to = 0;
-    int ret, i = 0;
-
-    rmesa->c_drawWaits++;
-
-    do {
-        do {
-            ret = drmCommandNone( fd, DRM_RADEON_CP_IDLE);
-        } while ( ret && errno == EBUSY && i++ < RADEON_IDLE_RETRY );
-    } while ( ( ret == -EBUSY ) && ( to++ < RADEON_TIMEOUT ) );
-
-    if ( ret < 0 ) {
-	UNLOCK_HARDWARE( rmesa );
-	fprintf( stderr, "Error: Radeon timed out... exiting\n" );
-	exit( -1 );
-    }
-}
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLuint flags = 0;
+   GLuint color_mask = 0;
+   GLuint orig_mask = mask;
 
+   if ( RADEON_DEBUG & DEBUG_IOCTL ) {
+      fprintf( stderr, "radeonClear\n");
+   }
 
-static void radeonWaitForIdle( radeonContextPtr rmesa )
-{
-   LOCK_HARDWARE(rmesa);
-   radeonWaitForIdleLocked( rmesa );
-   UNLOCK_HARDWARE(rmesa);
-}
+   {
+      LOCK_HARDWARE( &rmesa->radeon );
+      UNLOCK_HARDWARE( &rmesa->radeon );
+      if ( dPriv->numClipRects == 0 )
+	 return;
+   }
 
+   radeon_firevertices(&rmesa->radeon);
 
-void radeonFlush( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   if ( mask & BUFFER_BIT_FRONT_LEFT ) {
+      flags |= RADEON_FRONT;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_FRONT_LEFT;
+   }
 
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
+   if ( mask & BUFFER_BIT_BACK_LEFT ) {
+      flags |= RADEON_BACK;
+      color_mask = rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK];
+      mask &= ~BUFFER_BIT_BACK_LEFT;
+   }
 
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
+   if ( mask & BUFFER_BIT_DEPTH ) {
+      flags |= RADEON_DEPTH;
+      mask &= ~BUFFER_BIT_DEPTH;
+   }
 
-   radeonEmitState( rmesa );
-   
-   if (rmesa->store.cmd_used)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-}
+   if ( (mask & BUFFER_BIT_STENCIL) ) {
+      flags |= RADEON_STENCIL;
+      mask &= ~BUFFER_BIT_STENCIL;
+   }
 
-/* Make sure all commands have been sent to the hardware and have
- * completed processing.
- */
-void radeonFinish( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   radeonFlush( ctx );
-
-   if (rmesa->do_irqs) {
-      LOCK_HARDWARE( rmesa );
-      radeonEmitIrqLocked( rmesa );
-      UNLOCK_HARDWARE( rmesa );
-      radeonWaitIrq( rmesa );
+   if ( mask ) {
+      if (RADEON_DEBUG & DEBUG_FALLBACKS)
+	 fprintf(stderr, "%s: swrast clear, mask: %x\n", __FUNCTION__, mask);
+      _swrast_Clear( ctx, mask );
    }
-   else
-      radeonWaitForIdle( rmesa );
-}
 
+   if ( !flags )
+      return;
+
+   if (rmesa->using_hyperz) {
+      flags |= RADEON_USE_COMP_ZBUF;
+/*      if (rmesa->radeon.radeonScreen->chipset & RADEON_CHIPSET_TCL)
+         flags |= RADEON_USE_HIERZ; */
+      if (((flags & RADEON_DEPTH) && (flags & RADEON_STENCIL) &&
+	    ((rmesa->radeon.state.stencil.clear & RADEON_STENCIL_WRITE_MASK) == RADEON_STENCIL_WRITE_MASK))) {
+	  flags |= RADEON_CLEAR_FASTZ;
+      }
+   }
+
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     radeonUserClear(ctx, orig_mask);
+   else {
+      radeonKernelClear(ctx, flags);
+      rmesa->radeon.hw.all_dirty = GL_TRUE;
+   }
+}
 
 void radeonInitIoctlFuncs( GLcontext *ctx )
 {
diff --git a/src/mesa/drivers/dri/radeon/radeon_ioctl.h b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
index 4e3a44df075..18805d4c571 100644
--- a/src/mesa/drivers/dri/radeon/radeon_ioctl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_ioctl.h
@@ -38,31 +38,32 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/simple_list.h"
 #include "radeon_lock.h"
+#include "radeon_bocs_wrapper.h"
 
-
-extern void radeonEmitState( radeonContextPtr rmesa );
-extern void radeonEmitVertexAOS( radeonContextPtr rmesa,
+extern void radeonEmitVertexAOS( r100ContextPtr rmesa,
 				 GLuint vertex_size,
+				 struct radeon_bo *bo,
 				 GLuint offset );
 
-extern void radeonEmitVbufPrim( radeonContextPtr rmesa,
+extern void radeonEmitVbufPrim( r100ContextPtr rmesa,
 				GLuint vertex_format,
 				GLuint primitive,
 				GLuint vertex_nr );
 
-extern void radeonFlushElts( radeonContextPtr rmesa );
+extern void radeonFlushElts( GLcontext *ctx );
+			    
 
-extern GLushort *radeonAllocEltsOpenEnded( radeonContextPtr rmesa,
+extern GLushort *radeonAllocEltsOpenEnded( r100ContextPtr rmesa,
 					   GLuint vertex_format,
 					   GLuint primitive,
 					   GLuint min_nr );
 
-extern void radeonEmitAOS( radeonContextPtr rmesa,
-			   struct radeon_dma_region **regions,
+
+extern void radeonEmitAOS( r100ContextPtr rmesa,
 			   GLuint n,
 			   GLuint offset );
 
-extern void radeonEmitBlit( radeonContextPtr rmesa,
+extern void radeonEmitBlit( r100ContextPtr rmesa,
 			    GLuint color_fmt,
 			    GLuint src_pitch,
 			    GLuint src_offset,
@@ -72,30 +73,15 @@ extern void radeonEmitBlit( radeonContextPtr rmesa,
 			    GLint dstx, GLint dsty,
 			    GLuint w, GLuint h );
 
-extern void radeonEmitWait( radeonContextPtr rmesa, GLuint flags );
-
-extern void radeonFlushCmdBuf( radeonContextPtr rmesa, const char * );
-extern void radeonRefillCurrentDmaRegion( radeonContextPtr rmesa );
+extern void radeonEmitWait( r100ContextPtr rmesa, GLuint flags );
 
-extern void radeonAllocDmaRegion( radeonContextPtr rmesa,
-				  struct radeon_dma_region *region,
-				  int bytes, 
-				  int alignment );
+extern void radeonFlushCmdBuf( r100ContextPtr rmesa, const char * );
 
-extern void radeonReleaseDmaRegion( radeonContextPtr rmesa,
-				    struct radeon_dma_region *region,
-				    const char *caller );
-
-extern void radeonCopyBuffer( __DRIdrawablePrivate *drawable,
-			      const drm_clip_rect_t	 *rect);
-extern void radeonPageFlip( __DRIdrawablePrivate *drawable );
 extern void radeonFlush( GLcontext *ctx );
 extern void radeonFinish( GLcontext *ctx );
-extern void radeonWaitForIdleLocked( radeonContextPtr rmesa );
-extern void radeonWaitForVBlank( radeonContextPtr rmesa );
 extern void radeonInitIoctlFuncs( GLcontext *ctx );
-extern void radeonGetAllParams( radeonContextPtr rmesa );
-extern void radeonSetUpAtomList( radeonContextPtr rmesa );
+extern void radeonGetAllParams( r100ContextPtr rmesa );
+extern void radeonSetUpAtomList( r100ContextPtr rmesa );
 
 /* ================================================================
  * Helper macros:
@@ -105,33 +91,33 @@ extern void radeonSetUpAtomList( radeonContextPtr rmesa );
  */
 #define RADEON_NEWPRIM( rmesa )			\
 do {						\
-   if ( rmesa->dma.flush )			\
-      rmesa->dma.flush( rmesa );	\
+   if ( rmesa->radeon.dma.flush )			\
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	\
 } while (0)
 
 /* Can accomodate several state changes and primitive changes without
  * actually firing the buffer.
  */
+
 #define RADEON_STATECHANGE( rmesa, ATOM )			\
 do {								\
    RADEON_NEWPRIM( rmesa );					\
    rmesa->hw.ATOM.dirty = GL_TRUE;				\
-   rmesa->hw.is_dirty = GL_TRUE;				\
+   rmesa->radeon.hw.is_dirty = GL_TRUE;				\
 } while (0)
 
-#define RADEON_DB_STATE( ATOM )			        \
+#define RADEON_DB_STATE( ATOM )				\
    memcpy( rmesa->hw.ATOM.lastcmd, rmesa->hw.ATOM.cmd,	\
 	   rmesa->hw.ATOM.cmd_size * 4)
 
-static INLINE int RADEON_DB_STATECHANGE( 
-   radeonContextPtr rmesa,
-   struct radeon_state_atom *atom )
+static INLINE int RADEON_DB_STATECHANGE(r100ContextPtr rmesa,
+					struct radeon_state_atom *atom )
 {
    if (memcmp(atom->cmd, atom->lastcmd, atom->cmd_size*4)) {
-      int *tmp;
+      GLuint *tmp;
       RADEON_NEWPRIM( rmesa );
       atom->dirty = GL_TRUE;
-      rmesa->hw.is_dirty = GL_TRUE;
+      rmesa->radeon.hw.is_dirty = GL_TRUE;
       tmp = atom->cmd; 
       atom->cmd = atom->lastcmd;
       atom->lastcmd = tmp;
@@ -141,16 +127,6 @@ static INLINE int RADEON_DB_STATECHANGE(
       return 0;
 }
 
-
-/* Fire the buffered vertices no matter what.
- */
-#define RADEON_FIREVERTICES( rmesa )			\
-do {							\
-   if ( rmesa->store.cmd_used || rmesa->dma.flush ) {	\
-      radeonFlush( rmesa->glCtx );			\
-   }							\
-} while (0)
-
 /* Command lengths.  Note that any time you ensure ELTS_BUFSZ or VBUF_BUFSZ
  * are available, you will also be adding an rmesa->state.max_state_size because
  * r200EmitState is called from within r200EmitVbufPrim and r200FlushElts.
@@ -167,36 +143,37 @@ do {							\
 #define VBUF_BUFSZ	(4 * sizeof(int))
 #endif
 
-/* Ensure that a minimum amount of space is available in the command buffer.
- * This is used to ensure atomicity of state updates with the rendering requests
- * that rely on them.
- *
- * An alternative would be to implement a "soft lock" such that when the buffer
- * wraps at an inopportune time, we grab the lock, flush the current buffer,
- * and hang on to the lock until the critical section is finished and we flush
- * the buffer again and unlock.
- */
-static INLINE void radeonEnsureCmdBufSpace( radeonContextPtr rmesa,
-					      int bytes )
-{
-   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
-   assert( bytes <= RADEON_CMD_BUF_SZ );
-}
 
-/* Alloc space in the command buffer
- */
-static INLINE char *radeonAllocCmdBuf( radeonContextPtr rmesa,
-					 int bytes, const char *where )
+static inline uint32_t cmdpacket3(int cmd_type)
 {
-   if (rmesa->store.cmd_used + bytes > RADEON_CMD_BUF_SZ)
-      radeonFlushCmdBuf( rmesa, __FUNCTION__ );
+  drm_radeon_cmd_header_t cmd;
+
+  cmd.i = 0;
+  cmd.header.cmd_type = cmd_type;
+
+  return (uint32_t)cmd.i;
 
-   {
-      char *head = rmesa->store.cmd_buf + rmesa->store.cmd_used;
-      rmesa->store.cmd_used += bytes;
-      return head;
-   }
 }
 
+#define OUT_BATCH_PACKET3(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3));				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+#define OUT_BATCH_PACKET3_CLIP(packet, num_extra) do {	      \
+    if (!b_l_rmesa->radeonScreen->kernel_mm) {		      \
+      OUT_BATCH(cmdpacket3(RADEON_CMD_PACKET3_CLIP));	      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    } else {						      \
+      OUT_BATCH(CP_PACKET2);				      \
+      OUT_BATCH(CP_PACKET3((packet), (num_extra)));	      \
+    }							      \
+  } while(0)
+
+
 #endif /* __RADEON_IOCTL_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.c b/src/mesa/drivers/dri/radeon/radeon_lock.c
index 64bb3ca103f..5774f7ebcf7 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.c
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.c
@@ -41,30 +41,13 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "main/glheader.h"
 #include "main/mtypes.h"
-#include "radeon_context.h"
+#include "main/colormac.h"
+#include "dri_util.h"
+#include "radeon_screen.h"
+#include "radeon_common.h"
 #include "radeon_lock.h"
-#include "radeon_tex.h"
-#include "radeon_state.h"
-#include "radeon_ioctl.h"
-
 #include "drirenderbuffer.h"
 
-#if DEBUG_LOCKING
-char *prevLockFile = NULL;
-int prevLockLine = 0;
-#endif
-
-/* Turn on/off page flipping according to the flags in the sarea:
- */
-static void radeonUpdatePageFlipping(radeonContextPtr rmesa)
-{
-	rmesa->doPageFlip = rmesa->sarea->pfState;
-	if (rmesa->glCtx->WinSysDrawBuffer) {
-		driFlipRenderbuffers(rmesa->glCtx->WinSysDrawBuffer,
-				     rmesa->sarea->pfCurrentPage);
-	}
-}
-
 /* Update the hardware state.  This is called if another context has
  * grabbed the hardware lock, which includes the X server.  This
  * function also updates the driver's window state after the X server
@@ -75,10 +58,11 @@ static void radeonUpdatePageFlipping(radeonContextPtr rmesa)
  */
 void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 {
-	__DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-	__DRIdrawablePrivate *const readable = rmesa->dri.readable;
+	__DRIdrawablePrivate *const drawable = radeon_get_drawable(rmesa);
+	__DRIdrawablePrivate *const readable = radeon_get_readable(rmesa);
 	__DRIscreenPrivate *sPriv = rmesa->dri.screen;
-	drm_radeon_sarea_t *sarea = rmesa->sarea;
+
+	assert(drawable != NULL);
 
 	drmGetLock(rmesa->dri.fd, rmesa->dri.hwContext, flags);
 
@@ -96,29 +80,42 @@ void radeonGetLock(radeonContextPtr rmesa, GLuint flags)
 	}
 
 	if (rmesa->lastStamp != drawable->lastStamp) {
-		radeonUpdatePageFlipping(rmesa);
-		radeonSetCliprects(rmesa);
-		radeonUpdateViewportOffset(rmesa->glCtx);
-		driUpdateFramebufferSize(rmesa->glCtx, drawable);
+		radeon_window_moved(rmesa);
+		rmesa->lastStamp = drawable->lastStamp;
 	}
 
-	RADEON_STATECHANGE(rmesa, ctx);
-	if (rmesa->sarea->tiling_enabled) {
-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |=
-		    RADEON_COLOR_TILE_ENABLE;
-	} else {
-		rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] &=
-		    ~RADEON_COLOR_TILE_ENABLE;
-	}
+	rmesa->vtbl.get_lock(rmesa);
+
+	rmesa->lost_context = GL_TRUE;
+}
 
-	if (sarea->ctx_owner != rmesa->dri.hwContext) {
-		int i;
-		sarea->ctx_owner = rmesa->dri.hwContext;
+void radeon_lock_hardware(radeonContextPtr radeon)
+{
+	char ret = 0;
+	struct radeon_framebuffer *rfb = NULL;
+	struct radeon_renderbuffer *rrb = NULL;
 
-		for (i = 0; i < rmesa->nr_heaps; i++) {
-			DRI_AGE_TEXTURES(rmesa->texture_heaps[i]);
-		}
+	if (radeon_get_drawable(radeon)) {
+		rfb = radeon_get_drawable(radeon)->driverPrivate;
+
+		if (rfb)
+			rrb = radeon_get_renderbuffer(&rfb->base,
+						      rfb->base._ColorDrawBufferIndexes[0]);
 	}
 
-	rmesa->lost_context = GL_TRUE;
+	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+		DRM_CAS(radeon->dri.hwLock, radeon->dri.hwContext,
+			 (DRM_LOCK_HELD | radeon->dri.hwContext), ret );
+		if (ret)
+			radeonGetLock(radeon, 0);
+	}
+}
+
+void radeon_unlock_hardware(radeonContextPtr radeon)
+{
+	if (!radeon->radeonScreen->driScreen->dri2.enabled) {
+		DRM_UNLOCK( radeon->dri.fd,
+			    radeon->dri.hwLock,
+			    radeon->dri.hwContext );
+	}
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_lock.h b/src/mesa/drivers/dri/radeon/radeon_lock.h
index 86e96aa7d2c..2817709eed6 100644
--- a/src/mesa/drivers/dri/radeon/radeon_lock.h
+++ b/src/mesa/drivers/dri/radeon/radeon_lock.h
@@ -39,74 +39,21 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  *   Kevin E. Martin <[email protected]>
  */
 
-#ifndef __RADEON_LOCK_H__
-#define __RADEON_LOCK_H__
+#ifndef COMMON_LOCK_H
+#define COMMON_LOCK_H
 
-extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
-
-/* Turn DEBUG_LOCKING on to find locking conflicts.
- */
-#define DEBUG_LOCKING	0
-
-#if DEBUG_LOCKING
-extern char *prevLockFile;
-extern int prevLockLine;
-
-#define DEBUG_LOCK()							\
-   do {									\
-      prevLockFile = (__FILE__);					\
-      prevLockLine = (__LINE__);					\
-   } while (0)
-
-#define DEBUG_RESET()							\
-   do {									\
-      prevLockFile = 0;							\
-      prevLockLine = 0;							\
-   } while (0)
-
-#define DEBUG_CHECK_LOCK()						\
-   do {									\
-      if ( prevLockFile ) {						\
-	 fprintf( stderr,						\
-		  "LOCK SET!\n\tPrevious %s:%d\n\tCurrent: %s:%d\n",	\
-		  prevLockFile, prevLockLine, __FILE__, __LINE__ );	\
-	 exit( 1 );							\
-      }									\
-   } while (0)
-
-#else
+#include "main/colormac.h"
+#include "radeon_screen.h"
+#include "radeon_common.h"
 
-#define DEBUG_LOCK()
-#define DEBUG_RESET()
-#define DEBUG_CHECK_LOCK()
+extern void radeonGetLock(radeonContextPtr rmesa, GLuint flags);
 
-#endif
-
-/*
- * !!! We may want to separate locks from locks with validation.  This
- * could be used to improve performance for those things commands that
- * do not do any drawing !!!
- */
+void radeon_lock_hardware(radeonContextPtr rmesa);
+void radeon_unlock_hardware(radeonContextPtr rmesa);
 
 /* Lock the hardware and validate our state.
  */
-#define LOCK_HARDWARE( rmesa )					\
-   do {								\
-      char __ret = 0;						\
-      DEBUG_CHECK_LOCK();					\
-      DRM_CAS( (rmesa)->dri.hwLock, (rmesa)->dri.hwContext,		\
-	       (DRM_LOCK_HELD | (rmesa)->dri.hwContext), __ret );	\
-      if ( __ret )						\
-	 radeonGetLock( (rmesa), 0 );				\
-      DEBUG_LOCK();						\
-   } while (0)
-
-#define UNLOCK_HARDWARE( rmesa )					\
-   do {									\
-      DRM_UNLOCK( (rmesa)->dri.fd,					\
-		  (rmesa)->dri.hwLock,					\
-		  (rmesa)->dri.hwContext );				\
-      DEBUG_RESET();							\
-   } while (0)
+#define LOCK_HARDWARE( rmesa )	radeon_lock_hardware(rmesa)
+#define UNLOCK_HARDWARE( rmesa )  radeon_unlock_hardware(rmesa)
 
-#endif				/* __RADEON_LOCK_H__ */
+#endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos.h b/src/mesa/drivers/dri/radeon/radeon_maos.h
index b8935e84a05..b88eb198d57 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos.h
+++ b/src/mesa/drivers/dri/radeon/radeon_maos.h
@@ -38,6 +38,5 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_context.h"
 
 extern void radeonEmitArrays( GLcontext *ctx, GLuint inputs );
-extern void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs );
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
index 31eea13f4ef..7c6ea0530e0 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_arrays.c
@@ -48,160 +48,35 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_maos.h"
 #include "radeon_tcl.h"
 
-#if 0
-/* Usage:
- *   - from radeon_tcl_render
- *   - call radeonEmitArrays to ensure uptodate arrays in dma
- *   - emit primitives (new type?) which reference the data
- *       -- need to use elts for lineloop, quads, quadstrip/flat
- *       -- other primitives are all well-formed (need tristrip-1,fake-poly)
- *
- */
-static void emit_ubyte_rgba3( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
+static void emit_vecfog(GLcontext *ctx, struct radeon_aos *aos,
+			GLvoid *data, int stride, int count)
 {
    int i;
-   radeon_color_t *out = (radeon_color_t *)(rvb->start + rvb->address);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p\n",
-	      __FUNCTION__, count, stride, (void *)out);
-
-   for (i = 0; i < count; i++) {
-      out->red   = *data;
-      out->green = *(data+1);
-      out->blue  = *(data+2);
-      out->alpha = 0xFF;
-      out++;
-      data += stride;
-   }
-}
-
-static void emit_ubyte_rgba4( GLcontext *ctx,
-			      struct radeon_dma_region *rvb,
-			      char *data,
-			      int stride,
-			      int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
+   uint32_t *out;
+   int size = 1;
+   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
 
    if (RADEON_DEBUG & DEBUG_VERTS)
       fprintf(stderr, "%s count %d stride %d\n",
 	      __FUNCTION__, count, stride);
 
-   if (stride == 4)
-       COPY_DWORDS( out, data, count );
-   else
-      for (i = 0; i < count; i++) {
-	 *out++ = LE32_TO_CPU(*(int *)data);
-	 data += stride;
-      }
-}
-
-
-static void emit_ubyte_rgba( GLcontext *ctx,
-			     struct radeon_dma_region *rvb,
-			     char *data,
-			     int size,
-			     int stride,
-			     int count )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
-
-   assert (!rvb->buf);
-
    if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, 4, 4 );
+      radeonAllocDmaRegion( rmesa, &aos->bo, &aos->offset, size * 4, 32 );
       count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
+      aos->stride = 0;
    }
    else {
-      radeonAllocDmaRegion( rmesa, rvb, 4 * count, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
+      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, size * 4, 32);
+      aos->stride = size;
    }
 
-   /* Emit the data
-    */
-   switch (size) {
-   case 3:
-      emit_ubyte_rgba3( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_ubyte_rgba4( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-}
-#endif
-
-#if defined(USE_X86_ASM)
-#define COPY_DWORDS( dst, src, nr )					\
-do {									\
-	int __tmp;							\
-	__asm__ __volatile__( "rep ; movsl"				\
-			      : "=%c" (__tmp), "=D" (dst), "=S" (__tmp)	\
-			      : "0" (nr),				\
-			        "D" ((long)dst),			\
-			        "S" ((long)src) );			\
-} while (0)
-#else
-#define COPY_DWORDS( dst, src, nr )		\
-do {						\
-   int j;					\
-   for ( j = 0 ; j < nr ; j++ )			\
-      dst[j] = ((int *)src)[j];			\
-   dst += nr;					\
-} while (0)
-#endif
-
-static void emit_vecfog( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
-{
-   int i;
-   GLfloat *out;
+   aos->components = size;
+   aos->count = count;
 
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = 1;
-   }
-   else {
-      radeonAllocDmaRegion( rmesa, rvb, count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 1;
-      rvb->aos_size = 1;
-   }
 
    /* Emit the data
     */
-   out = (GLfloat *)(rvb->address + rvb->start);
+   out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
    for (i = 0; i < count; i++) {
       out[0] = radeonComputeFogBlendFactor( ctx, *(GLfloat *)data );
       out++;
@@ -209,169 +84,9 @@ static void emit_vecfog( GLcontext *ctx,
    }
 }
 
-static void emit_vec4( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 4)
-      COPY_DWORDS( out, data, count );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out++;
-	 data += stride;
-      }
-}
-
-
-static void emit_vec8( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 8)
-      COPY_DWORDS( out, data, count*2 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out += 2;
-	 data += stride;
-      }
-}
-
-static void emit_vec12( GLcontext *ctx,
-		       struct radeon_dma_region *rvb,
-		       char *data,
-		       int stride,
-		       int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d out %p data %p\n",
-	      __FUNCTION__, count, stride, (void *)out, (void *)data);
-
-   if (stride == 12)
-      COPY_DWORDS( out, data, count*3 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out += 3;
-	 data += stride;
-      }
-}
-
-static void emit_vec16( GLcontext *ctx,
-			struct radeon_dma_region *rvb,
-			char *data,
-			int stride,
-			int count )
-{
-   int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d stride %d\n",
-	      __FUNCTION__, count, stride);
-
-   if (stride == 16)
-      COPY_DWORDS( out, data, count*4 );
-   else
-      for (i = 0; i < count; i++) {
-	 out[0] = *(int *)data;
-	 out[1] = *(int *)(data+4);
-	 out[2] = *(int *)(data+8);
-	 out[3] = *(int *)(data+12);
-	 out += 4;
-	 data += stride;
-      }
-}
-
-
-static void emit_vector( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int size,
-			 int stride,
-			 int count )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (RADEON_DEBUG & DEBUG_VERTS)
-      fprintf(stderr, "%s count %d size %d stride %d\n",
-	      __FUNCTION__, count, size, stride);
-
-   assert (!rvb->buf);
-
-   if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, size * 4, 4 );
-      count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = size;
-   }
-   else {
-      radeonAllocDmaRegion( rmesa, rvb, size * count * 4, 4 );	/* alignment? */
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = size;
-      rvb->aos_size = size;
-   }
-
-   /* Emit the data
-    */
-   switch (size) {
-   case 1:
-      emit_vec4( ctx, rvb, data, stride, count );
-      break;
-   case 2:
-      emit_vec8( ctx, rvb, data, stride, count );
-      break;
-   case 3:
-      emit_vec12( ctx, rvb, data, stride, count );
-      break;
-   case 4:
-      emit_vec16( ctx, rvb, data, stride, count );
-      break;
-   default:
-      assert(0);
-      exit(1);
-      break;
-   }
-
-}
-
-
-
-static void emit_s0_vec( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
+static void emit_s0_vec(uint32_t *out, GLvoid *data, int stride, int count)
 {
    int i;
-   int *out = (int *)(rvb->address + rvb->start);
-
    if (RADEON_DEBUG & DEBUG_VERTS)
       fprintf(stderr, "%s count %d stride %d\n",
 	      __FUNCTION__, count, stride);
@@ -384,14 +99,9 @@ static void emit_s0_vec( GLcontext *ctx,
    }
 }
 
-static void emit_stq_vec( GLcontext *ctx,
-			 struct radeon_dma_region *rvb,
-			 char *data,
-			 int stride,
-			 int count )
+static void emit_stq_vec(uint32_t *out, GLvoid *data, int stride, int count)
 {
    int i;
-   int *out = (int *)(rvb->address + rvb->start);
 
    if (RADEON_DEBUG & DEBUG_VERTS)
       fprintf(stderr, "%s count %d stride %d\n",
@@ -409,21 +119,16 @@ static void emit_stq_vec( GLcontext *ctx,
 
 
 
-static void emit_tex_vector( GLcontext *ctx,
-			     struct radeon_dma_region *rvb,
-			     char *data,
-			     int size,
-			     int stride,
-			     int count )
+static void emit_tex_vector(GLcontext *ctx, struct radeon_aos *aos,
+			    GLvoid *data, int size, int stride, int count)
 {
    radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
    int emitsize;
+   uint32_t *out;
 
    if (RADEON_DEBUG & DEBUG_VERTS)
       fprintf(stderr, "%s %d/%d\n", __FUNCTION__, count, size);
 
-   assert (!rvb->buf);
-
    switch (size) {
    case 4: emitsize = 3; break;
    case 3: emitsize = 3; break;
@@ -432,34 +137,33 @@ static void emit_tex_vector( GLcontext *ctx,
 
 
    if (stride == 0) {
-      radeonAllocDmaRegion( rmesa, rvb, 4 * emitsize, 4 );
+      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, emitsize * 4, 32);
       count = 1;
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = 0;
-      rvb->aos_size = emitsize;
+      aos->stride = 0;
    }
    else {
-      radeonAllocDmaRegion( rmesa, rvb, 4 * emitsize * count, 4 );
-      rvb->aos_start = GET_START(rvb);
-      rvb->aos_stride = emitsize;
-      rvb->aos_size = emitsize;
+      radeonAllocDmaRegion(rmesa, &aos->bo, &aos->offset, emitsize * count * 4, 32);
+      aos->stride = emitsize;
    }
 
+   aos->components = emitsize;
+   aos->count = count;
 
    /* Emit the data
     */
+   out = (uint32_t*)((char*)aos->bo->ptr + aos->offset);
    switch (size) {
    case 1:
-      emit_s0_vec( ctx, rvb, data, stride, count ); 
+      emit_s0_vec( out, data, stride, count );
       break;
    case 2:
-      emit_vec8( ctx, rvb, data, stride, count );
+      radeonEmitVec8( out, data, stride, count );
       break;
    case 3:
-      emit_vec12( ctx, rvb, data, stride, count );
+      radeonEmitVec12( out, data, stride, count );
       break;
    case 4:
-      emit_stq_vec( ctx, rvb, data, stride, count );
+      emit_stq_vec( out, data, stride, count );
       break;
    default:
       assert(0);
@@ -476,9 +180,8 @@ static void emit_tex_vector( GLcontext *ctx,
  */
 void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    struct vertex_buffer *VB = &TNL_CONTEXT( ctx )->vb;
-   struct radeon_dma_region **component = rmesa->tcl.aos_components;
    GLuint nr = 0;
    GLuint vfmt = 0;
    GLuint count = VB->Count;
@@ -491,12 +194,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
    if (1) {
       if (!rmesa->tcl.obj.buf) 
-	 emit_vector( ctx, 
-		      &rmesa->tcl.obj, 
-		      (char *)VB->ObjPtr->data,
-		      VB->ObjPtr->size,
-		      VB->ObjPtr->stride,
-		      count);
+	rcommon_emit_vector( ctx, 
+			     &(rmesa->tcl.aos[nr]),
+			     (char *)VB->ObjPtr->data,
+			     VB->ObjPtr->size,
+			     VB->ObjPtr->stride,
+			     count);
 
       switch( VB->ObjPtr->size ) {
       case 4: vfmt |= RADEON_CP_VC_FRMT_W0;
@@ -505,21 +208,21 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       default:
          break;
       }
-      component[nr++] = &rmesa->tcl.obj;
+      nr++;
    }
    
 
    if (inputs & VERT_BIT_NORMAL) {
       if (!rmesa->tcl.norm.buf)
-	 emit_vector( ctx, 
-		      &(rmesa->tcl.norm), 
-		      (char *)VB->NormalPtr->data,
-		      3,
-		      VB->NormalPtr->stride,
-		      count);
+	 rcommon_emit_vector( ctx, 
+			      &(rmesa->tcl.aos[nr]),
+			      (char *)VB->NormalPtr->data,
+			      3,
+			      VB->NormalPtr->stride,
+			      count);
 
       vfmt |= RADEON_CP_VC_FRMT_N0;
-      component[nr++] = &rmesa->tcl.norm;
+      nr++;
    }
 
    if (inputs & VERT_BIT_COLOR0) {
@@ -537,31 +240,30 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       }
 
       if (!rmesa->tcl.rgba.buf)
-	 emit_vector( ctx,
-		      &(rmesa->tcl.rgba),
-		      (char *)VB->ColorPtr[0]->data,
-		      emitsize,
-		      VB->ColorPtr[0]->stride,
-		      count);
-
-
-      component[nr++] = &rmesa->tcl.rgba;
+	rcommon_emit_vector( ctx,
+			     &(rmesa->tcl.aos[nr]),
+			     (char *)VB->ColorPtr[0]->data,
+			     emitsize,
+			     VB->ColorPtr[0]->stride,
+			     count);
+
+      nr++;
    }
 
 
    if (inputs & VERT_BIT_COLOR1) {
       if (!rmesa->tcl.spec.buf) {
 
-	 emit_vector( ctx,
-		      &rmesa->tcl.spec,
-		      (char *)VB->SecondaryColorPtr[0]->data,
-		      3,
-		      VB->SecondaryColorPtr[0]->stride,
-		      count);
+	rcommon_emit_vector( ctx,
+			     &(rmesa->tcl.aos[nr]),
+			     (char *)VB->SecondaryColorPtr[0]->data,
+			     3,
+			     VB->SecondaryColorPtr[0]->stride,
+			     count);
       }
 
       vfmt |= RADEON_CP_VC_FRMT_FPSPEC;
-      component[nr++] = &rmesa->tcl.spec;
+      nr++;
    }
 
 /* FIXME: not sure if this is correct. May need to stitch this together with
@@ -570,13 +272,13 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
    if (inputs & VERT_BIT_FOG) {
       if (!rmesa->tcl.fog.buf)
 	 emit_vecfog( ctx,
-		      &(rmesa->tcl.fog),
+		      &(rmesa->tcl.aos[nr]),
 		      (char *)VB->FogCoordPtr->data,
 		      VB->FogCoordPtr->stride,
 		      count);
 
       vfmt |= RADEON_CP_VC_FRMT_FPFOG;
-      component[nr++] = &rmesa->tcl.fog;
+      nr++;
    }
 
 
@@ -587,11 +289,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
       if (inputs & VERT_BIT_TEX(unit)) {
 	 if (!rmesa->tcl.tex[unit].buf)
 	    emit_tex_vector( ctx,
-			     &(rmesa->tcl.tex[unit]),
+			     &(rmesa->tcl.aos[nr]),
 			     (char *)VB->TexCoordPtr[unit]->data,
 			     VB->TexCoordPtr[unit]->size,
 			     VB->TexCoordPtr[unit]->stride,
 			     count );
+	 nr++;
 
 	 vfmt |= RADEON_ST_BIT(unit);
          /* assume we need the 3rd coord if texgen is active for r/q OR at least
@@ -609,7 +312,6 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 		 (swaptexmatcol != ((rmesa->TexMatColSwap >> unit) & 1)))
 	       radeonUploadTexMatrix( rmesa, unit, swaptexmatcol ) ;
 	 }
-	 component[nr++] = &rmesa->tcl.tex[unit];
       }
    }
 
@@ -622,34 +324,3 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
    rmesa->tcl.vertex_format = vfmt;
 }
 
-
-void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-   GLuint unit;
-
-#if 0
-   if (RADEON_DEBUG & DEBUG_VERTS) 
-      _tnl_print_vert_flags( __FUNCTION__, newinputs );
-#endif
-
-   if (newinputs & VERT_BIT_POS) 
-     radeonReleaseDmaRegion( rmesa, &rmesa->tcl.obj, __FUNCTION__ );
-
-   if (newinputs & VERT_BIT_NORMAL) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.norm, __FUNCTION__ );
-
-   if (newinputs & VERT_BIT_COLOR0) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.rgba, __FUNCTION__ );
-
-   if (newinputs & VERT_BIT_COLOR1) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.spec, __FUNCTION__ );
-      
-   if (newinputs & VERT_BIT_FOG)
-      radeonReleaseDmaRegion( rmesa, &rmesa->tcl.fog, __FUNCTION__ );
-
-   for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++) {
-      if (newinputs & VERT_BIT_TEX(unit))
-         radeonReleaseDmaRegion( rmesa, &rmesa->tcl.tex[unit], __FUNCTION__ );
-   }
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
index 126d0727c63..78ec1193026 100644
--- a/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
+++ b/src/mesa/drivers/dri/radeon/radeon_maos_verts.c
@@ -310,7 +310,7 @@ static void init_tcl_verts( void )
 
 void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    struct vertex_buffer *VB = &TNL_CONTEXT(ctx)->vb;
    GLuint req = 0;
    GLuint unit;
@@ -374,14 +374,15 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 	 break;
 
    if (rmesa->tcl.vertex_format == setup_tab[i].vertex_format &&
-       rmesa->tcl.indexed_verts.buf)
+       rmesa->radeon.tcl.aos[0].bo)
       return;
 
-   if (rmesa->tcl.indexed_verts.buf)
+   if (rmesa->radeon.tcl.aos[0].bo)
       radeonReleaseArrays( ctx, ~0 );
 
-   radeonAllocDmaRegion( rmesa,
-			 &rmesa->tcl.indexed_verts, 
+   radeonAllocDmaRegion( &rmesa->radeon,
+			 &rmesa->radeon.tcl.aos[0].bo,
+			 &rmesa->radeon.tcl.aos[0].offset,
 			 VB->Count * setup_tab[i].vertex_size * 4, 
 			 4);
 
@@ -421,29 +422,12 @@ void radeonEmitArrays( GLcontext *ctx, GLuint inputs )
 
 
    setup_tab[i].emit( ctx, 0, VB->Count, 
-		      rmesa->tcl.indexed_verts.address + 
-		      rmesa->tcl.indexed_verts.start );
+		      rmesa->radeon.tcl.aos[0].bo->ptr + rmesa->radeon.tcl.aos[0].offset);
 
+   //   rmesa->radeon.tcl.aos[0].size = setup_tab[i].vertex_size;
+   rmesa->radeon.tcl.aos[0].stride = setup_tab[i].vertex_size;
    rmesa->tcl.vertex_format = setup_tab[i].vertex_format;
-   rmesa->tcl.indexed_verts.aos_start = GET_START( &rmesa->tcl.indexed_verts );
-   rmesa->tcl.indexed_verts.aos_size = setup_tab[i].vertex_size;
-   rmesa->tcl.indexed_verts.aos_stride = setup_tab[i].vertex_size;
-
-   rmesa->tcl.aos_components[0] = &rmesa->tcl.indexed_verts;
-   rmesa->tcl.nr_aos_components = 1;
+   rmesa->radeon.tcl.aos_count = 1;
 }
 
 
-
-void radeonReleaseArrays( GLcontext *ctx, GLuint newinputs )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
-
-#if 0
-   if (RADEON_DEBUG & DEBUG_VERTS) 
-      _tnl_print_vert_flags( __FUNCTION__, newinputs );
-#endif
-
-   if (newinputs) 
-     radeonReleaseDmaRegion( rmesa, &rmesa->tcl.indexed_verts, __FUNCTION__ );
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
new file mode 100644
index 00000000000..f04a07fecd2
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.c
@@ -0,0 +1,423 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "radeon_mipmap_tree.h"
+
+#include <errno.h>
+#include <unistd.h>
+
+#include "main/simple_list.h"
+#include "main/texcompress.h"
+#include "main/texformat.h"
+
+static GLuint radeon_compressed_texture_size(GLcontext *ctx,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLuint mesaFormat)
+{
+	GLuint size = _mesa_compressed_texture_size(ctx, width, height, depth, mesaFormat);
+
+	if (mesaFormat == MESA_FORMAT_RGB_DXT1 ||
+	    mesaFormat == MESA_FORMAT_RGBA_DXT1) {
+		if (width + 3 < 8)	/* width one block */
+			size = size * 4;
+		else if (width + 3 < 16)
+			size = size * 2;
+	} else {
+		/* DXT3/5, 16 bytes per block */
+	  //		WARN_ONCE("DXT 3/5 suffers from multitexturing problems!\n");
+		if (width + 3 < 8)
+			size = size * 2;
+	}
+
+	return size;
+}
+
+
+static int radeon_compressed_num_bytes(GLuint mesaFormat)
+{
+   int bytes = 0;
+   switch(mesaFormat) {
+     
+   case MESA_FORMAT_RGB_FXT1:
+   case MESA_FORMAT_RGBA_FXT1:
+   case MESA_FORMAT_RGB_DXT1:
+   case MESA_FORMAT_RGBA_DXT1:
+     bytes = 2;
+     break;
+     
+   case MESA_FORMAT_RGBA_DXT3:
+   case MESA_FORMAT_RGBA_DXT5:
+     bytes = 4;
+   default:
+     break;
+   }
+   
+   return bytes;
+}
+
+/**
+ * Compute sizes and fill in offset and blit information for the given
+ * image (determined by \p face and \p level).
+ *
+ * \param curOffset points to the offset at which the image is to be stored
+ * and is updated by this function according to the size of the image.
+ */
+static void compute_tex_image_offset(radeonContextPtr rmesa, radeon_mipmap_tree *mt,
+	GLuint face, GLuint level, GLuint* curOffset)
+{
+	radeon_mipmap_level *lvl = &mt->levels[level];
+	uint32_t row_align = rmesa->texture_row_align - 1;
+
+	/* Find image size in bytes */
+	if (mt->compressed) {
+		/* TODO: Is this correct? Need test cases for compressed textures! */
+		lvl->rowstride = (lvl->width * mt->bpp + 63) & ~63;
+		lvl->size = radeon_compressed_texture_size(mt->radeon->glCtx,
+							   lvl->width, lvl->height, lvl->depth, mt->compressed);
+	} else if (mt->target == GL_TEXTURE_RECTANGLE_NV) {
+		lvl->rowstride = (lvl->width * mt->bpp + 63) & ~63;
+		lvl->size = lvl->rowstride * lvl->height;
+	} else if (mt->tilebits & RADEON_TXO_MICRO_TILE) {
+		/* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
+		 * though the actual offset may be different (if texture is less than
+		 * 32 bytes width) to the untiled case */
+		lvl->rowstride = (lvl->width * mt->bpp * 2 + 31) & ~31;
+		lvl->size = lvl->rowstride * ((lvl->height + 1) / 2) * lvl->depth;
+	} else {
+		lvl->rowstride = (lvl->width * mt->bpp + row_align) & ~row_align;
+		lvl->size = lvl->rowstride * lvl->height * lvl->depth;
+	}
+	assert(lvl->size > 0);
+
+	/* All images are aligned to a 32-byte offset */
+	*curOffset = (*curOffset + 0x1f) & ~0x1f;
+	lvl->faces[face].offset = *curOffset;
+	*curOffset += lvl->size;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+	  fprintf(stderr,
+		  "level %d, face %d: rs:%d %dx%d at %d\n",
+		  level, face, lvl->rowstride, lvl->width, lvl->height, lvl->faces[face].offset);
+}
+
+static GLuint minify(GLuint size, GLuint levels)
+{
+	size = size >> levels;
+	if (size < 1)
+		size = 1;
+	return size;
+}
+
+
+static void calculate_miptree_layout_r100(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
+{
+	GLuint curOffset;
+	GLuint numLevels;
+	GLuint i;
+	GLuint face;
+
+	numLevels = mt->lastLevel - mt->firstLevel + 1;
+	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
+
+	curOffset = 0;
+	for(face = 0; face < mt->faces; face++) {
+
+		for(i = 0; i < numLevels; i++) {
+			mt->levels[i].width = minify(mt->width0, i);
+			mt->levels[i].height = minify(mt->height0, i);
+			mt->levels[i].depth = minify(mt->depth0, i);
+			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
+		}
+	}
+
+	/* Note the required size in memory */
+	mt->totalsize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+}
+
+static void calculate_miptree_layout_r300(radeonContextPtr rmesa, radeon_mipmap_tree *mt)
+{
+	GLuint curOffset;
+	GLuint numLevels;
+	GLuint i;
+
+	numLevels = mt->lastLevel - mt->firstLevel + 1;
+	assert(numLevels <= rmesa->glCtx->Const.MaxTextureLevels);
+
+	curOffset = 0;
+	for(i = 0; i < numLevels; i++) {
+		GLuint face;
+
+		mt->levels[i].width = minify(mt->width0, i);
+		mt->levels[i].height = minify(mt->height0, i);
+		mt->levels[i].depth = minify(mt->depth0, i);
+
+		for(face = 0; face < mt->faces; face++)
+			compute_tex_image_offset(rmesa, mt, face, i, &curOffset);
+	}
+
+	/* Note the required size in memory */
+	mt->totalsize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
+}
+
+/**
+ * Create a new mipmap tree, calculate its layout and allocate memory.
+ */
+radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
+		GLenum target, GLuint firstLevel, GLuint lastLevel,
+		GLuint width0, GLuint height0, GLuint depth0,
+		GLuint bpp, GLuint tilebits, GLuint compressed)
+{
+	radeon_mipmap_tree *mt = CALLOC_STRUCT(_radeon_mipmap_tree);
+
+	mt->radeon = rmesa;
+	mt->refcount = 1;
+	mt->t = t;
+	mt->target = target;
+	mt->faces = (target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+	mt->firstLevel = firstLevel;
+	mt->lastLevel = lastLevel;
+	mt->width0 = width0;
+	mt->height0 = height0;
+	mt->depth0 = depth0;
+	mt->bpp = compressed ? radeon_compressed_num_bytes(compressed) : bpp;
+	mt->tilebits = tilebits;
+	mt->compressed = compressed;
+
+	if (rmesa->radeonScreen->chip_family >= CHIP_FAMILY_R300)
+		calculate_miptree_layout_r300(rmesa, mt);
+	else
+		calculate_miptree_layout_r100(rmesa, mt);
+
+#ifdef RADEON_DEBUG_BO
+    mt->bo = radeon_bo_open(rmesa->radeonScreen->bom,
+                            0, mt->totalsize, 1024,
+                            RADEON_GEM_DOMAIN_VRAM,
+                            0,
+                            "MIPMAP TREE");
+#else
+	mt->bo = radeon_bo_open(rmesa->radeonScreen->bom,
+                            0, mt->totalsize, 1024,
+                            RADEON_GEM_DOMAIN_VRAM,
+                            0);
+#endif /* RADEON_DEBUG_BO */
+
+	return mt;
+}
+
+void radeon_miptree_reference(radeon_mipmap_tree *mt)
+{
+	mt->refcount++;
+	assert(mt->refcount > 0);
+}
+
+void radeon_miptree_unreference(radeon_mipmap_tree *mt)
+{
+	if (!mt)
+		return;
+
+	assert(mt->refcount > 0);
+	mt->refcount--;
+	if (!mt->refcount) {
+		radeon_bo_unref(mt->bo);
+		free(mt);
+	}
+}
+
+
+/**
+ * Calculate first and last mip levels for the given texture object,
+ * where the dimensions are taken from the given texture image at
+ * the given level.
+ *
+ * Note: level is the OpenGL level number, which is not necessarily the same
+ * as the first level that is actually present.
+ *
+ * The base level image of the given texture face must be non-null,
+ * or this will fail.
+ */
+static void calculate_first_last_level(struct gl_texture_object *tObj,
+				       GLuint *pfirstLevel, GLuint *plastLevel,
+				       GLuint face, GLuint level)
+{
+	const struct gl_texture_image * const baseImage =
+		tObj->Image[face][level];
+
+	assert(baseImage);
+	
+	/* These must be signed values.  MinLod and MaxLod can be negative numbers,
+	* and having firstLevel and lastLevel as signed prevents the need for
+	* extra sign checks.
+	*/
+	int   firstLevel;
+	int   lastLevel;
+
+	/* Yes, this looks overly complicated, but it's all needed.
+	*/
+	switch (tObj->Target) {
+	case GL_TEXTURE_1D:
+	case GL_TEXTURE_2D:
+	case GL_TEXTURE_3D:
+	case GL_TEXTURE_CUBE_MAP:
+		if (tObj->MinFilter == GL_NEAREST || tObj->MinFilter == GL_LINEAR) {
+			/* GL_NEAREST and GL_LINEAR only care about GL_TEXTURE_BASE_LEVEL.
+			*/
+			firstLevel = lastLevel = tObj->BaseLevel;
+		} else {
+			firstLevel = tObj->BaseLevel + (GLint)(tObj->MinLod + 0.5);
+			firstLevel = MAX2(firstLevel, tObj->BaseLevel);
+			firstLevel = MIN2(firstLevel, level + baseImage->MaxLog2);
+			lastLevel = tObj->BaseLevel + (GLint)(tObj->MaxLod + 0.5);
+			lastLevel = MAX2(lastLevel, tObj->BaseLevel);
+			lastLevel = MIN2(lastLevel, level + baseImage->MaxLog2);
+			lastLevel = MIN2(lastLevel, tObj->MaxLevel);
+			lastLevel = MAX2(firstLevel, lastLevel); /* need at least one level */
+		}
+		break;
+	case GL_TEXTURE_RECTANGLE_NV:
+	case GL_TEXTURE_4D_SGIS:
+		firstLevel = lastLevel = 0;
+		break;
+	default:
+		return;
+	}
+
+	/* save these values */
+	*pfirstLevel = firstLevel;
+	*plastLevel = lastLevel;
+}
+
+
+/**
+ * Checks whether the given miptree can hold the given texture image at the
+ * given face and level.
+ */
+GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
+		struct gl_texture_image *texImage, GLuint face, GLuint level)
+{
+	radeon_mipmap_level *lvl;
+
+	if (face >= mt->faces || level < mt->firstLevel || level > mt->lastLevel)
+		return GL_FALSE;
+
+	if (texImage->IsCompressed != mt->compressed)
+		return GL_FALSE;
+
+	if (!texImage->IsCompressed &&
+	    !mt->compressed &&
+	    texImage->TexFormat->TexelBytes != mt->bpp)
+		return GL_FALSE;
+
+	lvl = &mt->levels[level - mt->firstLevel];
+	if (lvl->width != texImage->Width ||
+	    lvl->height != texImage->Height ||
+	    lvl->depth != texImage->Depth)
+		return GL_FALSE;
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Checks whether the given miptree has the right format to store the given texture object.
+ */
+GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj)
+{
+	struct gl_texture_image *firstImage;
+	GLuint compressed;
+	GLuint numfaces = 1;
+	GLuint firstLevel, lastLevel;
+
+	calculate_first_last_level(texObj, &firstLevel, &lastLevel, 0, texObj->BaseLevel);
+	if (texObj->Target == GL_TEXTURE_CUBE_MAP)
+		numfaces = 6;
+
+	firstImage = texObj->Image[0][firstLevel];
+	compressed = firstImage->IsCompressed ? firstImage->TexFormat->MesaFormat : 0;
+
+	return (mt->firstLevel == firstLevel &&
+	        mt->lastLevel == lastLevel &&
+	        mt->width0 == firstImage->Width &&
+	        mt->height0 == firstImage->Height &&
+	        mt->depth0 == firstImage->Depth &&
+	        mt->bpp == firstImage->TexFormat->TexelBytes &&
+	        mt->compressed == compressed);
+}
+
+
+/**
+ * Try to allocate a mipmap tree for the given texture that will fit the
+ * given image in the given position.
+ */
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
+		struct gl_texture_image *texImage, GLuint face, GLuint level)
+{
+	GLuint compressed = texImage->IsCompressed ? texImage->TexFormat->MesaFormat : 0;
+	GLuint numfaces = 1;
+	GLuint firstLevel, lastLevel;
+
+	assert(!t->mt);
+
+	calculate_first_last_level(&t->base, &firstLevel, &lastLevel, face, level);
+	if (t->base.Target == GL_TEXTURE_CUBE_MAP)
+		numfaces = 6;
+
+	if (level != firstLevel || face >= numfaces)
+		return;
+
+	t->mt = radeon_miptree_create(rmesa, t, t->base.Target,
+		firstLevel, lastLevel,
+		texImage->Width, texImage->Height, texImage->Depth,
+		texImage->TexFormat->TexelBytes, t->tile_bits, compressed);
+}
+
+/* Although we use the image_offset[] array to store relative offsets
+ * to cube faces, Mesa doesn't know anything about this and expects
+ * each cube face to be treated as a separate image.
+ *
+ * These functions present that view to mesa:
+ */
+void
+radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets)
+{
+     if (mt->target != GL_TEXTURE_3D || mt->faces == 1)
+        offsets[0] = 0;
+     else {
+	int i;
+	for (i = 0; i < 6; i++)
+		offsets[i] = mt->levels[level].faces[i].offset;
+     }
+}
+
+GLuint
+radeon_miptree_image_offset(radeon_mipmap_tree *mt,
+			    GLuint face, GLuint level)
+{
+   if (mt->target == GL_TEXTURE_CUBE_MAP_ARB)
+      return (mt->levels[level].faces[face].offset);
+   else
+      return mt->levels[level].faces[0].offset;
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
new file mode 100644
index 00000000000..7ece688493d
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_mipmap_tree.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ *
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef __RADEON_MIPMAP_TREE_H_
+#define __RADEON_MIPMAP_TREE_H_
+
+#include "radeon_common.h"
+
+typedef struct _radeon_mipmap_tree radeon_mipmap_tree;
+typedef struct _radeon_mipmap_level radeon_mipmap_level;
+typedef struct _radeon_mipmap_image radeon_mipmap_image;
+
+struct _radeon_mipmap_image {
+	GLuint offset; /** Offset of this image from the start of mipmap tree buffer, in bytes */
+};
+
+struct _radeon_mipmap_level {
+	GLuint width;
+	GLuint height;
+	GLuint depth;
+	GLuint size; /** Size of each image, in bytes */
+	GLuint rowstride; /** in bytes */
+	radeon_mipmap_image faces[6];
+};
+
+/* store the max possible in the miptree */
+#define RADEON_MIPTREE_MAX_TEXTURE_LEVELS 13
+
+/**
+ * A mipmap tree contains texture images in the layout that the hardware
+ * expects.
+ *
+ * The meta-data of mipmap trees is immutable, i.e. you cannot change the
+ * layout on-the-fly; however, the texture contents (i.e. texels) can be
+ * changed.
+ */
+struct _radeon_mipmap_tree {
+	radeonContextPtr radeon;
+	radeonTexObj *t;
+	struct radeon_bo *bo;
+	GLuint refcount;
+
+	GLuint totalsize; /** total size of the miptree, in bytes */
+
+	GLenum target; /** GL_TEXTURE_xxx */
+	GLuint faces; /** # of faces: 6 for cubemaps, 1 otherwise */
+	GLuint firstLevel; /** First mip level stored in this mipmap tree */
+	GLuint lastLevel; /** Last mip level stored in this mipmap tree */
+
+	GLuint width0; /** Width of firstLevel image */
+	GLuint height0; /** Height of firstLevel image */
+	GLuint depth0; /** Depth of firstLevel image */
+
+	GLuint bpp; /** Bytes per texel */
+	GLuint tilebits; /** RADEON_TXO_xxx_TILE */
+	GLuint compressed; /** MESA_FORMAT_xxx indicating a compressed format, or 0 if uncompressed */
+
+	radeon_mipmap_level levels[RADEON_MIPTREE_MAX_TEXTURE_LEVELS];
+};
+
+radeon_mipmap_tree* radeon_miptree_create(radeonContextPtr rmesa, radeonTexObj *t,
+		GLenum target, GLuint firstLevel, GLuint lastLevel,
+		GLuint width0, GLuint height0, GLuint depth0,
+		GLuint bpp, GLuint tilebits, GLuint compressed);
+void radeon_miptree_reference(radeon_mipmap_tree *mt);
+void radeon_miptree_unreference(radeon_mipmap_tree *mt);
+
+GLboolean radeon_miptree_matches_image(radeon_mipmap_tree *mt,
+		struct gl_texture_image *texImage, GLuint face, GLuint level);
+GLboolean radeon_miptree_matches_texture(radeon_mipmap_tree *mt, struct gl_texture_object *texObj);
+void radeon_try_alloc_miptree(radeonContextPtr rmesa, radeonTexObj *t,
+			      struct gl_texture_image *texImage, GLuint face, GLuint level);
+GLuint radeon_miptree_image_offset(radeon_mipmap_tree *mt,
+				   GLuint face, GLuint level);
+void radeon_miptree_depth_offsets(radeon_mipmap_tree *mt, GLuint level, GLuint *offsets);
+#endif /* __RADEON_MIPMAP_TREE_H_ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_sanity.c b/src/mesa/drivers/dri/radeon/radeon_sanity.c
index 6613757fcea..bbed838b592 100644
--- a/src/mesa/drivers/dri/radeon/radeon_sanity.c
+++ b/src/mesa/drivers/dri/radeon/radeon_sanity.c
@@ -973,7 +973,7 @@ static int radeon_emit_packet3_cliprect( drm_radeon_cmd_buffer_t *cmdbuf )
 }
 
 
-int radeonSanityCmdBuffer( radeonContextPtr rmesa,
+int radeonSanityCmdBuffer( r100ContextPtr rmesa,
 			   int nbox,
 			   drm_clip_rect_t *boxes )
 {
diff --git a/src/mesa/drivers/dri/radeon/radeon_sanity.h b/src/mesa/drivers/dri/radeon/radeon_sanity.h
index 1ec06bc586b..f30eb1c4f15 100644
--- a/src/mesa/drivers/dri/radeon/radeon_sanity.h
+++ b/src/mesa/drivers/dri/radeon/radeon_sanity.h
@@ -1,7 +1,7 @@
 #ifndef RADEON_SANITY_H
 #define RADEON_SANITY_H
 
-extern int radeonSanityCmdBuffer( radeonContextPtr rmesa,
+extern int radeonSanityCmdBuffer( r100ContextPtr rmesa,
 				  int nbox,
 				  drm_clip_rect_t *boxes );
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.c b/src/mesa/drivers/dri/radeon/radeon_screen.c
index a977bedbc28..290ef2394de 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.c
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.c
@@ -35,6 +35,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * \author  Gareth Hughes <[email protected]>
  */
 
+#include <errno.h>
 #include "main/glheader.h"
 #include "main/imports.h"
 #include "main/mtypes.h"
@@ -45,32 +46,43 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_chipset.h"
 #include "radeon_macros.h"
 #include "radeon_screen.h"
+#include "radeon_common.h"
+#include "radeon_span.h"
 #if !RADEON_COMMON
 #include "radeon_context.h"
-#include "radeon_span.h"
 #include "radeon_tex.h"
 #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
 #include "r200_context.h"
 #include "r200_ioctl.h"
-#include "r200_span.h"
 #include "r200_tex.h"
 #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
 #include "r300_context.h"
 #include "r300_fragprog.h"
 #include "r300_tex.h"
-#include "radeon_span.h"
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+#include "r600_context.h"
+#include "r700_driconf.h" /* +r6/r7 */
+#include "r600_tex.h"     /* +r6/r7 */
 #endif
 
 #include "utils.h"
 #include "vblank.h"
 #include "drirenderbuffer.h"
 
+#include "radeon_bocs_wrapper.h"
+
 #include "GL/internal/dri_interface.h"
 
 /* Radeon configuration
  */
 #include "xmlpool.h"
 
+#define DRI_CONF_COMMAND_BUFFER_SIZE(def,min,max) \
+DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
+        DRI_CONF_DESC(en,"Size of command buffer (in KB)") \
+        DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
+DRI_CONF_OPT_END
+
 #if !RADEON_COMMON	/* R100 */
 PUBLIC const char __driConfigOptions[] =
 DRI_CONF_BEGIN
@@ -80,6 +92,7 @@ DRI_CONF_BEGIN
         DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
         DRI_CONF_MAX_TEXTURE_UNITS(3,2,3)
         DRI_CONF_HYPERZ(false)
+        DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_QUALITY
         DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
@@ -95,7 +108,7 @@ DRI_CONF_BEGIN
         DRI_CONF_NO_RAST(false)
     DRI_CONF_SECTION_END
 DRI_CONF_END;
-static const GLuint __driNConfigOptions = 14;
+static const GLuint __driNConfigOptions = 15;
 
 #elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
 
@@ -107,6 +120,7 @@ DRI_CONF_BEGIN
         DRI_CONF_VBLANK_MODE(DRI_CONF_VBLANK_DEF_INTERVAL_0)
         DRI_CONF_MAX_TEXTURE_UNITS(6,2,6)
         DRI_CONF_HYPERZ(false)
+        DRI_CONF_COMMAND_BUFFER_SIZE(8, 8, 32)
     DRI_CONF_SECTION_END
     DRI_CONF_SECTION_QUALITY
         DRI_CONF_TEXTURE_DEPTH(DRI_CONF_TEXTURE_DEPTH_FB)
@@ -126,7 +140,7 @@ DRI_CONF_BEGIN
         DRI_CONF_NV_VERTEX_PROGRAM(false)
     DRI_CONF_SECTION_END
 DRI_CONF_END;
-static const GLuint __driNConfigOptions = 16;
+static const GLuint __driNConfigOptions = 17;
 
 extern const struct dri_extension blend_extensions[];
 extern const struct dri_extension ARB_vp_extension[];
@@ -134,7 +148,7 @@ extern const struct dri_extension NV_vp_extension[];
 extern const struct dri_extension ATI_fs_extension[];
 extern const struct dri_extension point_extensions[];
 
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+#elif RADEON_COMMON && (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
 
 /* TODO: integrate these into xmlpool.h! */
 #define DRI_CONF_MAX_TEXTURE_IMAGE_UNITS(def,min,max) \
@@ -149,11 +163,7 @@ DRI_CONF_OPT_BEGIN_V(texture_coord_units,int,def, # min ":" # max ) \
         DRI_CONF_DESC(de,"Anzahl der Texturkoordinateneinheiten") \
 DRI_CONF_OPT_END
 
-#define DRI_CONF_COMMAND_BUFFER_SIZE(def,min,max) \
-DRI_CONF_OPT_BEGIN_V(command_buffer_size,int,def, # min ":" # max ) \
-        DRI_CONF_DESC(en,"Size of command buffer (in KB)") \
-        DRI_CONF_DESC(de,"Grösse des Befehlspuffers (in KB)") \
-DRI_CONF_OPT_END
+
 
 #define DRI_CONF_DISABLE_S3TC(def) \
 DRI_CONF_OPT_BEGIN(disable_s3tc,bool,def) \
@@ -209,7 +219,6 @@ static const GLuint __driNConfigOptions = 17;
 extern const struct dri_extension gl_20_extension[];
 
 #ifndef RADEON_DEBUG
-int RADEON_DEBUG = 0;
 
 static const struct dri_debug_control debug_control[] = {
 	{"fall", DEBUG_FALLBACKS},
@@ -236,19 +245,36 @@ static const struct dri_debug_control debug_control[] = {
 #endif /* RADEON_COMMON && defined(RADEON_COMMON_FOR_R300) */
 
 extern const struct dri_extension card_extensions[];
+extern const struct dri_extension mm_extensions[];
 
 static int getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo );
 
 static int
-radeonGetParam(int fd, int param, void *value)
+radeonGetParam(__DRIscreenPrivate *sPriv, int param, void *value)
 {
   int ret;
-  drm_radeon_getparam_t gp;
-
-  gp.param = param;
-  gp.value = value;
+  drm_radeon_getparam_t gp = { 0 };
+  struct drm_radeon_info info = { 0 };
+
+  if (sPriv->drm_version.major >= 2) {
+      info.value = (uint64_t)value;
+      switch (param) {
+      case RADEON_PARAM_DEVICE_ID:
+          info.request = RADEON_INFO_DEVICE_ID;
+          break;
+      case RADEON_PARAM_NUM_GB_PIPES:
+          info.request = RADEON_INFO_NUM_GB_PIPES;
+          break;
+      default:
+          return -EINVAL;
+      }
+      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_INFO, &info, sizeof(info));
+  } else {
+      gp.param = param;
+      gp.value = value;
 
-  ret = drmCommandWriteRead( fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp));
+      ret = drmCommandWriteRead(sPriv->fd, DRM_RADEON_GETPARAM, &gp, sizeof(gp));
+  }
   return ret;
 }
 
@@ -335,6 +361,12 @@ static const __DRItexOffsetExtension radeonTexOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
     radeonSetTexOffset,
 };
+
+static const __DRItexBufferExtension radeonTexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   radeonSetTexBuffer,
+   radeonSetTexBuffer2,
+};
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
@@ -349,6 +381,12 @@ static const __DRItexOffsetExtension r200texOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
    r200SetTexOffset,
 };
+
+static const __DRItexBufferExtension r200TexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   r200SetTexBuffer,
+   r200SetTexBuffer2,
+};
 #endif
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
@@ -356,137 +394,32 @@ static const __DRItexOffsetExtension r300texOffsetExtension = {
     { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
    r300SetTexOffset,
 };
-#endif
 
-/* Create the device specific screen private data struct.
- */
-static radeonScreenPtr
-radeonCreateScreen( __DRIscreenPrivate *sPriv )
-{
-   radeonScreenPtr screen;
-   RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
-   unsigned char *RADEONMMIO;
-   int i;
-   int ret;
-   uint32_t temp = 0;
-
-   if (sPriv->devPrivSize != sizeof(RADEONDRIRec)) {
-      fprintf(stderr,"\nERROR!  sizeof(RADEONDRIRec) does not match passed size from device driver\n");
-      return GL_FALSE;
-   }
-
-   /* Allocate the private area */
-   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
-   if ( !screen ) {
-      __driUtilMessage("%s: Could not allocate memory for screen structure",
-		       __FUNCTION__);
-      return NULL;
-   }
-
-#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
-	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
+static const __DRItexBufferExtension r300TexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   r300SetTexBuffer,
+   r300SetTexBuffer2,
+};
 #endif
 
-   /* parse information in __driConfigOptions */
-   driParseOptionInfo (&screen->optionCache,
-		       __driConfigOptions, __driNConfigOptions);
-
-   /* This is first since which regions we map depends on whether or
-    * not we are using a PCI card.
-    */
-   screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
-   {
-      int ret;
-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BUFFER_OFFSET,
-			    &screen->gart_buffer_offset);
-
-      if (ret) {
-	 FREE( screen );
-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret);
-	 return NULL;
-      }
-
-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_GART_BASE,
-			    &screen->gart_base);
-      if (ret) {
-	 FREE( screen );
-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BASE): %d\n", ret);
-	 return NULL;
-      }
-
-      ret = radeonGetParam( sPriv->fd, RADEON_PARAM_IRQ_NR,
-			    &screen->irq);
-      if (ret) {
-	 FREE( screen );
-	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_IRQ_NR): %d\n", ret);
-	 return NULL;
-      }
-      screen->drmSupportsCubeMapsR200 = (sPriv->drm_version.minor >= 7);
-      screen->drmSupportsBlendColor = (sPriv->drm_version.minor >= 11);
-      screen->drmSupportsTriPerf = (sPriv->drm_version.minor >= 16);
-      screen->drmSupportsFragShader = (sPriv->drm_version.minor >= 18);
-      screen->drmSupportsPointSprites = (sPriv->drm_version.minor >= 13);
-      screen->drmSupportsCubeMapsR100 = (sPriv->drm_version.minor >= 15);
-      screen->drmSupportsVertexProgram = (sPriv->drm_version.minor >= 25);
-   }
-
-   screen->mmio.handle = dri_priv->registerHandle;
-   screen->mmio.size   = dri_priv->registerSize;
-   if ( drmMap( sPriv->fd,
-		screen->mmio.handle,
-		screen->mmio.size,
-		&screen->mmio.map ) ) {
-      FREE( screen );
-      __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
-      return NULL;
-   }
-
-   RADEONMMIO = screen->mmio.map;
-
-   screen->status.handle = dri_priv->statusHandle;
-   screen->status.size   = dri_priv->statusSize;
-   if ( drmMap( sPriv->fd,
-		screen->status.handle,
-		screen->status.size,
-		&screen->status.map ) ) {
-      drmUnmap( screen->mmio.map, screen->mmio.size );
-      FREE( screen );
-      __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
-      return NULL;
-   }
-   screen->scratch = (__volatile__ uint32_t *)
-      ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
-
-   screen->buffers = drmMapBufs( sPriv->fd );
-   if ( !screen->buffers ) {
-      drmUnmap( screen->status.map, screen->status.size );
-      drmUnmap( screen->mmio.map, screen->mmio.size );
-      FREE( screen );
-      __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
-      return NULL;
-   }
-
-   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
-      screen->gartTextures.handle = dri_priv->gartTexHandle;
-      screen->gartTextures.size   = dri_priv->gartTexMapSize;
-      if ( drmMap( sPriv->fd,
-		   screen->gartTextures.handle,
-		   screen->gartTextures.size,
-		   (drmAddressPtr)&screen->gartTextures.map ) ) {
-	 drmUnmapBufs( screen->buffers );
-	 drmUnmap( screen->status.map, screen->status.size );
-	 drmUnmap( screen->mmio.map, screen->mmio.size );
-	 FREE( screen );
-	 __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
-	 return NULL;
-      }
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+static const __DRItexOffsetExtension r600texOffsetExtension = {
+    { __DRI_TEX_OFFSET, __DRI_TEX_OFFSET_VERSION },
+   r600SetTexOffset, /* +r6/r7 */
+};
 
-      screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
-   }
+static const __DRItexBufferExtension r600TexBufferExtension = {
+    { __DRI_TEX_BUFFER, __DRI_TEX_BUFFER_VERSION },
+   r600SetTexBuffer,  /* +r6/r7 */
+   r600SetTexBuffer2, /* +r6/r7 */
+};
+#endif
 
+static int radeon_set_screen_flags(radeonScreenPtr screen, int device_id)
+{
+   screen->device_id = device_id;
    screen->chip_flags = 0;
-   /* XXX: add more chipsets */
-   switch ( dri_priv->deviceID ) {
+   switch ( device_id ) {
    case PCI_CHIP_RADEON_LY:
    case PCI_CHIP_RADEON_LZ:
    case PCI_CHIP_RADEON_QY:
@@ -561,7 +494,6 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       screen->chip_family = CHIP_FAMILY_RS300;
       break;
 
-
    case PCI_CHIP_R300_AD:
    case PCI_CHIP_R300_AE:
    case PCI_CHIP_R300_AF:
@@ -819,11 +751,309 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
       screen->chip_flags = RADEON_CHIPSET_TCL;
       break;
 
+   case PCI_CHIP_R600_9400:
+   case PCI_CHIP_R600_9401:
+   case PCI_CHIP_R600_9402:
+   case PCI_CHIP_R600_9403:
+   case PCI_CHIP_R600_9405:
+   case PCI_CHIP_R600_940A:
+   case PCI_CHIP_R600_940B:
+   case PCI_CHIP_R600_940F:
+      screen->chip_family = CHIP_FAMILY_R600;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV610_94C0:
+   case PCI_CHIP_RV610_94C1:
+   case PCI_CHIP_RV610_94C3:
+   case PCI_CHIP_RV610_94C4:
+   case PCI_CHIP_RV610_94C5:
+   case PCI_CHIP_RV610_94C6:
+   case PCI_CHIP_RV610_94C7:
+   case PCI_CHIP_RV610_94C8:
+   case PCI_CHIP_RV610_94C9:
+   case PCI_CHIP_RV610_94CB:
+   case PCI_CHIP_RV610_94CC:
+   case PCI_CHIP_RV610_94CD:
+      screen->chip_family = CHIP_FAMILY_RV610;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV630_9580:
+   case PCI_CHIP_RV630_9581:
+   case PCI_CHIP_RV630_9583:
+   case PCI_CHIP_RV630_9586:
+   case PCI_CHIP_RV630_9587:
+   case PCI_CHIP_RV630_9588:
+   case PCI_CHIP_RV630_9589:
+   case PCI_CHIP_RV630_958A:
+   case PCI_CHIP_RV630_958B:
+   case PCI_CHIP_RV630_958C:
+   case PCI_CHIP_RV630_958D:
+   case PCI_CHIP_RV630_958E:
+   case PCI_CHIP_RV630_958F:
+      screen->chip_family = CHIP_FAMILY_RV630;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV670_9500:
+   case PCI_CHIP_RV670_9501:
+   case PCI_CHIP_RV670_9504:
+   case PCI_CHIP_RV670_9505:
+   case PCI_CHIP_RV670_9506:
+   case PCI_CHIP_RV670_9507:
+   case PCI_CHIP_RV670_9508:
+   case PCI_CHIP_RV670_9509:
+   case PCI_CHIP_RV670_950F:
+   case PCI_CHIP_RV670_9511:
+   case PCI_CHIP_RV670_9515:
+   case PCI_CHIP_RV670_9517:
+   case PCI_CHIP_RV670_9519:
+      screen->chip_family = CHIP_FAMILY_RV670;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV620_95C0:
+   case PCI_CHIP_RV620_95C2:
+   case PCI_CHIP_RV620_95C4:
+   case PCI_CHIP_RV620_95C5:
+   case PCI_CHIP_RV620_95C6:
+   case PCI_CHIP_RV620_95C7:
+   case PCI_CHIP_RV620_95C9:
+   case PCI_CHIP_RV620_95CC:
+   case PCI_CHIP_RV620_95CD:
+   case PCI_CHIP_RV620_95CE:
+   case PCI_CHIP_RV620_95CF:
+      screen->chip_family = CHIP_FAMILY_RV620;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV635_9590:
+   case PCI_CHIP_RV635_9591:
+   case PCI_CHIP_RV635_9593:
+   case PCI_CHIP_RV635_9595:
+   case PCI_CHIP_RV635_9596:
+   case PCI_CHIP_RV635_9597:
+   case PCI_CHIP_RV635_9598:
+   case PCI_CHIP_RV635_9599:
+   case PCI_CHIP_RV635_959B:
+      screen->chip_family = CHIP_FAMILY_RV635;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RS780_9610:
+   case PCI_CHIP_RS780_9611:
+   case PCI_CHIP_RS780_9612:
+   case PCI_CHIP_RS780_9613:
+   case PCI_CHIP_RS780_9614:
+   case PCI_CHIP_RS780_9615:
+   case PCI_CHIP_RS780_9616:
+      screen->chip_family = CHIP_FAMILY_RS780;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV770_9440:
+   case PCI_CHIP_RV770_9441:
+   case PCI_CHIP_RV770_9442:
+   case PCI_CHIP_RV770_9444:
+   case PCI_CHIP_RV770_9446:
+   case PCI_CHIP_RV770_944A:
+   case PCI_CHIP_RV770_944B:
+   case PCI_CHIP_RV770_944C:
+   case PCI_CHIP_RV770_944E:
+   case PCI_CHIP_RV770_9450:
+   case PCI_CHIP_RV770_9452:
+   case PCI_CHIP_RV770_9456:
+   case PCI_CHIP_RV770_945A:
+   case PCI_CHIP_RV770_945B:
+   case PCI_CHIP_RV790_9460:
+   case PCI_CHIP_RV790_9462:
+   case PCI_CHIP_RV770_946A:
+   case PCI_CHIP_RV770_946B:
+   case PCI_CHIP_RV770_947A:
+   case PCI_CHIP_RV770_947B:
+      screen->chip_family = CHIP_FAMILY_RV770;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV730_9487:
+   case PCI_CHIP_RV730_9489:
+   case PCI_CHIP_RV730_948F:
+   case PCI_CHIP_RV730_9490:
+   case PCI_CHIP_RV730_9491:
+   case PCI_CHIP_RV730_9498:
+   case PCI_CHIP_RV730_949C:
+   case PCI_CHIP_RV730_949E:
+   case PCI_CHIP_RV730_949F:
+      screen->chip_family = CHIP_FAMILY_RV730;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV710_9540:
+   case PCI_CHIP_RV710_9541:
+   case PCI_CHIP_RV710_9542:
+   case PCI_CHIP_RV710_954E:
+   case PCI_CHIP_RV710_954F:
+   case PCI_CHIP_RV710_9552:
+   case PCI_CHIP_RV710_9553:
+   case PCI_CHIP_RV710_9555:
+      screen->chip_family = CHIP_FAMILY_RV710;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
+   case PCI_CHIP_RV740_94A0:
+   case PCI_CHIP_RV740_94A1:
+   case PCI_CHIP_RV740_94B1:
+   case PCI_CHIP_RV740_94B3:
+   case PCI_CHIP_RV740_94B5:
+      screen->chip_family = CHIP_FAMILY_RV740;
+      screen->chip_flags = RADEON_CHIPSET_TCL;
+      break;
+
    default:
       fprintf(stderr, "unknown chip id 0x%x, can't guess.\n",
-	      dri_priv->deviceID);
+	      device_id);
+      return -1;
+   }
+
+   return 0;
+}
+
+
+/* Create the device specific screen private data struct.
+ */
+static radeonScreenPtr
+radeonCreateScreen( __DRIscreenPrivate *sPriv )
+{
+   radeonScreenPtr screen;
+   RADEONDRIPtr dri_priv = (RADEONDRIPtr)sPriv->pDevPriv;
+   unsigned char *RADEONMMIO = NULL;
+   int i;
+   int ret;
+   uint32_t temp = 0;
+
+   if (sPriv->devPrivSize != sizeof(RADEONDRIRec)) {
+      fprintf(stderr,"\nERROR!  sizeof(RADEONDRIRec) does not match passed size from device driver\n");
+      return GL_FALSE;
+   }
+
+   /* Allocate the private area */
+   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
+   if ( !screen ) {
+      __driUtilMessage("%s: Could not allocate memory for screen structure",
+		       __FUNCTION__);
       return NULL;
    }
+
+#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
+#endif
+   /* parse information in __driConfigOptions */
+   driParseOptionInfo (&screen->optionCache,
+		       __driConfigOptions, __driNConfigOptions);
+
+   /* This is first since which regions we map depends on whether or
+    * not we are using a PCI card.
+    */
+   screen->card_type = (dri_priv->IsPCI ? RADEON_CARD_PCI : RADEON_CARD_AGP);
+   {
+      int ret;
+
+      ret = radeonGetParam(sPriv, RADEON_PARAM_GART_BUFFER_OFFSET,
+			    &screen->gart_buffer_offset);
+
+      if (ret) {
+	 FREE( screen );
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BUFFER_OFFSET): %d\n", ret);
+	 return NULL;
+      }
+
+      ret = radeonGetParam(sPriv, RADEON_PARAM_GART_BASE,
+			    &screen->gart_base);
+      if (ret) {
+	 FREE( screen );
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_GART_BASE): %d\n", ret);
+	 return NULL;
+      }
+
+      ret = radeonGetParam(sPriv, RADEON_PARAM_IRQ_NR,
+			    &screen->irq);
+      if (ret) {
+	 FREE( screen );
+	 fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_IRQ_NR): %d\n", ret);
+	 return NULL;
+      }
+      screen->drmSupportsCubeMapsR200 = (sPriv->drm_version.minor >= 7);
+      screen->drmSupportsBlendColor = (sPriv->drm_version.minor >= 11);
+      screen->drmSupportsTriPerf = (sPriv->drm_version.minor >= 16);
+      screen->drmSupportsFragShader = (sPriv->drm_version.minor >= 18);
+      screen->drmSupportsPointSprites = (sPriv->drm_version.minor >= 13);
+      screen->drmSupportsCubeMapsR100 = (sPriv->drm_version.minor >= 15);
+      screen->drmSupportsVertexProgram = (sPriv->drm_version.minor >= 25);
+   }
+
+   ret = radeon_set_screen_flags(screen, dri_priv->deviceID);
+   if (ret == -1)
+     return NULL;
+
+   screen->mmio.handle = dri_priv->registerHandle;
+   screen->mmio.size   = dri_priv->registerSize;
+   if ( drmMap( sPriv->fd,
+		screen->mmio.handle,
+		screen->mmio.size,
+		&screen->mmio.map ) ) {
+     FREE( screen );
+     __driUtilMessage("%s: drmMap failed\n", __FUNCTION__ );
+     return NULL;
+   }
+
+   RADEONMMIO = screen->mmio.map;
+
+   screen->status.handle = dri_priv->statusHandle;
+   screen->status.size   = dri_priv->statusSize;
+   if ( drmMap( sPriv->fd,
+		screen->status.handle,
+		screen->status.size,
+		&screen->status.map ) ) {
+     drmUnmap( screen->mmio.map, screen->mmio.size );
+     FREE( screen );
+     __driUtilMessage("%s: drmMap (2) failed\n", __FUNCTION__ );
+     return NULL;
+   }
+   if (screen->chip_family < CHIP_FAMILY_R600)
+	   screen->scratch = (__volatile__ uint32_t *)
+		   ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
+   else
+	   screen->scratch = (__volatile__ uint32_t *)
+		   ((GLubyte *)screen->status.map + RADEON_SCRATCH_REG_OFFSET);
+
+   screen->buffers = drmMapBufs( sPriv->fd );
+   if ( !screen->buffers ) {
+     drmUnmap( screen->status.map, screen->status.size );
+     drmUnmap( screen->mmio.map, screen->mmio.size );
+     FREE( screen );
+     __driUtilMessage("%s: drmMapBufs failed\n", __FUNCTION__ );
+     return NULL;
+   }
+
+   if ( dri_priv->gartTexHandle && dri_priv->gartTexMapSize ) {
+     screen->gartTextures.handle = dri_priv->gartTexHandle;
+     screen->gartTextures.size   = dri_priv->gartTexMapSize;
+     if ( drmMap( sPriv->fd,
+		  screen->gartTextures.handle,
+		  screen->gartTextures.size,
+		  (drmAddressPtr)&screen->gartTextures.map ) ) {
+       drmUnmapBufs( screen->buffers );
+       drmUnmap( screen->status.map, screen->status.size );
+       drmUnmap( screen->mmio.map, screen->mmio.size );
+       FREE( screen );
+       __driUtilMessage("%s: drmMap failed for GART texture area\n", __FUNCTION__);
+       return NULL;
+    }
+
+     screen->gart_texture_offset = dri_priv->gartTexOffset + screen->gart_base;
+   }
+
    if ((screen->chip_family == CHIP_FAMILY_R350 || screen->chip_family == CHIP_FAMILY_R300) &&
        sPriv->ddx_version.minor < 2) {
       fprintf(stderr, "xf86-video-ati-6.6.2 or newer needed for Radeon 9500/9700/9800 cards.\n");
@@ -836,35 +1066,57 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    }
 
    if (getenv("R300_NO_TCL"))
-     screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+	   screen->chip_flags &= ~RADEON_CHIPSET_TCL;
 
    if (screen->chip_family <= CHIP_FAMILY_RS200)
-      screen->chip_flags |= RADEON_CLASS_R100;
+	   screen->chip_flags |= RADEON_CLASS_R100;
    else if (screen->chip_family <= CHIP_FAMILY_RV280)
-      screen->chip_flags |= RADEON_CLASS_R200;
+	   screen->chip_flags |= RADEON_CLASS_R200;
+   else if (screen->chip_family <= CHIP_FAMILY_RV570)
+	   screen->chip_flags |= RADEON_CLASS_R300;
    else
-      screen->chip_flags |= RADEON_CLASS_R300;
+	   screen->chip_flags |= RADEON_CLASS_R600;
 
    screen->cpp = dri_priv->bpp / 8;
    screen->AGPMode = dri_priv->AGPMode;
 
-   ret = radeonGetParam( sPriv->fd, RADEON_PARAM_FB_LOCATION,
-                         &temp);
-   if (ret) {
-       if (screen->chip_family < CHIP_FAMILY_RS600)
-	   screen->fbLocation      = ( INREG( RADEON_MC_FB_LOCATION ) & 0xffff) << 16;
-       else {
-           FREE( screen );
-           fprintf(stderr, "Unable to get fb location need newer drm\n");
-           return NULL;
+   ret = radeonGetParam(sPriv, RADEON_PARAM_FB_LOCATION, &temp);
+
+   /* +r6/r7 */
+   if(screen->chip_family >= CHIP_FAMILY_R600)
+   {
+       if (ret) 
+       {
+            FREE( screen );
+            fprintf(stderr, "Unable to get fb location need newer drm\n");
+            return NULL;
        }
-   } else {
-       screen->fbLocation = (temp & 0xffff) << 16;
+       else
+       {
+            screen->fbLocation = (temp & 0xffff) << 24;
+       }
+   }
+   else
+   {
+        if (ret) 
+        {
+            if (screen->chip_family < CHIP_FAMILY_RS600 && !screen->kernel_mm)
+	            screen->fbLocation      = ( INREG( RADEON_MC_FB_LOCATION ) & 0xffff) << 16;
+            else 
+            {
+                FREE( screen );
+                fprintf(stderr, "Unable to get fb location need newer drm\n");
+                return NULL;
+            }
+        } 
+        else 
+        {
+            screen->fbLocation = (temp & 0xffff) << 16;
+        }
    }
 
-   if (screen->chip_family >= CHIP_FAMILY_R300) {
-       ret = radeonGetParam( sPriv->fd, RADEON_PARAM_NUM_GB_PIPES,
-			     &temp);
+   if (IS_R300_CLASS(screen)) {
+       ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp);
        if (ret) {
 	   fprintf(stderr, "Unable to get num_pipes, need newer drm\n");
 	   switch (screen->chip_family) {
@@ -901,7 +1153,6 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
        default:
 	   break;
        }
-
    }
 
    if ( sPriv->drm_version.minor >= 10 ) {
@@ -971,7 +1222,7 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
 
 #if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
    if (IS_R200_CLASS(screen))
-       screen->extensions[i++] = &r200AllocateExtension.base;
+      screen->extensions[i++] = &r200AllocateExtension.base;
 
    screen->extensions[i++] = &r200texOffsetExtension.base;
 #endif
@@ -980,11 +1231,168 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
    screen->extensions[i++] = &r300texOffsetExtension.base;
 #endif
 
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   screen->extensions[i++] = &r600texOffsetExtension.base;
+#endif
+
    screen->extensions[i++] = NULL;
    sPriv->extensions = screen->extensions;
 
    screen->driScreen = sPriv;
    screen->sarea_priv_offset = dri_priv->sarea_priv_offset;
+   screen->sarea = (drm_radeon_sarea_t *) ((GLubyte *) sPriv->pSAREA +
+					       screen->sarea_priv_offset);
+
+   screen->bom = radeon_bo_manager_legacy_ctor(screen);
+   if (screen->bom == NULL) {
+     free(screen);
+     return NULL;
+   }
+
+   return screen;
+}
+
+static radeonScreenPtr
+radeonCreateScreen2(__DRIscreenPrivate *sPriv)
+{
+   radeonScreenPtr screen;
+   int i;
+   int ret;
+   uint32_t device_id = 0;
+   uint32_t temp = 0;
+
+   /* Allocate the private area */
+   screen = (radeonScreenPtr) CALLOC( sizeof(*screen) );
+   if ( !screen ) {
+      __driUtilMessage("%s: Could not allocate memory for screen structure",
+		       __FUNCTION__);
+      fprintf(stderr, "leaving here\n");
+      return NULL;
+   }
+
+#if DO_DEBUG && RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+	RADEON_DEBUG = driParseDebugString(getenv("RADEON_DEBUG"), debug_control);
+#endif
+
+   /* parse information in __driConfigOptions */
+   driParseOptionInfo (&screen->optionCache,
+		       __driConfigOptions, __driNConfigOptions);
+
+   screen->kernel_mm = 1;
+   screen->chip_flags = 0;
+
+   /* if we have kms we can support all of these */
+   screen->drmSupportsCubeMapsR200 = 1;
+   screen->drmSupportsBlendColor = 1;
+   screen->drmSupportsTriPerf = 1;
+   screen->drmSupportsFragShader = 1;
+   screen->drmSupportsPointSprites = 1;
+   screen->drmSupportsCubeMapsR100 = 1;
+   screen->drmSupportsVertexProgram = 1;
+   screen->irq = 1;
+
+   ret = radeonGetParam(sPriv, RADEON_PARAM_DEVICE_ID, &device_id);
+   if (ret) {
+     FREE( screen );
+     fprintf(stderr, "drm_radeon_getparam_t (RADEON_PARAM_DEVICE_ID): %d\n", ret);
+     return NULL;
+   }
+
+   ret = radeon_set_screen_flags(screen, device_id);
+   if (ret == -1)
+     return NULL;
+
+   if (getenv("R300_NO_TCL"))
+	   screen->chip_flags &= ~RADEON_CHIPSET_TCL;
+
+   if (screen->chip_family <= CHIP_FAMILY_RS200)
+	   screen->chip_flags |= RADEON_CLASS_R100;
+   else if (screen->chip_family <= CHIP_FAMILY_RV280)
+	   screen->chip_flags |= RADEON_CLASS_R200;
+   else if (screen->chip_family <= CHIP_FAMILY_RV570)
+	   screen->chip_flags |= RADEON_CLASS_R300;
+   else
+	   screen->chip_flags |= RADEON_CLASS_R600;
+
+   if (IS_R300_CLASS(screen)) {
+       ret = radeonGetParam(sPriv, RADEON_PARAM_NUM_GB_PIPES, &temp);
+       if (ret) {
+	   fprintf(stderr, "Unable to get num_pipes, need newer drm\n");
+	   switch (screen->chip_family) {
+	   case CHIP_FAMILY_R300:
+	   case CHIP_FAMILY_R350:
+	       screen->num_gb_pipes = 2;
+	       break;
+	   case CHIP_FAMILY_R420:
+	   case CHIP_FAMILY_R520:
+	   case CHIP_FAMILY_R580:
+	   case CHIP_FAMILY_RV560:
+	   case CHIP_FAMILY_RV570:
+	       screen->num_gb_pipes = 4;
+	       break;
+	   case CHIP_FAMILY_RV350:
+	   case CHIP_FAMILY_RV515:
+	   case CHIP_FAMILY_RV530:
+	   case CHIP_FAMILY_RV410:
+	   default:
+	       screen->num_gb_pipes = 1;
+	       break;
+	   }
+       } else {
+	   screen->num_gb_pipes = temp;
+       }
+
+       /* pipe overrides */
+       switch (device_id) {
+       case PCI_CHIP_R300_AD: /* 9500 with 1 quadpipe verified by: Reid Linnemann <[email protected]> */
+       case PCI_CHIP_RV410_5E4C: /* RV410 SE only have 1 quadpipe */
+       case PCI_CHIP_RV410_5E4F: /* RV410 SE only have 1 quadpipe */
+	   screen->num_gb_pipes = 1;
+	   break;
+       default:
+	   break;
+       }
+
+   }
+
+   i = 0;
+   screen->extensions[i++] = &driCopySubBufferExtension.base;
+   screen->extensions[i++] = &driFrameTrackingExtension.base;
+   screen->extensions[i++] = &driReadDrawableExtension;
+
+   if ( screen->irq != 0 ) {
+       screen->extensions[i++] = &driSwapControlExtension.base;
+       screen->extensions[i++] = &driMediaStreamCounterExtension.base;
+   }
+
+#if !RADEON_COMMON
+   screen->extensions[i++] = &radeonTexBufferExtension.base;
+#endif
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   if (IS_R200_CLASS(screen))
+       screen->extensions[i++] = &r200AllocateExtension.base;
+
+   screen->extensions[i++] = &r200TexBufferExtension.base;
+#endif
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
+   screen->extensions[i++] = &r300TexBufferExtension.base;
+#endif
+
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   screen->extensions[i++] = &r600TexBufferExtension.base;
+#endif
+
+   screen->extensions[i++] = NULL;
+   sPriv->extensions = screen->extensions;
+
+   screen->driScreen = sPriv;
+   screen->bom = radeon_bo_manager_gem_ctor(sPriv->fd);
+   if (screen->bom == NULL) {
+       free(screen);
+       return NULL;
+   }
    return screen;
 }
 
@@ -993,23 +1401,32 @@ radeonCreateScreen( __DRIscreenPrivate *sPriv )
 static void
 radeonDestroyScreen( __DRIscreenPrivate *sPriv )
 {
-   radeonScreenPtr screen = (radeonScreenPtr)sPriv->private;
+    radeonScreenPtr screen = (radeonScreenPtr)sPriv->private;
 
-   if (!screen)
-      return;
+    if (!screen)
+        return;
 
-   if ( screen->gartTextures.map ) {
-      drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
-   }
-   drmUnmapBufs( screen->buffers );
-   drmUnmap( screen->status.map, screen->status.size );
-   drmUnmap( screen->mmio.map, screen->mmio.size );
+    if (screen->kernel_mm) {
+#ifdef RADEON_BO_TRACK
+        radeon_tracker_print(&screen->bom->tracker, stderr);
+#endif
+        radeon_bo_manager_gem_dtor(screen->bom);
+    } else {
+        radeon_bo_manager_legacy_dtor(screen->bom);
+
+        if ( screen->gartTextures.map ) {
+            drmUnmap( screen->gartTextures.map, screen->gartTextures.size );
+        }
+        drmUnmapBufs( screen->buffers );
+        drmUnmap( screen->status.map, screen->status.size );
+        drmUnmap( screen->mmio.map, screen->mmio.size );
+    }
 
-   /* free all option information */
-   driDestroyOptionInfo (&screen->optionCache);
+    /* free all option information */
+    driDestroyOptionInfo (&screen->optionCache);
 
-   FREE( screen );
-   sPriv->private = NULL;
+    FREE( screen );
+    sPriv->private = NULL;
 }
 
 
@@ -1018,16 +1435,21 @@ radeonDestroyScreen( __DRIscreenPrivate *sPriv )
 static GLboolean
 radeonInitDriver( __DRIscreenPrivate *sPriv )
 {
-   sPriv->private = (void *) radeonCreateScreen( sPriv );
-   if ( !sPriv->private ) {
-      radeonDestroyScreen( sPriv );
-      return GL_FALSE;
-   }
+    if (sPriv->dri2.enabled) {
+        sPriv->private = (void *) radeonCreateScreen2( sPriv );
+    } else {
+        sPriv->private = (void *) radeonCreateScreen( sPriv );
+    }
+    if ( !sPriv->private ) {
+        radeonDestroyScreen( sPriv );
+        return GL_FALSE;
+    }
 
-   return GL_TRUE;
+    return GL_TRUE;
 }
 
 
+
 /**
  * Create the Mesa framebuffer and renderbuffers for a given window/drawable.
  *
@@ -1040,101 +1462,111 @@ radeonCreateBuffer( __DRIscreenPrivate *driScrnPriv,
                     const __GLcontextModes *mesaVis,
                     GLboolean isPixmap )
 {
-   radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
+    radeonScreenPtr screen = (radeonScreenPtr) driScrnPriv->private;
 
-   if (isPixmap) {
+    const GLboolean swDepth = GL_FALSE;
+    const GLboolean swAlpha = GL_FALSE;
+    const GLboolean swAccum = mesaVis->accumRedBits > 0;
+    const GLboolean swStencil = mesaVis->stencilBits > 0 &&
+	mesaVis->depthBits != 24;
+    GLenum rgbFormat;
+    struct radeon_framebuffer *rfb;
+
+    if (isPixmap)
       return GL_FALSE; /* not implemented */
-   }
-   else {
-      const GLboolean swDepth = GL_FALSE;
-      const GLboolean swAlpha = GL_FALSE;
-      const GLboolean swAccum = mesaVis->accumRedBits > 0;
-      const GLboolean swStencil = mesaVis->stencilBits > 0 &&
-         mesaVis->depthBits != 24;
-      struct gl_framebuffer *fb = _mesa_create_framebuffer(mesaVis);
-
-      /* front color renderbuffer */
-      {
-         driRenderbuffer *frontRb
-            = driNewRenderbuffer(GL_RGBA,
-                                 driScrnPriv->pFB + screen->frontOffset,
-                                 screen->cpp,
-                                 screen->frontOffset, screen->frontPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(frontRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_FRONT_LEFT, &frontRb->Base);
-      }
 
-      /* back color renderbuffer */
-      if (mesaVis->doubleBufferMode) {
-         driRenderbuffer *backRb
-            = driNewRenderbuffer(GL_RGBA,
-                                 driScrnPriv->pFB + screen->backOffset,
-                                 screen->cpp,
-                                 screen->backOffset, screen->backPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(backRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_BACK_LEFT, &backRb->Base);
-      }
+    rfb = CALLOC_STRUCT(radeon_framebuffer);
+    if (!rfb)
+      return GL_FALSE;
 
-      /* depth renderbuffer */
-      if (mesaVis->depthBits == 16) {
-         driRenderbuffer *depthRb
-            = driNewRenderbuffer(GL_DEPTH_COMPONENT16,
-                                 driScrnPriv->pFB + screen->depthOffset,
-                                 screen->cpp,
-                                 screen->depthOffset, screen->depthPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(depthRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
-	 depthRb->depthHasSurface = screen->depthHasSurface;
-      }
-      else if (mesaVis->depthBits == 24) {
-         driRenderbuffer *depthRb
-            = driNewRenderbuffer(GL_DEPTH_COMPONENT24,
-                                 driScrnPriv->pFB + screen->depthOffset,
-                                 screen->cpp,
-                                 screen->depthOffset, screen->depthPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(depthRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_DEPTH, &depthRb->Base);
-	 depthRb->depthHasSurface = screen->depthHasSurface;
-      }
+    _mesa_initialize_framebuffer(&rfb->base, mesaVis);
+
+    if (mesaVis->redBits == 5)
+        rgbFormat = GL_RGB5;
+    else if (mesaVis->alphaBits == 0)
+        rgbFormat = GL_RGB8;
+    else
+        rgbFormat = GL_RGBA8;
+
+    /* front color renderbuffer */
+    rfb->color_rb[0] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
+    _mesa_add_renderbuffer(&rfb->base, BUFFER_FRONT_LEFT, &rfb->color_rb[0]->base);
+    rfb->color_rb[0]->has_surface = 1;
+
+    /* back color renderbuffer */
+    if (mesaVis->doubleBufferMode) {
+      rfb->color_rb[1] = radeon_create_renderbuffer(rgbFormat, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_BACK_LEFT, &rfb->color_rb[1]->base);
+	rfb->color_rb[1]->has_surface = 1;
+    }
 
-      /* stencil renderbuffer */
-      if (mesaVis->stencilBits > 0 && !swStencil) {
-         driRenderbuffer *stencilRb
-            = driNewRenderbuffer(GL_STENCIL_INDEX8_EXT,
-                                 driScrnPriv->pFB + screen->depthOffset,
-                                 screen->cpp,
-                                 screen->depthOffset, screen->depthPitch,
-                                 driDrawPriv);
-         radeonSetSpanFunctions(stencilRb, mesaVis);
-         _mesa_add_renderbuffer(fb, BUFFER_STENCIL, &stencilRb->Base);
-	 stencilRb->depthHasSurface = screen->depthHasSurface;
+    if (mesaVis->depthBits == 24) {
+      if (mesaVis->stencilBits == 8) {
+	struct radeon_renderbuffer *depthStencilRb = radeon_create_renderbuffer(GL_DEPTH24_STENCIL8_EXT, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depthStencilRb->base);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_STENCIL, &depthStencilRb->base);
+	depthStencilRb->has_surface = screen->depthHasSurface;
+      } else {
+	/* depth renderbuffer */
+	struct radeon_renderbuffer *depth = radeon_create_renderbuffer(GL_DEPTH_COMPONENT24, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depth->base);
+	depth->has_surface = screen->depthHasSurface;
       }
+    } else if (mesaVis->depthBits == 16) {
+      /* just 16-bit depth buffer, no hw stencil */
+	struct radeon_renderbuffer *depth = radeon_create_renderbuffer(GL_DEPTH_COMPONENT16, driDrawPriv);
+	_mesa_add_renderbuffer(&rfb->base, BUFFER_DEPTH, &depth->base);
+	depth->has_surface = screen->depthHasSurface;
+    }
 
-      _mesa_add_soft_renderbuffers(fb,
-                                   GL_FALSE, /* color */
-                                   swDepth,
-                                   swStencil,
-                                   swAccum,
-                                   swAlpha,
-                                   GL_FALSE /* aux */);
-      driDrawPriv->driverPrivate = (void *) fb;
+    _mesa_add_soft_renderbuffers(&rfb->base,
+	    GL_FALSE, /* color */
+	    swDepth,
+	    swStencil,
+	    swAccum,
+	    swAlpha,
+	    GL_FALSE /* aux */);
+    driDrawPriv->driverPrivate = (void *) rfb;
 
-      return (driDrawPriv->driverPrivate != NULL);
-   }
+    return (driDrawPriv->driverPrivate != NULL);
 }
 
 
-static void
+static void radeon_cleanup_renderbuffers(struct radeon_framebuffer *rfb)
+{
+	struct radeon_renderbuffer *rb;
+
+	rb = rfb->color_rb[0];
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[1];
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = radeon_get_renderbuffer(&rfb->base, BUFFER_DEPTH);
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+}
+
+void
 radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv)
 {
-   _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
+    struct radeon_framebuffer *rfb;
+    if (!driDrawPriv)
+	return;
+
+    rfb = (void*)driDrawPriv->driverPrivate;
+    if (!rfb)
+	return;
+    radeon_cleanup_renderbuffers(rfb);
+    _mesa_reference_framebuffer((GLframebuffer **)(&(driDrawPriv->driverPrivate)), NULL);
 }
 
-#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
 /**
  * Choose the appropriate CreateContext function based on the chipset.
  * Eventually, all drivers will go through this process.
@@ -1145,25 +1577,27 @@ static GLboolean radeonCreateContext(const __GLcontextModes * glVisual,
 {
 	__DRIscreenPrivate *sPriv = driContextPriv->driScreenPriv;
 	radeonScreenPtr screen = (radeonScreenPtr) (sPriv->private);
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+	if (IS_R600_CLASS(screen))
+		return r600CreateContext(glVisual, driContextPriv, sharedContextPriv);
+#endif
 
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R300)
 	if (IS_R300_CLASS(screen))
 		return r300CreateContext(glVisual, driContextPriv, sharedContextPriv);
-        return GL_FALSE;
-}
-
-/**
- * Choose the appropriate DestroyContext function based on the chipset.
- */
-static void radeonDestroyContext(__DRIcontextPrivate * driContextPriv)
-{
-	radeonContextPtr radeon = (radeonContextPtr) driContextPriv->driverPrivate;
-
-	if (IS_R300_CLASS(radeon->radeonScreen))
-		return r300DestroyContext(driContextPriv);
-}
+#endif
 
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+	if (IS_R200_CLASS(screen))
+		return r200CreateContext(glVisual, driContextPriv, sharedContextPriv);
+#endif
 
+#if !RADEON_COMMON
+	(void)screen;
+	return r100CreateContext(glVisual, driContextPriv, sharedContextPriv);
 #endif
+	return GL_FALSE;
+}
 
 
 /**
@@ -1191,6 +1625,11 @@ radeonInitScreen(__DRIscreenPrivate *psp)
    static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
    static const __DRIversion dri_expected = { 4, 0, 0 };
    static const __DRIversion drm_expected = { 1, 24, 0 };
+#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R600)
+   static const char *driver_name = "R600";
+   static const __DRIutilversion2 ddx_expected = { 4, 5, 0, 0 };
+   static const __DRIversion dri_expected = { 4, 0, 0 };
+   static const __DRIversion drm_expected = { 1, 24, 0 };
 #endif
    RADEONDRIPtr dri_priv = (RADEONDRIPtr) psp->pDevPriv;
 
@@ -1218,20 +1657,110 @@ radeonInitScreen(__DRIscreenPrivate *psp)
    driInitSingleExtension( NULL, NV_vp_extension );
    driInitSingleExtension( NULL, ATI_fs_extension );
    driInitExtensions( NULL, point_extensions, GL_FALSE );
-#elif defined(RADEON_COMMON_FOR_R300)
+#elif (defined(RADEON_COMMON_FOR_R300) || defined(RADEON_COMMON_FOR_R600))
    driInitSingleExtension( NULL, gl_20_extension );
 #endif
 
    if (!radeonInitDriver(psp))
        return NULL;
 
+   /* for now fill in all modes */
    return radeonFillInModes( psp,
 			     dri_priv->bpp,
 			     (dri_priv->bpp == 16) ? 16 : 24,
-			     (dri_priv->bpp == 16) ? 0  : 8,
-			     (dri_priv->backOffset != dri_priv->depthOffset) );
+			     (dri_priv->bpp == 16) ? 0  : 8, 1);
 }
+#define ARRAY_SIZE(a) (sizeof (a) / sizeof ((a)[0]))
+
+/**
+ * This is the driver specific part of the createNewScreen entry point.
+ * Called when using DRI2.
+ *
+ * \return the __GLcontextModes supported by this driver
+ */
+static const
+__DRIconfig **radeonInitScreen2(__DRIscreenPrivate *psp)
+{
+   GLenum fb_format[3];
+   GLenum fb_type[3];
+   /* GLX_SWAP_COPY_OML is only supported because the Intel driver doesn't
+    * support pageflipping at all.
+    */
+   static const GLenum back_buffer_modes[] = {
+     GLX_NONE, GLX_SWAP_UNDEFINED_OML, /*, GLX_SWAP_COPY_OML*/
+   };
+   uint8_t depth_bits[4], stencil_bits[4], msaa_samples_array[1];
+   int color;
+   __DRIconfig **configs = NULL;
+
+   /* Calling driInitExtensions here, with a NULL context pointer,
+    * does not actually enable the extensions.  It just makes sure
+    * that all the dispatch offsets for all the extensions that
+    * *might* be enables are known.  This is needed because the
+    * dispatch offsets need to be known when _mesa_context_create
+    * is called, but we can't enable the extensions until we have a
+    * context pointer.
+    *
+    * Hello chicken.  Hello egg.  How are you two today?
+    */
+   driInitExtensions( NULL, card_extensions, GL_FALSE );
+   driInitExtensions( NULL, mm_extensions, GL_FALSE );
+#if RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
+   driInitExtensions( NULL, blend_extensions, GL_FALSE );
+   driInitSingleExtension( NULL, ARB_vp_extension );
+   driInitSingleExtension( NULL, NV_vp_extension );
+   driInitSingleExtension( NULL, ATI_fs_extension );
+   driInitExtensions( NULL, point_extensions, GL_FALSE );
+#endif
 
+   if (!radeonInitDriver(psp)) {
+       return NULL;
+    }
+   depth_bits[0] = 0;
+   stencil_bits[0] = 0;
+   depth_bits[1] = 16;
+   stencil_bits[1] = 0;
+   depth_bits[2] = 24;
+   stencil_bits[2] = 0;
+   depth_bits[3] = 24;
+   stencil_bits[3] = 8;
+
+   msaa_samples_array[0] = 0;
+
+   fb_format[0] = GL_RGB;
+   fb_type[0] = GL_UNSIGNED_SHORT_5_6_5;
+
+   fb_format[1] = GL_BGR;
+   fb_type[1] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   fb_format[2] = GL_BGRA;
+   fb_type[2] = GL_UNSIGNED_INT_8_8_8_8_REV;
+
+   for (color = 0; color < ARRAY_SIZE(fb_format); color++) {
+      __DRIconfig **new_configs;
+
+      new_configs = driCreateConfigs(fb_format[color], fb_type[color],
+				     depth_bits,
+				     stencil_bits,
+				     ARRAY_SIZE(depth_bits),
+				     back_buffer_modes,
+				     ARRAY_SIZE(back_buffer_modes),
+				     msaa_samples_array,
+				     ARRAY_SIZE(msaa_samples_array));
+      if (configs == NULL)
+	 configs = new_configs;
+      else
+	 configs = driConcatConfigs(configs, new_configs);
+   }
+
+   if (configs == NULL) {
+      fprintf(stderr, "[%s:%u] Error creating FBConfig!\n", __func__,
+              __LINE__);
+      return NULL;
+   }
+
+   return (const __DRIconfig **)configs;
+}
 
 /**
  * Get information about previous buffer swaps.
@@ -1239,31 +1768,26 @@ radeonInitScreen(__DRIscreenPrivate *psp)
 static int
 getSwapInfo( __DRIdrawablePrivate *dPriv, __DRIswapInfo * sInfo )
 {
-#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
-   radeonContextPtr  rmesa;
-#elif RADEON_COMMON && defined(RADEON_COMMON_FOR_R200)
-   r200ContextPtr  rmesa;
-#endif
+    struct radeon_framebuffer *rfb;
 
-   if ( (dPriv == NULL) || (dPriv->driContextPriv == NULL)
-	|| (dPriv->driContextPriv->driverPrivate == NULL)
-	|| (sInfo == NULL) ) {
-      return -1;
+    if ( (dPriv == NULL) || (dPriv->driContextPriv == NULL)
+	 || (dPriv->driContextPriv->driverPrivate == NULL)
+	 || (sInfo == NULL) ) {
+	return -1;
    }
 
-   rmesa = dPriv->driContextPriv->driverPrivate;
-   sInfo->swap_count = rmesa->swap_count;
-   sInfo->swap_ust = rmesa->swap_ust;
-   sInfo->swap_missed_count = rmesa->swap_missed_count;
+    rfb = dPriv->driverPrivate;
+    sInfo->swap_count = rfb->swap_count;
+    sInfo->swap_ust = rfb->swap_ust;
+    sInfo->swap_missed_count = rfb->swap_missed_count;
 
    sInfo->swap_missed_usage = (sInfo->swap_missed_count != 0)
-       ? driCalculateSwapUsage( dPriv, 0, rmesa->swap_missed_ust )
+       ? driCalculateSwapUsage( dPriv, 0, rfb->swap_missed_ust )
        : 0.0;
 
    return 0;
 }
 
-#if !RADEON_COMMON || (RADEON_COMMON && defined(RADEON_COMMON_FOR_R300))
 const struct __DriverAPIRec driDriverAPI = {
    .InitScreen      = radeonInitScreen,
    .DestroyScreen   = radeonDestroyScreen,
@@ -1280,23 +1804,7 @@ const struct __DriverAPIRec driDriverAPI = {
    .WaitForSBC      = NULL,
    .SwapBuffersMSC  = NULL,
    .CopySubBuffer   = radeonCopySubBuffer,
+    /* DRI2 */
+   .InitScreen2     = radeonInitScreen2,
 };
-#else
-const struct __DriverAPIRec driDriverAPI = {
-   .InitScreen      = radeonInitScreen,
-   .DestroyScreen   = radeonDestroyScreen,
-   .CreateContext   = r200CreateContext,
-   .DestroyContext  = r200DestroyContext,
-   .CreateBuffer    = radeonCreateBuffer,
-   .DestroyBuffer   = radeonDestroyBuffer,
-   .SwapBuffers     = r200SwapBuffers,
-   .MakeCurrent     = r200MakeCurrent,
-   .UnbindContext   = r200UnbindContext,
-   .GetSwapInfo     = getSwapInfo,
-   .GetDrawableMSC  = driDrawableGetMSC32,
-   .WaitForMSC      = driWaitForMSC32,
-   .WaitForSBC      = NULL,
-   .SwapBuffersMSC  = NULL,
-   .CopySubBuffer   = r200CopySubBuffer,
-};
-#endif
+
diff --git a/src/mesa/drivers/dri/radeon/radeon_screen.h b/src/mesa/drivers/dri/radeon/radeon_screen.h
index b84c70bfae9..2a2f6b1b0b8 100644
--- a/src/mesa/drivers/dri/radeon/radeon_screen.h
+++ b/src/mesa/drivers/dri/radeon/radeon_screen.h
@@ -54,11 +54,12 @@ typedef struct {
    drmAddress map;			/* Mapping of the DRM region */
 } radeonRegionRec, *radeonRegionPtr;
 
-typedef struct {
+typedef struct radeon_screen {
    int chip_family;
    int chip_flags;
    int cpp;
    int card_type;
+   int device_id; /* PCI ID */
    int AGPMode;
    unsigned int irq;			/* IRQ number (0 means none) */
 
@@ -103,9 +104,12 @@ typedef struct {
    /* Configuration cache with default values for all contexts */
    driOptionCache optionCache;
 
-   const __DRIextension *extensions[8];
+   const __DRIextension *extensions[16];
 
    int num_gb_pipes;
+   int kernel_mm;
+   drm_radeon_sarea_t *sarea;	/* Private SAREA data */
+   struct radeon_bo_manager *bom;
 } radeonScreenRec, *radeonScreenPtr;
 
 #define IS_R100_CLASS(screen) \
@@ -114,5 +118,8 @@ typedef struct {
 	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R200)
 #define IS_R300_CLASS(screen) \
 	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R300)
+#define IS_R600_CLASS(screen) \
+	((screen->chip_flags & RADEON_CLASS_MASK) == RADEON_CLASS_R600)
 
+extern void radeonDestroyBuffer(__DRIdrawablePrivate *driDrawPriv);
 #endif /* __RADEON_SCREEN_H__ */
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.c b/src/mesa/drivers/dri/radeon/radeon_span.c
index 12051ff1c81..b2a468b4fd6 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.c
+++ b/src/mesa/drivers/dri/radeon/radeon_span.c
@@ -43,46 +43,168 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/glheader.h"
 #include "swrast/swrast.h"
 
-#include "radeon_context.h"
-#include "radeon_ioctl.h"
-#include "radeon_state.h"
+#include "radeon_common.h"
+#include "radeon_lock.h"
 #include "radeon_span.h"
-#include "radeon_tex.h"
-
-#include "drirenderbuffer.h"
 
 #define DBG 0
 
+static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb);
+
+/* radeon tiling on r300-r500 has 4 states,
+   macro-linear/micro-linear
+   macro-linear/micro-tiled
+   macro-tiled /micro-linear
+   macro-tiled /micro-tiled
+   1 byte surface 
+   2 byte surface - two types - we only provide 8x2 microtiling
+   4 byte surface
+   8/16 byte (unused)
+*/
+   
+static GLubyte *radeon_ptr_4byte(const struct radeon_renderbuffer * rrb,
+			     GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
+    GLint offset;
+
+    if (rrb->has_surface || !(rrb->bo->flags & mask)) {
+        offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+        offset = 0;
+        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+	    if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
+		offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 5)) << 11;
+		offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 10;
+		offset += (((y >> 4) ^ (x >> 4)) & 0x1) << 9;
+		offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 8;
+		offset += (((y >> 3) ^ (x >> 3)) & 0x1) << 7;
+		offset += ((y >> 1) & 0x1) << 6;
+		offset += ((x >> 2) & 0x1) << 5;
+		offset += (y & 1) << 4;
+		offset += (x & 3) << 2;
+            } else {
+		offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 6)) << 11;
+		offset += (((y >> 2) ^ (x >> 6)) & 0x1) << 10;
+		offset += (((y >> 3) ^ (x >> 5)) & 0x1) << 9;
+		offset += (((y >> 1) ^ (x >> 5)) & 0x1) << 8;
+		offset += (((y >> 2) ^ (x >> 4)) & 0x1) << 7;
+		offset += (y & 1) << 6;
+		offset += (x & 15) << 2;
+            }
+        } else {
+	    offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 2)) << 5;
+	    offset += (y & 1) << 4;
+	    offset += (x & 3) << 2;
+        }
+    }
+    return &ptr[offset];
+}
+
+static GLubyte *radeon_ptr_2byte_8x2(const struct radeon_renderbuffer * rrb,
+				     GLint x, GLint y)
+{
+    GLubyte *ptr = rrb->bo->ptr;
+    uint32_t mask = RADEON_BO_FLAGS_MACRO_TILE | RADEON_BO_FLAGS_MICRO_TILE;
+    GLint offset;
+
+    if (rrb->has_surface || !(rrb->bo->flags & mask)) {
+        offset = x * rrb->cpp + y * rrb->pitch;
+    } else {
+        offset = 0;
+        if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE) {
+            if (rrb->bo->flags & RADEON_BO_FLAGS_MICRO_TILE) {
+		offset = ((y >> 4) * (rrb->pitch >> 7) + (x >> 6)) << 11;
+		offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 10;
+		offset += (((y >> 4) ^ (x >> 5)) & 0x1) << 9;
+		offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 8;
+		offset += (((y >> 3) ^ (x >> 4)) & 0x1) << 7;
+		offset += ((y >> 1) & 0x1) << 6;
+		offset += ((x >> 3) & 0x1) << 5;
+		offset += (y & 1) << 4;
+		offset += (x & 3) << 2;
+            } else {
+		offset = ((y >> 3) * (rrb->pitch >> 8) + (x >> 7)) << 11;
+		offset += (((y >> 2) ^ (x >> 7)) & 0x1) << 10;
+		offset += (((y >> 3) ^ (x >> 6)) & 0x1) << 9;
+		offset += (((y >> 1) ^ (x >> 6)) & 0x1) << 8;
+		offset += (((y >> 2) ^ (x >> 5)) & 0x1) << 7;
+		offset += (y & 1) << 6;
+		offset += ((x >> 4) & 0x1) << 5;
+                offset += (x & 15) << 2;
+            }
+        } else {
+	    offset = ((y >> 1) * (rrb->pitch >> 4) + (x >> 3)) << 5;
+	    offset += (y & 0x1) << 4;
+	    offset += (x & 0x7) << 1;
+        }
+    }
+    return &ptr[offset];
+}
+
+#ifndef COMPILE_R300
+static uint32_t
+z24s8_to_s8z24(uint32_t val)
+{
+   return (val << 24) | (val >> 8);
+}
+
+static uint32_t
+s8z24_to_z24s8(uint32_t val)
+{
+   return (val >> 24) | (val << 8);
+}
+#endif
+
 /*
  * Note that all information needed to access pixels in a renderbuffer
  * should be obtained through the gl_renderbuffer parameter, not per-context
  * information.
  */
 #define LOCAL_VARS						\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;		\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;		\
-   const GLuint bottom = dPriv->h - 1;				\
-   GLubyte *buf = (GLubyte *) drb->flippedData			\
-      + (dPriv->y * drb->flippedPitch + dPriv->x) * drb->cpp;	\
-   GLuint p;							\
-   (void) p;
+   struct radeon_context *radeon = RADEON_CONTEXT(ctx);			\
+   struct radeon_renderbuffer *rrb = (void *) rb;		\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : rrb->base.Height - 1;\
+   unsigned int num_cliprects;						\
+   struct drm_clip_rect *cliprects;					\
+   int x_off, y_off;							\
+   GLuint p;						\
+   (void)p;						\
+   radeon_get_cliprects(radeon, &cliprects, &num_cliprects, &x_off, &y_off);
 
 #define LOCAL_DEPTH_VARS				\
-   driRenderbuffer *drb = (driRenderbuffer *) rb;	\
-   const __DRIdrawablePrivate *dPriv = drb->dPriv;	\
-   const GLuint bottom = dPriv->h - 1;			\
-   GLuint xo = dPriv->x;				\
-   GLuint yo = dPriv->y;				\
-   GLubyte *buf = (GLubyte *) drb->Base.Data;
+   struct radeon_context *radeon = RADEON_CONTEXT(ctx);			\
+   struct radeon_renderbuffer *rrb = (void *) rb;	\
+   const GLint yScale = ctx->DrawBuffer->Name ? 1 : -1;			\
+   const GLint yBias = ctx->DrawBuffer->Name ? 0 : rrb->base.Height - 1;\
+   unsigned int num_cliprects;						\
+   struct drm_clip_rect *cliprects;					\
+   int x_off, y_off;							\
+  radeon_get_cliprects(radeon, &cliprects, &num_cliprects, &x_off, &y_off);
 
 #define LOCAL_STENCIL_VARS LOCAL_DEPTH_VARS
 
-#define Y_FLIP(Y) (bottom - (Y))
+#define Y_FLIP(_y) ((_y) * yScale + yBias)
 
 #define HW_LOCK()
 
 #define HW_UNLOCK()
 
+/* XXX FBO: this is identical to the macro in spantmp2.h except we get
+ * the cliprect info from the context, not the driDrawable.
+ * Move this into spantmp2.h someday.
+ */
+#define HW_CLIPLOOP()							\
+   do {									\
+      int _nc = num_cliprects;						\
+      while ( _nc-- ) {							\
+	 int minx = cliprects[_nc].x1 - x_off;				\
+	 int miny = cliprects[_nc].y1 - y_off;				\
+	 int maxx = cliprects[_nc].x2 - x_off;				\
+	 int maxy = cliprects[_nc].y2 - y_off;
+
 /* ================================================================
  * Color buffer
  */
@@ -94,7 +216,41 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_RGB565
 #define TAG2(x,y) radeon##x##_RGB565##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 2)
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 16 bit, ARGB1555 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_1_5_5_5_REV
+
+#define TAG(x)    radeon##x##_ARGB1555
+#define TAG2(x,y) radeon##x##_ARGB1555##y
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 16 bit, RGBA4 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_SHORT_4_4_4_4_REV
+
+#define TAG(x)    radeon##x##_ARGB4444
+#define TAG2(x,y) radeon##x##_ARGB4444##y
+#define GET_PTR(X,Y) radeon_ptr_2byte_8x2(rrb, (X) + x_off, (Y) + y_off)
+#include "spantmp2.h"
+
+/* 32 bit, xRGB8888 color spanline and pixel functions
+ */
+#define SPANTMP_PIXEL_FMT GL_BGRA
+#define SPANTMP_PIXEL_TYPE GL_UNSIGNED_INT_8_8_8_8_REV
+
+#define TAG(x)    radeon##x##_xRGB8888
+#define TAG2(x,y) radeon##x##_xRGB8888##y
+#define GET_VALUE(_x, _y) ((*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) | 0xff000000))
+#define PUT_VALUE(_x, _y, d) { \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = d;								\
+} while (0)
 #include "spantmp2.h"
 
 /* 32 bit, ARGB8888 color spanline and pixel functions
@@ -104,7 +260,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #define TAG(x)    radeon##x##_ARGB8888
 #define TAG2(x,y) radeon##x##_ARGB8888##y
-#define GET_PTR(X,Y) (buf + ((Y) * drb->flippedPitch + (X)) * 4)
+#define GET_VALUE(_x, _y) (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)))
+#define PUT_VALUE(_x, _y, d) { \
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = d;								\
+} while (0)
 #include "spantmp2.h"
 
 /* ================================================================
@@ -121,70 +281,20 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  * too...
  */
 
-static GLuint radeon_mba_z32(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 4 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0..1] = 0           */
-
-#ifdef COMPILE_R300
-		ba = (y / 8) * (pitch / 8) + (x / 8);
-#else
-		ba = (y / 16) * (pitch / 16) + (x / 16);
-#endif
-
-		address |= (x & 0x7) << 2;	/* a[2..4] = x[0..2]     */
-		address |= (y & 0x3) << 5;	/* a[5..6] = y[0..1]     */
-		address |= (((x & 0x10) >> 2) ^ (y & 0x4)) << 5;	/* a[7]    = x[4] ^ y[2] */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= (((x & 0x8) << 1) ^ (y & 0x10)) << 7;	/* a[11]   = x[3] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
-static INLINE GLuint
-radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
-{
-	GLuint pitch = drb->pitch;
-	if (drb->depthHasSurface) {
-		return 2 * (x + y * pitch);
-	} else {
-		GLuint ba, address = 0;	/* a[0]    = 0           */
-
-		ba = (y / 16) * (pitch / 32) + (x / 32);
-
-		address |= (x & 0x7) << 1;	/* a[1..3] = x[0..2]     */
-		address |= (y & 0x7) << 4;	/* a[4..6] = y[0..2]     */
-		address |= (x & 0x8) << 4;	/* a[7]    = x[3]        */
-		address |= (ba & 0x3) << 8;	/* a[8..9] = ba[0..1]    */
-		address |= (y & 0x8) << 7;	/* a[10]   = y[3]        */
-		address |= ((x & 0x10) ^ (y & 0x10)) << 7;	/* a[11]   = x[4] ^ y[4] */
-		address |= (ba & ~0x3) << 10;	/* a[12..] = ba[2..]     */
-
-		return address;
-	}
-}
-
 /* 16-bit depth buffer functions
  */
 #define VALUE_TYPE GLushort
 
 #define WRITE_DEPTH( _x, _y, d )					\
-   *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo )) = d;
+   *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off) = d
 
 #define READ_DEPTH( d, _x, _y )						\
-   d = *(GLushort *)(buf + radeon_mba_z16( drb, _x + xo, _y + yo ));
+   d = *(GLushort *)radeon_ptr_2byte_8x2(rrb, _x + x_off, _y + y_off)
 
 #define TAG(x) radeon##x##_z16
 #include "depthtmp.h"
 
-/* 24 bit depth, 8 bit stencil depthbuffer functions
+/* 24 bit depth
  *
  * Careful: It looks like the R300 uses ZZZS byte order while the R200
  * uses SZZZ for 24 bit depth, 8 bit stencil mode.
@@ -194,35 +304,76 @@ radeon_mba_z16(const driRenderbuffer * drb, GLint x, GLint y)
 #ifdef COMPILE_R300
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0x000000ff;							\
    tmp |= ((d << 8) & 0xffffff00);					\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #else
 #define WRITE_DEPTH( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
+   GLuint tmp = *_ptr;							\
    tmp &= 0xff000000;							\
    tmp |= ((d) & 0x00ffffff);						\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #endif
 
 #ifdef COMPILE_R300
 #define READ_DEPTH( d, _x, _y )						\
-  do { \
-    d = (*(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,		\
-					 _y + yo )) & 0xffffff00) >> 8; \
+  do {									\
+    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)) & 0xffffff00) >> 8; \
   }while(0)
 #else
-#define READ_DEPTH( d, _x, _y )						\
-   d = *(GLuint *)(buf + radeon_mba_z32( drb, _x + xo,			\
-					 _y + yo )) & 0x00ffffff;
+#define READ_DEPTH( d, _x, _y )	\
+  d = *(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off)) & 0x00ffffff;
+#endif
+/*
+    fprintf(stderr, "dval(%d, %d, %d, %d)=0x%08X\n", _x, xo, _y, yo, d);\
+   d = *(GLuint*)(radeon_ptr(rrb, _x,	_y )) & 0x00ffffff;
+*/
+#define TAG(x) radeon##x##_z24
+#include "depthtmp.h"
+
+/* 24 bit depth, 8 bit stencil depthbuffer functions
+ * EXT_depth_stencil
+ *
+ * Careful: It looks like the R300 uses ZZZS byte order while the R200
+ * uses SZZZ for 24 bit depth, 8 bit stencil mode.
+ */
+#define VALUE_TYPE GLuint
+
+#ifdef COMPILE_R300
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   *_ptr = d;								\
+} while (0)
+#else
+#define WRITE_DEPTH( _x, _y, d )					\
+do {									\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );	\
+   GLuint tmp = z24s8_to_s8z24(d);					\
+   *_ptr = tmp;					\
+} while (0)
 #endif
 
+#ifdef COMPILE_R300
+#define READ_DEPTH( d, _x, _y )						\
+  do { \
+    d = (*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off, _y + y_off)));	\
+  }while(0)
+#else
+#define READ_DEPTH( d, _x, _y )	do {					\
+    d = s8z24_to_z24s8(*(GLuint*)(radeon_ptr_4byte(rrb, _x + x_off,	_y + y_off ))); \
+  } while (0)
+#endif
+/*
+    fprintf(stderr, "dval(%d, %d, %d, %d)=0x%08X\n", _x, xo, _y, yo, d);\
+   d = *(GLuint*)(radeon_ptr(rrb, _x,	_y )) & 0x00ffffff;
+*/
 #define TAG(x) radeon##x##_z24_s8
 #include "depthtmp.h"
 
@@ -235,35 +386,35 @@ do {									\
 #ifdef COMPILE_R300
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0xffffff00;							\
    tmp |= (d) & 0xff;							\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #else
 #define WRITE_STENCIL( _x, _y, d )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte(rrb, _x + x_off, _y + y_off);		\
+   GLuint tmp = *_ptr;				\
    tmp &= 0x00ffffff;							\
    tmp |= (((d) & 0xff) << 24);						\
-   *(GLuint *)(buf + offset) = tmp;					\
+   *_ptr = tmp;					\
 } while (0)
 #endif
 
 #ifdef COMPILE_R300
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    d = tmp & 0x000000ff;						\
 } while (0)
 #else
 #define READ_STENCIL( d, _x, _y )					\
 do {									\
-   GLuint offset = radeon_mba_z32( drb, _x + xo, _y + yo );		\
-   GLuint tmp = *(GLuint *)(buf + offset);				\
+   GLuint *_ptr = (GLuint*)radeon_ptr_4byte( rrb, _x + x_off, _y + y_off );		\
+   GLuint tmp = *_ptr;				\
    d = (tmp & 0xff000000) >> 24;					\
 } while (0)
 #endif
@@ -271,29 +422,110 @@ do {									\
 #define TAG(x) radeon##x##_z24_s8
 #include "stenciltmp.h"
 
-/* Move locking out to get reasonable span performance (10x better
- * than doing this in HW_LOCK above).  WaitForIdle() is the main
- * culprit.
- */
+
+static void map_unmap_rb(struct gl_renderbuffer *rb, int flag)
+{
+	struct radeon_renderbuffer *rrb = radeon_renderbuffer(rb);
+	int r;
+
+	if (rrb == NULL || !rrb->bo)
+		return;
+
+	if (flag) {
+		if (rrb->bo->bom->funcs->bo_wait)
+			radeon_bo_wait(rrb->bo);
+		r = radeon_bo_map(rrb->bo, 1);
+		if (r) {
+			fprintf(stderr, "(%s) error(%d) mapping buffer.\n",
+				__FUNCTION__, r);
+		}
+
+		radeonSetSpanFunctions(rrb);
+	} else {
+		radeon_bo_unmap(rrb->bo);
+		rb->GetRow = NULL;
+		rb->PutRow = NULL;
+	}
+}
+
+static void
+radeon_map_unmap_buffers(GLcontext *ctx, GLboolean map)
+{
+	GLuint i, j;
+
+	/* color draw buffers */
+	for (j = 0; j < ctx->DrawBuffer->_NumColorDrawBuffers; j++)
+		map_unmap_rb(ctx->DrawBuffer->_ColorDrawBuffers[j], map);
+
+	/* check for render to textures */
+	for (i = 0; i < BUFFER_COUNT; i++) {
+		struct gl_renderbuffer_attachment *att =
+			ctx->DrawBuffer->Attachment + i;
+		struct gl_texture_object *tex = att->Texture;
+		if (tex) {
+			/* Render to texture. Note that a mipmapped texture need not
+			 * be complete for render to texture, so we must restrict to
+			 * mapping only the attached image.
+			 */
+			radeon_texture_image *image = get_radeon_texture_image(tex->Image[att->CubeMapFace][att->TextureLevel]);
+			ASSERT(att->Renderbuffer);
+
+			if (map)
+				radeon_teximage_map(image, GL_TRUE);
+			else
+				radeon_teximage_unmap(image);
+		}
+	}
+
+	map_unmap_rb(ctx->ReadBuffer->_ColorReadBuffer, map);
+
+	/* depth buffer (Note wrapper!) */
+	if (ctx->DrawBuffer->_DepthBuffer)
+		map_unmap_rb(ctx->DrawBuffer->_DepthBuffer->Wrapped, map);
+
+	if (ctx->DrawBuffer->_StencilBuffer)
+		map_unmap_rb(ctx->DrawBuffer->_StencilBuffer->Wrapped, map);
+}
 
 static void radeonSpanRenderStart(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-#ifdef COMPILE_R300
-	r300ContextPtr r300 = (r300ContextPtr) rmesa;
-	R300_FIREVERTICES(r300);
-#else
-	RADEON_FIREVERTICES(rmesa);
-#endif
-	LOCK_HARDWARE(rmesa);
-	radeonWaitForIdleLocked(rmesa);
+	int i;
+
+	radeon_firevertices(rmesa);
+
+	/* The locking and wait for idle should really only be needed in classic mode.
+	 * In a future memory manager based implementation, this should become
+	 * unnecessary due to the fact that mapping our buffers, textures, etc.
+	 * should implicitly wait for any previous rendering commands that must
+	 * be waited on. */
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {
+		LOCK_HARDWARE(rmesa);
+		radeonWaitForIdleLocked(rmesa);
+	}
+
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.MapTexture(ctx, ctx->Texture.Unit[i]._Current);
+	}
+
+	radeon_map_unmap_buffers(ctx, 1);
 }
 
 static void radeonSpanRenderFinish(GLcontext * ctx)
 {
 	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	int i;
 	_swrast_flush(ctx);
-	UNLOCK_HARDWARE(rmesa);
+	if (!rmesa->radeonScreen->driScreen->dri2.enabled) {
+		UNLOCK_HARDWARE(rmesa);
+	}
+	for (i = 0; i < ctx->Const.MaxTextureImageUnits; i++) {
+		if (ctx->Texture.Unit[i]._ReallyEnabled)
+			ctx->Driver.UnmapTexture(ctx, ctx->Texture.Unit[i]._Current);
+	}
+
+	radeon_map_unmap_buffers(ctx, 0);
 }
 
 void radeonInitSpanFuncs(GLcontext * ctx)
@@ -307,20 +539,27 @@ void radeonInitSpanFuncs(GLcontext * ctx)
 /**
  * Plug in the Get/Put routines for the given driRenderbuffer.
  */
-void radeonSetSpanFunctions(driRenderbuffer * drb, const GLvisual * vis)
+static void radeonSetSpanFunctions(struct radeon_renderbuffer *rrb)
 {
-	if (drb->Base.InternalFormat == GL_RGBA) {
-		if (vis->redBits == 5 && vis->greenBits == 6
-		    && vis->blueBits == 5) {
-			radeonInitPointers_RGB565(&drb->Base);
-		} else {
-			radeonInitPointers_ARGB8888(&drb->Base);
-		}
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT16) {
-		radeonInitDepthPointers_z16(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_DEPTH_COMPONENT24) {
-		radeonInitDepthPointers_z24_s8(&drb->Base);
-	} else if (drb->Base.InternalFormat == GL_STENCIL_INDEX8_EXT) {
-		radeonInitStencilPointers_z24_s8(&drb->Base);
+	if (rrb->base._ActualFormat == GL_RGB5) {
+		radeonInitPointers_RGB565(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGB8) {
+		radeonInitPointers_xRGB8888(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGBA8) {
+		radeonInitPointers_ARGB8888(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGBA4) {
+		radeonInitPointers_ARGB4444(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_RGB5_A1) {
+		radeonInitPointers_ARGB1555(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT16) {
+		radeonInitDepthPointers_z16(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_DEPTH_COMPONENT24) {
+		radeonInitDepthPointers_z24(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_DEPTH24_STENCIL8_EXT) {
+		radeonInitDepthPointers_z24_s8(&rrb->base);
+	} else if (rrb->base._ActualFormat == GL_STENCIL_INDEX8_EXT) {
+		radeonInitStencilPointers_z24_s8(&rrb->base);
+	} else {
+		fprintf(stderr, "radeonSetSpanFunctions: bad actual format: 0x%04X\n", rrb->base._ActualFormat);
 	}
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_span.h b/src/mesa/drivers/dri/radeon/radeon_span.h
index 9abe0864b17..ea6a2e7fb4e 100644
--- a/src/mesa/drivers/dri/radeon/radeon_span.h
+++ b/src/mesa/drivers/dri/radeon/radeon_span.h
@@ -42,9 +42,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #ifndef __RADEON_SPAN_H__
 #define __RADEON_SPAN_H__
 
-#include "drirenderbuffer.h"
-
 extern void radeonInitSpanFuncs(GLcontext * ctx);
-extern void radeonSetSpanFunctions(driRenderbuffer * rb, const GLvisual * vis);
 
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.c b/src/mesa/drivers/dri/radeon/radeon_state.c
index b6561001e76..0d1728b747a 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state.c
@@ -47,6 +47,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "swrast_setup/swrast_setup.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
 #include "radeon_tcl.h"
@@ -62,7 +63,7 @@ static void radeonUpdateSpecular( GLcontext *ctx );
 
 static void radeonAlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    int pp_misc = rmesa->hw.ctx.cmd[CTX_PP_MISC];
    GLubyte refByte;
 
@@ -106,7 +107,7 @@ static void radeonAlphaFunc( GLcontext *ctx, GLenum func, GLfloat ref )
 static void radeonBlendEquationSeparate( GLcontext *ctx,
 					 GLenum modeRGB, GLenum modeA )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & ~RADEON_COMB_FCN_MASK;
    GLboolean fallback = GL_FALSE;
 
@@ -147,8 +148,8 @@ static void radeonBlendFuncSeparate( GLcontext *ctx,
 				     GLenum sfactorRGB, GLenum dfactorRGB,
 				     GLenum sfactorA, GLenum dfactorA )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] & 
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint b = rmesa->hw.ctx.cmd[CTX_RB3D_BLENDCNTL] &
       ~(RADEON_SRC_BLEND_MASK | RADEON_DST_BLEND_MASK);
    GLboolean fallback = GL_FALSE;
 
@@ -257,7 +258,7 @@ static void radeonBlendFuncSeparate( GLcontext *ctx,
 
 static void radeonDepthFunc( GLcontext *ctx, GLenum func )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, ctx );
    rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_Z_TEST_MASK;
@@ -293,7 +294,7 @@ static void radeonDepthFunc( GLcontext *ctx, GLenum func )
 
 static void radeonDepthMask( GLcontext *ctx, GLboolean flag )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    RADEON_STATECHANGE( rmesa, ctx );
 
    if ( ctx->Depth.Mask ) {
@@ -305,16 +306,16 @@ static void radeonDepthMask( GLcontext *ctx, GLboolean flag )
 
 static void radeonClearDepth( GLcontext *ctx, GLclampd d )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint format = (rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &
 		    RADEON_DEPTH_FORMAT_MASK);
 
    switch ( format ) {
    case RADEON_DEPTH_FORMAT_16BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x0000ffff;
+      rmesa->radeon.state.depth.clear = d * 0x0000ffff;
       break;
    case RADEON_DEPTH_FORMAT_24BIT_INT_Z:
-      rmesa->state.depth.clear = d * 0x00ffffff;
+      rmesa->radeon.state.depth.clear = d * 0x00ffffff;
       break;
    }
 }
@@ -327,7 +328,7 @@ static void radeonClearDepth( GLcontext *ctx, GLclampd d )
 
 static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    union { int i; float f; } c, d;
    GLchan col[4];
 
@@ -391,7 +392,7 @@ static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
 	 rmesa->hw.fog.cmd[FOG_D] = d.i;
       }
       break;
-   case GL_FOG_COLOR: 
+   case GL_FOG_COLOR:
       RADEON_STATECHANGE( rmesa, ctx );
       UNCLAMPED_FLOAT_TO_RGB_CHAN( col, ctx->Fog.Color );
       rmesa->hw.ctx.cmd[CTX_PP_FOG_COLOR] &= ~RADEON_FOG_COLOR_MASK;
@@ -406,109 +407,13 @@ static void radeonFogfv( GLcontext *ctx, GLenum pname, const GLfloat *param )
    }
 }
 
-
-/* =============================================================
- * Scissoring
- */
-
-
-static GLboolean intersect_rect( drm_clip_rect_t *out,
-				 drm_clip_rect_t *a,
-				 drm_clip_rect_t *b )
-{
-   *out = *a;
-   if ( b->x1 > out->x1 ) out->x1 = b->x1;
-   if ( b->y1 > out->y1 ) out->y1 = b->y1;
-   if ( b->x2 < out->x2 ) out->x2 = b->x2;
-   if ( b->y2 < out->y2 ) out->y2 = b->y2;
-   if ( out->x1 >= out->x2 ) return GL_FALSE;
-   if ( out->y1 >= out->y2 ) return GL_FALSE;
-   return GL_TRUE;
-}
-
-
-void radeonRecalcScissorRects( radeonContextPtr rmesa )
-{
-   drm_clip_rect_t *out;
-   int i;
-
-   /* Grow cliprect store?
-    */
-   if (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-      while (rmesa->state.scissor.numAllocedClipRects < rmesa->numClipRects) {
-	 rmesa->state.scissor.numAllocedClipRects += 1;	/* zero case */
-	 rmesa->state.scissor.numAllocedClipRects *= 2;
-      }
-
-      if (rmesa->state.scissor.pClipRects)
-	 FREE(rmesa->state.scissor.pClipRects);
-
-      rmesa->state.scissor.pClipRects = 
-	 MALLOC( rmesa->state.scissor.numAllocedClipRects * 
-		 sizeof(drm_clip_rect_t) );
-
-      if ( rmesa->state.scissor.pClipRects == NULL ) {
-	 rmesa->state.scissor.numAllocedClipRects = 0;
-	 return;
-      }
-   }
-   
-   out = rmesa->state.scissor.pClipRects;
-   rmesa->state.scissor.numClipRects = 0;
-
-   for ( i = 0 ; i < rmesa->numClipRects ;  i++ ) {
-      if ( intersect_rect( out, 
-			   &rmesa->pClipRects[i], 
-			   &rmesa->state.scissor.rect ) ) {
-	 rmesa->state.scissor.numClipRects++;
-	 out++;
-      }
-   }
-}
-
-
-static void radeonUpdateScissor( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if ( rmesa->dri.drawable ) {
-      __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-
-      int x = ctx->Scissor.X;
-      int y = dPriv->h - ctx->Scissor.Y - ctx->Scissor.Height;
-      int w = ctx->Scissor.X + ctx->Scissor.Width - 1;
-      int h = dPriv->h - ctx->Scissor.Y - 1;
-
-      rmesa->state.scissor.rect.x1 = x + dPriv->x;
-      rmesa->state.scissor.rect.y1 = y + dPriv->y;
-      rmesa->state.scissor.rect.x2 = w + dPriv->x + 1;
-      rmesa->state.scissor.rect.y2 = h + dPriv->y + 1;
-
-      radeonRecalcScissorRects( rmesa );
-   }
-}
-
-
-static void radeonScissor( GLcontext *ctx,
-			   GLint x, GLint y, GLsizei w, GLsizei h )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if ( ctx->Scissor.Enabled ) {
-      RADEON_FIREVERTICES( rmesa );	/* don't pipeline cliprect changes */
-      radeonUpdateScissor( ctx );
-   }
-
-}
-
-
 /* =============================================================
  * Culling
  */
 
 static void radeonCullFace( GLcontext *ctx, GLenum unused )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint s = rmesa->hw.set.cmd[SET_SE_CNTL];
    GLuint t = rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL];
 
@@ -545,7 +450,7 @@ static void radeonCullFace( GLcontext *ctx, GLenum unused )
 
 static void radeonFrontFace( GLcontext *ctx, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, set );
    rmesa->hw.set.cmd[SET_SE_CNTL] &= ~RADEON_FFACE_CULL_DIR_MASK;
@@ -553,6 +458,10 @@ static void radeonFrontFace( GLcontext *ctx, GLenum mode )
    RADEON_STATECHANGE( rmesa, tcl );
    rmesa->hw.tcl.cmd[TCL_UCP_VERT_BLEND_CTL] &= ~RADEON_CULL_FRONT_IS_CCW;
 
+   /* Winding is inverted when rendering to FBO */
+   if (ctx->DrawBuffer && ctx->DrawBuffer->Name)
+      mode = (mode == GL_CW) ? GL_CCW : GL_CW;
+
    switch ( mode ) {
    case GL_CW:
       rmesa->hw.set.cmd[SET_SE_CNTL] |= RADEON_FFACE_CULL_CW;
@@ -570,7 +479,7 @@ static void radeonFrontFace( GLcontext *ctx, GLenum mode )
  */
 static void radeonLineWidth( GLcontext *ctx, GLfloat widthf )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, lin );
    RADEON_STATECHANGE( rmesa, set );
@@ -587,10 +496,10 @@ static void radeonLineWidth( GLcontext *ctx, GLfloat widthf )
 
 static void radeonLineStipple( GLcontext *ctx, GLint factor, GLushort pattern )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, lin );
-   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] = 
+   rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] =
       ((((GLuint)factor & 0xff) << 16) | ((GLuint)pattern));
 }
 
@@ -602,12 +511,19 @@ static void radeonColorMask( GLcontext *ctx,
 			     GLboolean r, GLboolean g,
 			     GLboolean b, GLboolean a )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint mask = radeonPackColor( rmesa->radeonScreen->cpp,
-				  ctx->Color.ColorMask[RCOMP],
-				  ctx->Color.ColorMask[GCOMP],
-				  ctx->Color.ColorMask[BCOMP],
-				  ctx->Color.ColorMask[ACOMP] );
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb;
+   GLuint mask;
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
+
+   mask = radeonPackColor( rrb->cpp,
+			   ctx->Color.ColorMask[RCOMP],
+			   ctx->Color.ColorMask[GCOMP],
+			   ctx->Color.ColorMask[BCOMP],
+			   ctx->Color.ColorMask[ACOMP] );
 
    if ( rmesa->hw.msk.cmd[MSK_RB3D_PLANEMASK] != mask ) {
       RADEON_STATECHANGE( rmesa, msk );
@@ -623,8 +539,9 @@ static void radeonColorMask( GLcontext *ctx,
 static void radeonPolygonOffset( GLcontext *ctx,
 				 GLfloat factor, GLfloat units )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   float_ui32_type constant =  { units * rmesa->state.depth.scale };
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   float_ui32_type constant =  { units * depthScale };
    float_ui32_type factoru = { factor };
 
    RADEON_STATECHANGE( rmesa, zbs );
@@ -634,7 +551,7 @@ static void radeonPolygonOffset( GLcontext *ctx,
 
 static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint i;
    drm_radeon_stipple_t stipple;
 
@@ -646,27 +563,27 @@ static void radeonPolygonStipple( GLcontext *ctx, const GLubyte *mask )
 
    /* TODO: push this into cmd mechanism
     */
-   RADEON_FIREVERTICES( rmesa );
-   LOCK_HARDWARE( rmesa );
+   radeon_firevertices(&rmesa->radeon);
+   LOCK_HARDWARE( &rmesa->radeon );
 
    /* FIXME: Use window x,y offsets into stipple RAM.
     */
    stipple.mask = rmesa->state.stipple.mask;
-   drmCommandWrite( rmesa->dri.fd, DRM_RADEON_STIPPLE, 
+   drmCommandWrite( rmesa->radeon.dri.fd, DRM_RADEON_STIPPLE,
                     &stipple, sizeof(drm_radeon_stipple_t) );
-   UNLOCK_HARDWARE( rmesa );
+   UNLOCK_HARDWARE( &rmesa->radeon );
 }
 
 static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean flag = (ctx->_TriangleCaps & DD_TRI_UNFILLED) != 0;
 
    /* Can't generally do unfilled via tcl, but some good special
-    * cases work. 
+    * cases work.
     */
    TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_UNFILLED, flag);
-   if (rmesa->TclFallback) {
+   if (rmesa->radeon.TclFallback) {
       radeonChooseRenderState( ctx );
       radeonChooseVertexState( ctx );
    }
@@ -686,7 +603,7 @@ static void radeonPolygonMode( GLcontext *ctx, GLenum face, GLenum mode )
  */
 static void radeonUpdateSpecular( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    uint32_t p = rmesa->hw.ctx.cmd[CTX_PP_CNTL];
    GLuint flag = 0;
 
@@ -711,7 +628,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_TCL_VTX_PK_DIFFUSE;
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] |= RADEON_LIGHTING_ENABLE;
       p |=  RADEON_SPECULAR_ENABLE;
-      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= 
+      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &=
 	 ~RADEON_DIFFUSE_SPECULAR_COMBINE;
    }
    else if (ctx->Light.Enabled) {
@@ -741,7 +658,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
 	    RADEON_TCL_COMPUTE_SPECULAR) != 0;
       }
    }
- 
+
    TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_FOGCOORDSPEC, flag);
 
    if (NEED_SECONDARY_COLOR(ctx)) {
@@ -757,7 +674,7 @@ static void radeonUpdateSpecular( GLcontext *ctx )
 
    /* Update vertex/render formats
     */
-   if (rmesa->TclFallback) { 
+   if (rmesa->radeon.TclFallback) {
       radeonChooseRenderState( ctx );
       radeonChooseVertexState( ctx );
    }
@@ -769,12 +686,12 @@ static void radeonUpdateSpecular( GLcontext *ctx )
  */
 
 
-/* Update on colormaterial, material emmissive/ambient, 
+/* Update on colormaterial, material emmissive/ambient,
  * lightmodel.globalambient
  */
 static void update_global_ambient( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    float *fcmd = (float *)RADEON_DB_STATE( glt );
 
    /* Need to do more if both emmissive & ambient are PREMULT:
@@ -782,23 +699,23 @@ static void update_global_ambient( GLcontext *ctx )
     */
    if ((rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &
        ((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
-	(3 << RADEON_AMBIENT_SOURCE_SHIFT))) == 0) 
+	(3 << RADEON_AMBIENT_SOURCE_SHIFT))) == 0)
    {
-      COPY_3V( &fcmd[GLT_RED], 
+      COPY_3V( &fcmd[GLT_RED],
 	       ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_EMISSION]);
       ACC_SCALE_3V( &fcmd[GLT_RED],
 		   ctx->Light.Model.Ambient,
 		   ctx->Light.Material.Attrib[MAT_ATTRIB_FRONT_AMBIENT]);
-   } 
+   }
    else
    {
       COPY_3V( &fcmd[GLT_RED], ctx->Light.Model.Ambient );
    }
-   
+
    RADEON_DB_STATECHANGE(rmesa, &rmesa->hw.glt);
 }
 
-/* Update on change to 
+/* Update on change to
  *    - light[p].colors
  *    - light[p].enabled
  */
@@ -809,13 +726,13 @@ static void update_light_colors( GLcontext *ctx, GLuint p )
 /*     fprintf(stderr, "%s\n", __FUNCTION__); */
 
    if (l->Enabled) {
-      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+      r100ContextPtr rmesa = R100_CONTEXT(ctx);
       float *fcmd = (float *)RADEON_DB_STATE( lit[p] );
 
-      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );	 
+      COPY_4V( &fcmd[LIT_AMBIENT_RED], l->Ambient );
       COPY_4V( &fcmd[LIT_DIFFUSE_RED], l->Diffuse );
       COPY_4V( &fcmd[LIT_SPECULAR_RED], l->Specular );
-      
+
       RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.lit[p] );
    }
 }
@@ -829,7 +746,7 @@ static void check_twoside_fallback( GLcontext *ctx )
 
    if (ctx->Light.Enabled && ctx->Light.Model.TwoSide) {
       if (ctx->Light.ColorMaterialEnabled &&
-	  (ctx->Light.ColorMaterialBitmask & BACK_MATERIAL_BITS) != 
+	  (ctx->Light.ColorMaterialBitmask & BACK_MATERIAL_BITS) !=
 	  ((ctx->Light.ColorMaterialBitmask & FRONT_MATERIAL_BITS)<<1))
 	 fallback = GL_TRUE;
       else {
@@ -837,7 +754,7 @@ static void check_twoside_fallback( GLcontext *ctx )
 	    if (memcmp( ctx->Light.Material.Attrib[i],
 			ctx->Light.Material.Attrib[i+1],
 			sizeof(GLfloat)*4) != 0) {
-	       fallback = GL_TRUE;  
+	       fallback = GL_TRUE;
 	       break;
 	    }
       }
@@ -849,14 +766,14 @@ static void check_twoside_fallback( GLcontext *ctx )
 
 static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 {
-      radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+      r100ContextPtr rmesa = R100_CONTEXT(ctx);
       GLuint light_model_ctl1 = rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL];
 
       light_model_ctl1 &= ~((3 << RADEON_EMISSIVE_SOURCE_SHIFT) |
 			   (3 << RADEON_AMBIENT_SOURCE_SHIFT) |
 			   (3 << RADEON_DIFFUSE_SOURCE_SHIFT) |
-			   (3 << RADEON_SPECULAR_SOURCE_SHIFT)); 
-   
+			   (3 << RADEON_SPECULAR_SOURCE_SHIFT));
+
    if (ctx->Light.ColorMaterialEnabled) {
       GLuint mask = ctx->Light.ColorMaterialBitmask;
 
@@ -877,7 +794,7 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_STATE_MULT <<
 			     RADEON_AMBIENT_SOURCE_SHIFT);
       }
-	 
+
       if (mask & MAT_BIT_FRONT_DIFFUSE) {
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
 			     RADEON_DIFFUSE_SOURCE_SHIFT);
@@ -886,7 +803,7 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_STATE_MULT <<
 			     RADEON_DIFFUSE_SOURCE_SHIFT);
       }
-   
+
       if (mask & MAT_BIT_FRONT_SPECULAR) {
 	 light_model_ctl1 |= (RADEON_LM_SOURCE_VERTEX_DIFFUSE <<
 			     RADEON_SPECULAR_SOURCE_SHIFT);
@@ -904,27 +821,27 @@ static void radeonColorMaterial( GLcontext *ctx, GLenum face, GLenum mode )
 		   (RADEON_LM_SOURCE_STATE_MULT << RADEON_DIFFUSE_SOURCE_SHIFT) |
 		   (RADEON_LM_SOURCE_STATE_MULT << RADEON_SPECULAR_SOURCE_SHIFT);
    }
-   
+
       if (light_model_ctl1 != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) {
 	 RADEON_STATECHANGE( rmesa, tcl );
-	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = light_model_ctl1;      
+	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = light_model_ctl1;
    }
 }
 
 void radeonUpdateMaterial( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLfloat (*mat)[4] = ctx->Light.Material.Attrib;
    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( mtl );
    GLuint mask = ~0;
-   
+
    if (ctx->Light.ColorMaterialEnabled)
       mask &= ~ctx->Light.ColorMaterialBitmask;
 
    if (RADEON_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s\n", __FUNCTION__);
 
-      
+
    if (mask & MAT_BIT_FRONT_EMISSION) {
       fcmd[MTL_EMMISSIVE_RED]   = mat[MAT_ATTRIB_FRONT_EMISSION][0];
       fcmd[MTL_EMMISSIVE_GREEN] = mat[MAT_ATTRIB_FRONT_EMISSION][1];
@@ -974,11 +891,11 @@ void radeonUpdateMaterial( GLcontext *ctx )
  *
  * which are calculated in light.c and are correct for the current
  * lighting space (model or eye), hence dependencies on _NEW_MODELVIEW
- * and _MESA_NEW_NEED_EYE_COORDS.  
+ * and _MESA_NEW_NEED_EYE_COORDS.
  */
 static void update_light( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    /* Have to check these, or have an automatic shortcircuit mechanism
     * to remove noop statechanges. (Or just do a better job on the
@@ -991,12 +908,12 @@ static void update_light( GLcontext *ctx )
 	 tmp &= ~RADEON_LIGHT_IN_MODELSPACE;
       else
 	 tmp |= RADEON_LIGHT_IN_MODELSPACE;
-      
+
 
       /* Leave this test disabled: (unexplained q3 lockup) (even with
          new packets)
       */
-      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]) 
+      if (tmp != rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL])
       {
 	 RADEON_STATECHANGE( rmesa, tcl );
 	 rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] = tmp;
@@ -1020,10 +937,10 @@ static void update_light( GLcontext *ctx )
 	 if (ctx->Light.Light[p].Enabled) {
 	    struct gl_light *l = &ctx->Light.Light[p];
 	    GLfloat *fcmd = (GLfloat *)RADEON_DB_STATE( lit[p] );
-	    
+
 	    if (l->EyePosition[3] == 0.0) {
-	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm ); 
-	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm ); 
+	       COPY_3FV( &fcmd[LIT_POSITION_X], l->_VP_inf_norm );
+	       COPY_3FV( &fcmd[LIT_DIRECTION_X], l->_h_inf_norm );
 	       fcmd[LIT_POSITION_W] = 0;
 	       fcmd[LIT_DIRECTION_W] = 0;
 	    } else {
@@ -1043,30 +960,30 @@ static void update_light( GLcontext *ctx )
 static void radeonLightfv( GLcontext *ctx, GLenum light,
 			   GLenum pname, const GLfloat *params )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLint p = light - GL_LIGHT0;
    struct gl_light *l = &ctx->Light.Light[p];
    GLfloat *fcmd = (GLfloat *)rmesa->hw.lit[p].cmd;
-   
+
 
    switch (pname) {
-   case GL_AMBIENT:		
+   case GL_AMBIENT:
    case GL_DIFFUSE:
    case GL_SPECULAR:
       update_light_colors( ctx, p );
       break;
 
-   case GL_SPOT_DIRECTION: 
-      /* picked up in update_light */	
+   case GL_SPOT_DIRECTION:
+      /* picked up in update_light */
       break;
 
    case GL_POSITION: {
-      /* positions picked up in update_light, but can do flag here */	
+      /* positions picked up in update_light, but can do flag here */
       GLuint flag;
       GLuint idx = TCL_PER_LIGHT_CTL_0 + p/2;
 
       /* FIXME: Set RANGE_ATTEN only when needed */
-      if (p&1) 
+      if (p&1)
 	 flag = RADEON_LIGHT_1_IS_LOCAL;
       else
 	 flag = RADEON_LIGHT_0_IS_LOCAL;
@@ -1158,16 +1075,16 @@ static void radeonLightfv( GLcontext *ctx, GLenum light,
    }
 }
 
-		  
+
 
 
 static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
 				const GLfloat *param )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    switch (pname) {
-      case GL_LIGHT_MODEL_AMBIENT: 
+      case GL_LIGHT_MODEL_AMBIENT:
 	 update_global_ambient( ctx );
 	 break;
 
@@ -1188,7 +1105,7 @@ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
 
 	 check_twoside_fallback( ctx );
 
-	 if (rmesa->TclFallback) {
+	 if (rmesa->radeon.TclFallback) {
 	    radeonChooseRenderState( ctx );
 	    radeonChooseVertexState( ctx );
 	 }
@@ -1205,7 +1122,7 @@ static void radeonLightModelfv( GLcontext *ctx, GLenum pname,
 
 static void radeonShadeModel( GLcontext *ctx, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint s = rmesa->hw.set.cmd[SET_SE_CNTL];
 
    s &= ~(RADEON_DIFFUSE_SHADE_MASK |
@@ -1244,7 +1161,7 @@ static void radeonShadeModel( GLcontext *ctx, GLenum mode )
 static void radeonClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 {
    GLint p = (GLint) plane - (GLint) GL_CLIP_PLANE0;
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLint *ip = (GLint *)ctx->Transform._ClipUserPlane[p];
 
    RADEON_STATECHANGE( rmesa, ucp[p] );
@@ -1256,7 +1173,7 @@ static void radeonClipPlane( GLcontext *ctx, GLenum plane, const GLfloat *eq )
 
 static void radeonUpdateClipPlanes( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint p;
 
    for (p = 0; p < ctx->Const.MaxClipPlanes; p++) {
@@ -1281,7 +1198,7 @@ static void
 radeonStencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
                            GLint ref, GLuint mask )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint refmask = (((ctx->Stencil.Ref[0] & 0xff) << RADEON_STENCIL_REF_SHIFT) |
 		     ((ctx->Stencil.ValueMask[0] & 0xff) << RADEON_STENCIL_MASK_SHIFT));
 
@@ -1325,7 +1242,7 @@ radeonStencilFuncSeparate( GLcontext *ctx, GLenum face, GLenum func,
 static void
 radeonStencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    RADEON_STATECHANGE( rmesa, msk );
    rmesa->hw.msk.cmd[MSK_RB3D_STENCILREFMASK] &= ~RADEON_STENCIL_WRITE_MASK;
@@ -1336,20 +1253,20 @@ radeonStencilMaskSeparate( GLcontext *ctx, GLenum face, GLuint mask )
 static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
                                      GLenum zfail, GLenum zpass )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
    /* radeon 7200 have stencil bug, DEC and INC_WRAP will actually both do DEC_WRAP,
       and DEC_WRAP (and INVERT) will do INVERT. No way to get correct INC_WRAP and DEC,
       but DEC_WRAP can be fixed by using DEC and INC_WRAP at least use INC. */
-   
+
    GLuint tempRADEON_STENCIL_FAIL_DEC_WRAP;
    GLuint tempRADEON_STENCIL_FAIL_INC_WRAP;
    GLuint tempRADEON_STENCIL_ZFAIL_DEC_WRAP;
    GLuint tempRADEON_STENCIL_ZFAIL_INC_WRAP;
    GLuint tempRADEON_STENCIL_ZPASS_DEC_WRAP;
    GLuint tempRADEON_STENCIL_ZPASS_INC_WRAP;
-   
-   if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) {
+
+   if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_BROKEN_STENCIL) {
       tempRADEON_STENCIL_FAIL_DEC_WRAP = RADEON_STENCIL_FAIL_DEC;
       tempRADEON_STENCIL_FAIL_INC_WRAP = RADEON_STENCIL_FAIL_INC;
       tempRADEON_STENCIL_ZFAIL_DEC_WRAP = RADEON_STENCIL_ZFAIL_DEC;
@@ -1365,7 +1282,7 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
       tempRADEON_STENCIL_ZPASS_DEC_WRAP = RADEON_STENCIL_ZPASS_DEC_WRAP;
       tempRADEON_STENCIL_ZPASS_INC_WRAP = RADEON_STENCIL_ZPASS_INC_WRAP;
    }
-   
+
    RADEON_STATECHANGE( rmesa, ctx );
    rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] &= ~(RADEON_STENCIL_FAIL_MASK |
 					       RADEON_STENCIL_ZFAIL_MASK |
@@ -1455,9 +1372,9 @@ static void radeonStencilOpSeparate( GLcontext *ctx, GLenum face, GLenum fail,
 
 static void radeonClearStencil( GLcontext *ctx, GLint s )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   rmesa->state.stencil.clear = 
+   rmesa->radeon.state.stencil.clear =
       ((GLuint) (ctx->Stencil.Clear & 0xff) |
        (0xff << RADEON_STENCIL_MASK_SHIFT) |
        ((ctx->Stencil.WriteMask[0] & 0xff) << RADEON_STENCIL_WRITEMASK_SHIFT));
@@ -1481,20 +1398,30 @@ static void radeonClearStencil( GLcontext *ctx, GLint s )
  */
 void radeonUpdateWindow( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
-   GLfloat xoffset = (GLfloat)dPriv->x;
-   GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
+   GLfloat xoffset = dPriv ? (GLfloat) dPriv->x : 0;
+   GLfloat yoffset = dPriv ? (GLfloat) dPriv->y + dPriv->h : 0;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
+   const GLboolean render_to_fbo = (ctx->DrawBuffer ? (ctx->DrawBuffer->Name != 0) : 0);
+   const GLfloat depthScale = 1.0F / ctx->DrawBuffer->_DepthMaxF;
+   GLfloat y_scale, y_bias;
+
+   if (render_to_fbo) {
+      y_scale = 1.0;
+      y_bias = 0;
+   } else {
+      y_scale = -1.0;
+      y_bias = yoffset;
+   }
 
    float_ui32_type sx = { v[MAT_SX] };
    float_ui32_type tx = { v[MAT_TX] + xoffset + SUBPIXEL_X };
-   float_ui32_type sy = { - v[MAT_SY] };
-   float_ui32_type ty = { (- v[MAT_TY]) + yoffset + SUBPIXEL_Y };
-   float_ui32_type sz = { v[MAT_SZ] * rmesa->state.depth.scale };
-   float_ui32_type tz = { v[MAT_TZ] * rmesa->state.depth.scale };
+   float_ui32_type sy = { v[MAT_SY] * y_scale };
+   float_ui32_type ty = { (v[MAT_TY] * y_scale) + y_bias + SUBPIXEL_Y };
+   float_ui32_type sz = { v[MAT_SZ] * depthScale };
+   float_ui32_type tz = { v[MAT_TZ] * depthScale };
 
-   RADEON_FIREVERTICES( rmesa );
    RADEON_STATECHANGE( rmesa, vpt );
 
    rmesa->hw.vpt.cmd[VPT_SE_VPORT_XSCALE]  = sx.ui32;
@@ -1514,6 +1441,8 @@ static void radeonViewport( GLcontext *ctx, GLint x, GLint y,
     * values, or keep the originals hanging around.
     */
    radeonUpdateWindow( ctx );
+
+   radeon_viewport(ctx, x, y, width, height);
 }
 
 static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
@@ -1524,8 +1453,8 @@ static void radeonDepthRange( GLcontext *ctx, GLclampd nearval,
 
 void radeonUpdateViewportOffset( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   __DRIdrawablePrivate *dPriv = rmesa->dri.drawable;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   __DRIdrawablePrivate *dPriv = radeon_get_drawable(&rmesa->radeon);
    GLfloat xoffset = (GLfloat)dPriv->x;
    GLfloat yoffset = (GLfloat)dPriv->y + dPriv->h;
    const GLfloat *v = ctx->Viewport._WindowMap.m;
@@ -1555,8 +1484,8 @@ void radeonUpdateViewportOffset( GLcontext *ctx )
                 RADEON_STIPPLE_Y_OFFSET_MASK);
 
          /* add magic offsets, then invert */
-         stx = 31 - ((rmesa->dri.drawable->x - 1) & RADEON_STIPPLE_COORD_MASK);
-         sty = 31 - ((rmesa->dri.drawable->y + rmesa->dri.drawable->h - 1)
+         stx = 31 - ((dPriv->x - 1) & RADEON_STIPPLE_COORD_MASK);
+         sty = 31 - ((dPriv->y + dPriv->h - 1)
                      & RADEON_STIPPLE_COORD_MASK);
 
          m |= ((stx << RADEON_STIPPLE_X_OFFSET_SHIFT) |
@@ -1580,20 +1509,26 @@ void radeonUpdateViewportOffset( GLcontext *ctx )
 
 static void radeonClearColor( GLcontext *ctx, const GLfloat color[4] )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLubyte c[4];
+   struct radeon_renderbuffer *rrb;
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   if (!rrb)
+     return;
+     
    CLAMPED_FLOAT_TO_UBYTE(c[0], color[0]);
    CLAMPED_FLOAT_TO_UBYTE(c[1], color[1]);
    CLAMPED_FLOAT_TO_UBYTE(c[2], color[2]);
    CLAMPED_FLOAT_TO_UBYTE(c[3], color[3]);
-   rmesa->state.color.clear = radeonPackColor( rmesa->radeonScreen->cpp,
+   rmesa->radeon.state.color.clear = radeonPackColor( rrb->cpp,
 					       c[0], c[1], c[2], c[3] );
 }
 
 
 static void radeonRenderMode( GLcontext *ctx, GLenum mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    FALLBACK( rmesa, RADEON_FALLBACK_RENDER_MODE, (mode != GL_RENDER) );
 }
 
@@ -1619,7 +1554,7 @@ static GLuint radeon_rop_tab[] = {
 
 static void radeonLogicOpCode( GLcontext *ctx, GLenum opcode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint rop = (GLuint)opcode - GL_CLEAR;
 
    ASSERT( rop < 16 );
@@ -1628,105 +1563,13 @@ static void radeonLogicOpCode( GLcontext *ctx, GLenum opcode )
    rmesa->hw.msk.cmd[MSK_RB3D_ROPCNTL] = radeon_rop_tab[rop];
 }
 
-
-/**
- * Set up the cliprects for either front or back-buffer drawing.
- */
-void radeonSetCliprects( radeonContextPtr rmesa )
-{
-   __DRIdrawablePrivate *const drawable = rmesa->dri.drawable;
-   __DRIdrawablePrivate *const readable = rmesa->dri.readable;
-   GLframebuffer *const draw_fb = (GLframebuffer*) drawable->driverPrivate;
-   GLframebuffer *const read_fb = (GLframebuffer*) readable->driverPrivate;
-
-   if (draw_fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-      /* Can't ignore 2d windows if we are page flipping.
-       */
-      if ( drawable->numBackClipRects == 0 || rmesa->doPageFlip ) {
-	 rmesa->numClipRects = drawable->numClipRects;
-	 rmesa->pClipRects = drawable->pClipRects;
-      }
-      else {
-	 rmesa->numClipRects = drawable->numBackClipRects;
-	 rmesa->pClipRects = drawable->pBackClipRects;
-      }
-   }
-   else {
-      /* front buffer (or none, or multiple buffers */
-      rmesa->numClipRects = drawable->numClipRects;
-      rmesa->pClipRects = drawable->pClipRects;
-   }
-
-   if ((draw_fb->Width != drawable->w) || (draw_fb->Height != drawable->h)) {
-      _mesa_resize_framebuffer(rmesa->glCtx, draw_fb,
-			       drawable->w, drawable->h);
-      draw_fb->Initialized = GL_TRUE;
-   }
-
-   if (drawable != readable) {
-      if ((read_fb->Width != readable->w) || (read_fb->Height != readable->h)) {
-	 _mesa_resize_framebuffer(rmesa->glCtx, read_fb,
-				  readable->w, readable->h);
-	 read_fb->Initialized = GL_TRUE;
-      }
-   }
-
-   if (rmesa->state.scissor.enabled)
-      radeonRecalcScissorRects( rmesa );
-
-   rmesa->lastStamp = drawable->lastStamp;
-}
-
-
-/**
- * Called via glDrawBuffer.
- */
-static void radeonDrawBuffer( GLcontext *ctx, GLenum mode )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (RADEON_DEBUG & DEBUG_DRI)
-      fprintf(stderr, "%s %s\n", __FUNCTION__,
-	      _mesa_lookup_enum_by_nr( mode ));
-
-   RADEON_FIREVERTICES(rmesa);	/* don't pipeline cliprect changes */
-
-   if (ctx->DrawBuffer->_NumColorDrawBuffers != 1) {
-      /* 0 (GL_NONE) buffers or multiple color drawing buffers */
-      FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   switch ( ctx->DrawBuffer->_ColorDrawBufferIndexes[0] ) {
-   case BUFFER_FRONT_LEFT:
-   case BUFFER_BACK_LEFT:
-      FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_FALSE );
-      break;
-   default:
-      FALLBACK( rmesa, RADEON_FALLBACK_DRAW_BUFFER, GL_TRUE );
-      return;
-   }
-
-   radeonSetCliprects( rmesa );
-
-   /* We'll set the drawing engine's offset/pitch parameters later
-    * when we update other state.
-    */
-}
-
-static void radeonReadBuffer( GLcontext *ctx, GLenum mode )
-{
-   /* nothing, until we implement h/w glRead/CopyPixels or CopyTexImage */
-}
-
-
 /* =============================================================
  * State enable/disable
  */
 
 static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint p, flag;
 
    if ( RADEON_DEBUG & DEBUG_STATE )
@@ -1787,7 +1630,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_CLIP_PLANE2:
    case GL_CLIP_PLANE3:
    case GL_CLIP_PLANE4:
-   case GL_CLIP_PLANE5: 
+   case GL_CLIP_PLANE5:
       p = cap-GL_CLIP_PLANE0;
       RADEON_STATECHANGE( rmesa, tcl );
       if (state) {
@@ -1821,10 +1664,10 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
       RADEON_STATECHANGE(rmesa, ctx );
       if ( state ) {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~rmesa->radeon.state.color.roundEnable;
       } else {
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_DITHER_ENABLE;
-	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->state.color.roundEnable;
+	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  rmesa->radeon.state.color.roundEnable;
       }
       break;
 
@@ -1852,13 +1695,13 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_LIGHT7:
       RADEON_STATECHANGE(rmesa, tcl);
       p = cap - GL_LIGHT0;
-      if (p&1) 
+      if (p&1)
 	 flag = (RADEON_LIGHT_1_ENABLE |
-		 RADEON_LIGHT_1_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_1_ENABLE_AMBIENT |
 		 RADEON_LIGHT_1_ENABLE_SPECULAR);
       else
 	 flag = (RADEON_LIGHT_0_ENABLE |
-		 RADEON_LIGHT_0_ENABLE_AMBIENT | 
+		 RADEON_LIGHT_0_ENABLE_AMBIENT |
 		 RADEON_LIGHT_0_ENABLE_SPECULAR);
 
       if (state)
@@ -1866,7 +1709,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
       else
 	 rmesa->hw.tcl.cmd[p/2 + TCL_PER_LIGHT_CTL_0] &= ~flag;
 
-      /* 
+      /*
        */
       update_light_colors( ctx, p );
       break;
@@ -1904,7 +1747,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
 	 rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_ROP_ENABLE;
       }
       break;
-      
+
    case GL_NORMALIZE:
       RADEON_STATECHANGE( rmesa, tcl );
       if ( state ) {
@@ -1971,21 +1814,30 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    }
 
    case GL_SCISSOR_TEST:
-      RADEON_FIREVERTICES( rmesa );
-      rmesa->state.scissor.enabled = state;
+      radeon_firevertices(&rmesa->radeon);
+      rmesa->radeon.state.scissor.enabled = state;
       radeonUpdateScissor( ctx );
       break;
 
    case GL_STENCIL_TEST:
-      if ( rmesa->state.stencil.hwBuffer ) {
-	 RADEON_STATECHANGE( rmesa, ctx );
-	 if ( state ) {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_STENCIL_ENABLE;
+      {
+	 GLboolean hw_stencil = GL_FALSE;
+	 if (ctx->DrawBuffer) {
+	    struct radeon_renderbuffer *rrbStencil
+	       = radeon_get_renderbuffer(ctx->DrawBuffer, BUFFER_STENCIL);
+	    hw_stencil = (rrbStencil && rrbStencil->bo);
+	 }
+
+	 if (hw_stencil) {
+	    RADEON_STATECHANGE( rmesa, ctx );
+	    if ( state ) {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |=  RADEON_STENCIL_ENABLE;
+	    } else {
+	       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_STENCIL_ENABLE;
+	    }
 	 } else {
-	    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] &= ~RADEON_STENCIL_ENABLE;
+	    FALLBACK( rmesa, RADEON_FALLBACK_STENCIL, state );
 	 }
-      } else {
-	 FALLBACK( rmesa, RADEON_FALLBACK_STENCIL, state );
       }
       break;
 
@@ -1995,7 +1847,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
    case GL_TEXTURE_GEN_T:
       /* Picked up in radeonUpdateTextureState.
        */
-      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE; 
+      rmesa->recheck_texgen[ctx->Texture.CurrentUnit] = GL_TRUE;
       break;
 
    case GL_COLOR_SUM_EXT:
@@ -2010,7 +1862,7 @@ static void radeonEnable( GLcontext *ctx, GLenum cap, GLboolean state )
 
 static void radeonLightingSpaceChange( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean tmp;
    RADEON_STATECHANGE( rmesa, tcl );
 
@@ -2029,7 +1881,7 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
       rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL] &= ~RADEON_RESCALE_NORMALS;
    }
 
-   if (RADEON_DEBUG & DEBUG_STATE) 
+   if (RADEON_DEBUG & DEBUG_STATE)
       fprintf(stderr, "%s %d AFTER %x\n", __FUNCTION__, ctx->_NeedEyeCoords,
 	      rmesa->hw.tcl.cmd[TCL_LIGHT_MODEL_CTL]);
 }
@@ -2039,7 +1891,7 @@ static void radeonLightingSpaceChange( GLcontext *ctx )
  */
 
 
-void radeonUploadTexMatrix( radeonContextPtr rmesa,
+void radeonUploadTexMatrix( r100ContextPtr rmesa,
 			    int unit, GLboolean swapcols )
 {
 /* Here's how this works: on r100, only 3 tex coords can be submitted, so the
@@ -2065,7 +1917,7 @@ void radeonUploadTexMatrix( radeonContextPtr rmesa,
    int idx = TEXMAT_0 + unit;
    float *dest = ((float *)RADEON_DB_STATE( mat[idx] )) + MAT_ELT_0;
    int i;
-   struct gl_texture_unit tUnit = rmesa->glCtx->Texture.Unit[unit];
+   struct gl_texture_unit tUnit = rmesa->radeon.glCtx->Texture.Unit[unit];
    GLfloat *src = rmesa->tmpmat[unit].m;
 
    rmesa->TexMatColSwap &= ~(1 << unit);
@@ -2119,7 +1971,7 @@ void radeonUploadTexMatrix( radeonContextPtr rmesa,
 }
 
 
-static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
+static void upload_matrix( r100ContextPtr rmesa, GLfloat *src, int idx )
 {
    float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
    int i;
@@ -2135,7 +1987,7 @@ static void upload_matrix( radeonContextPtr rmesa, GLfloat *src, int idx )
    RADEON_DB_STATECHANGE( rmesa, &rmesa->hw.mat[idx] );
 }
 
-static void upload_matrix_t( radeonContextPtr rmesa, GLfloat *src, int idx )
+static void upload_matrix_t( r100ContextPtr rmesa, GLfloat *src, int idx )
 {
    float *dest = ((float *)RADEON_DB_STATE( mat[idx] ))+MAT_ELT_0;
    memcpy(dest, src, 16*sizeof(float));
@@ -2145,7 +1997,7 @@ static void upload_matrix_t( radeonContextPtr rmesa, GLfloat *src, int idx )
 
 static void update_texturematrix( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    GLuint tpc = rmesa->hw.tcl.cmd[TCL_TEXTURE_PROC_CTL];
    GLuint vs = rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXSEL];
    int unit;
@@ -2209,64 +2061,74 @@ static void update_texturematrix( GLcontext *ctx )
    }
 }
 
-
-/**
- * Tell the card where to render (offset, pitch).
- * Effected by glDrawBuffer, etc
- */
-void
-radeonUpdateDrawBuffer(GLcontext *ctx)
+static GLboolean r100ValidateBuffers(GLcontext *ctx)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_framebuffer *fb = ctx->DrawBuffer;
-   driRenderbuffer *drb;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   struct radeon_renderbuffer *rrb;
+   int i, ret;
 
-   if (fb->_ColorDrawBufferIndexes[0] == BUFFER_FRONT_LEFT) {
-      /* draw to front */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_FRONT_LEFT].Renderbuffer;
-   }
-   else if (fb->_ColorDrawBufferIndexes[0] == BUFFER_BACK_LEFT) {
-      /* draw to back */
-      drb = (driRenderbuffer *) fb->Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+   radeon_cs_space_reset_bos(rmesa->radeon.cmdbuf.cs);
+
+   rrb = radeon_get_colorbuffer(&rmesa->radeon);
+   /* color buffer */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
-   else {
-      /* drawing to multiple buffers, or none */
-      return;
+
+   /* depth buffer */
+   rrb = radeon_get_depthbuffer(&rmesa->radeon);
+   /* color buffer */
+   if (rrb && rrb->bo) {
+     radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, rrb->bo,
+				       0, RADEON_GEM_DOMAIN_VRAM);
    }
 
-   assert(drb);
-   assert(drb->flippedPitch);
+   for (i = 0; i < ctx->Const.MaxTextureImageUnits; ++i) {
+      radeonTexObj *t;
 
-   RADEON_STATECHANGE( rmesa, ctx );
+      if (!ctx->Texture.Unit[i]._ReallyEnabled)
+	 continue;
 
-   /* Note: we used the (possibly) page-flipped values */
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET]
-     = ((drb->flippedOffset + rmesa->radeonScreen->fbLocation)
-	& RADEON_COLOROFFSET_MASK);
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = drb->flippedPitch;
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
+      t = rmesa->state.texture.unit[i].texobj;
+      if (t->image_override && t->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
+      else if (t->mt->bo)
+	radeon_cs_space_add_persistent_bo(rmesa->radeon.cmdbuf.cs, t->mt->bo,
+			   RADEON_GEM_DOMAIN_GTT | RADEON_GEM_DOMAIN_VRAM, 0);
    }
-}
 
+   ret = radeon_cs_space_check_with_bo(rmesa->radeon.cmdbuf.cs, rmesa->radeon.dma.current, RADEON_GEM_DOMAIN_GTT, 0);
+   if (ret)
+       return GL_FALSE;
+   return GL_TRUE;
+}
 
-void radeonValidateState( GLcontext *ctx )
+GLboolean radeonValidateState( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint new_state = rmesa->NewGLState;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint new_state = rmesa->radeon.NewGLState;
 
-   if (new_state & (_NEW_BUFFERS | _NEW_COLOR | _NEW_PIXEL)) {
-     radeonUpdateDrawBuffer(ctx);
+   if (new_state & _NEW_BUFFERS) {
+     _mesa_update_framebuffer(ctx);
+     /* this updates the DrawBuffer's Width/Height if it's a FBO */
+     _mesa_update_draw_buffer_bounds(ctx);
+     RADEON_STATECHANGE(rmesa, ctx);
    }
 
    if (new_state & _NEW_TEXTURE) {
       radeonUpdateTextureState( ctx );
-      new_state |= rmesa->NewGLState; /* may add TEXTURE_MATRIX */
+      new_state |= rmesa->radeon.NewGLState; /* may add TEXTURE_MATRIX */
    }
 
+   /* we need to do a space check here */
+   if (!r100ValidateBuffers(ctx))
+     return GL_FALSE;
+
    /* Need an event driven matrix update?
     */
-   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION)) 
+   if (new_state & (_NEW_MODELVIEW|_NEW_PROJECTION))
       upload_matrix( rmesa, ctx->_ModelProjectMatrix.m, MODEL_PROJ );
 
    /* Need these for lighting (shouldn't upload otherwise)
@@ -2290,12 +2152,14 @@ void radeonValidateState( GLcontext *ctx )
    /* emit all active clip planes if projection matrix changes.
     */
    if (new_state & (_NEW_PROJECTION)) {
-      if (ctx->Transform.ClipPlanesEnabled) 
+      if (ctx->Transform.ClipPlanesEnabled)
 	 radeonUpdateClipPlanes( ctx );
    }
 
 
-   rmesa->NewGLState = 0;
+   rmesa->radeon.NewGLState = 0;
+
+   return GL_TRUE;
 }
 
 
@@ -2306,7 +2170,7 @@ static void radeonInvalidateState( GLcontext *ctx, GLuint new_state )
    _vbo_InvalidateState( ctx, new_state );
    _tnl_InvalidateState( ctx, new_state );
    _ae_invalidate_state( ctx, new_state );
-   RADEON_CONTEXT(ctx)->NewGLState |= new_state;
+   R100_CONTEXT(ctx)->radeon.NewGLState |= new_state;
 }
 
 
@@ -2317,8 +2181,8 @@ static GLboolean check_material( GLcontext *ctx )
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    GLint i;
 
-   for (i = _TNL_ATTRIB_MAT_FRONT_AMBIENT; 
-	i < _TNL_ATTRIB_MAT_BACK_INDEXES; 
+   for (i = _TNL_ATTRIB_MAT_FRONT_AMBIENT;
+	i < _TNL_ATTRIB_MAT_BACK_INDEXES;
 	i++)
       if (tnl->vb.AttribPtr[i] &&
 	  tnl->vb.AttribPtr[i]->stride)
@@ -2326,20 +2190,21 @@ static GLboolean check_material( GLcontext *ctx )
 
    return GL_FALSE;
 }
-      
+
 
 static void radeonWrapRunPipeline( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean has_material;
 
    if (0)
-      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->NewGLState);
+      fprintf(stderr, "%s, newstate: %x\n", __FUNCTION__, rmesa->radeon.NewGLState);
 
    /* Validate state:
     */
-   if (rmesa->NewGLState)
-      radeonValidateState( ctx );
+   if (rmesa->radeon.NewGLState)
+      if (!radeonValidateState( ctx ))
+	 FALLBACK(rmesa, RADEON_FALLBACK_TEXTURE, GL_TRUE);
 
    has_material = (ctx->Light.Enabled && check_material( ctx ));
 
@@ -2348,7 +2213,7 @@ static void radeonWrapRunPipeline( GLcontext *ctx )
    }
 
    /* Run the pipeline.
-    */ 
+    */
    _tnl_run_pipeline( ctx );
 
    if (has_material) {
diff --git a/src/mesa/drivers/dri/radeon/radeon_state.h b/src/mesa/drivers/dri/radeon/radeon_state.h
index 2171879f759..a7c8eef32a5 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state.h
+++ b/src/mesa/drivers/dri/radeon/radeon_state.h
@@ -39,30 +39,25 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 
 #include "radeon_context.h"
 
-extern void radeonInitState( radeonContextPtr rmesa );
+extern void radeonInitState( r100ContextPtr rmesa );
 extern void radeonInitStateFuncs( GLcontext *ctx );
 
 extern void radeonUpdateMaterial( GLcontext *ctx );
 
-extern void radeonSetCliprects( radeonContextPtr rmesa );
-extern void radeonRecalcScissorRects( radeonContextPtr rmesa );
 extern void radeonUpdateViewportOffset( GLcontext *ctx );
 extern void radeonUpdateWindow( GLcontext *ctx );
 extern void radeonUpdateDrawBuffer( GLcontext *ctx );
-extern void radeonUploadTexMatrix( radeonContextPtr rmesa,
+extern void radeonUploadTexMatrix( r100ContextPtr rmesa,
 				   int unit, GLboolean swapcols );
 
-extern void radeonValidateState( GLcontext *ctx );
-
-extern void radeonPrintDirty( radeonContextPtr rmesa,
-			      const char *msg );
+extern GLboolean radeonValidateState( GLcontext *ctx );
 
 
 extern void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode );
 #define FALLBACK( rmesa, bit, mode ) do {				\
    if ( 0 ) fprintf( stderr, "FALLBACK in %s: #%d=%d\n",		\
 		     __FUNCTION__, bit, mode );				\
-   radeonFallback( rmesa->glCtx, bit, mode );				\
+   radeonFallback( rmesa->radeon.glCtx, bit, mode );				\
 } while (0)
 
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_state_init.c b/src/mesa/drivers/dri/radeon/radeon_state_init.c
index 57dc3800501..aaa82b1d6a4 100644
--- a/src/mesa/drivers/dri/radeon/radeon_state_init.c
+++ b/src/mesa/drivers/dri/radeon/radeon_state_init.c
@@ -38,39 +38,140 @@
 #include "swrast_setup/swrast_setup.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_ioctl.h"
 #include "radeon_state.h"
 #include "radeon_tcl.h"
 #include "radeon_tex.h"
 #include "radeon_swtcl.h"
 
+#include "../r200/r200_reg.h"
+
 #include "xmlpool.h"
 
+/* New (1.3) state mechanism.  3 commands (packet, scalar, vector) in
+ * 1.3 cmdbuffers allow all previous state to be updated as well as
+ * the tcl scalar and vector areas.
+ */
+static struct {
+	int start;
+	int len;
+	const char *name;
+} packet[RADEON_MAX_STATE_PACKETS] = {
+	{RADEON_PP_MISC, 7, "RADEON_PP_MISC"},
+	{RADEON_PP_CNTL, 3, "RADEON_PP_CNTL"},
+	{RADEON_RB3D_COLORPITCH, 1, "RADEON_RB3D_COLORPITCH"},
+	{RADEON_RE_LINE_PATTERN, 2, "RADEON_RE_LINE_PATTERN"},
+	{RADEON_SE_LINE_WIDTH, 1, "RADEON_SE_LINE_WIDTH"},
+	{RADEON_PP_LUM_MATRIX, 1, "RADEON_PP_LUM_MATRIX"},
+	{RADEON_PP_ROT_MATRIX_0, 2, "RADEON_PP_ROT_MATRIX_0"},
+	{RADEON_RB3D_STENCILREFMASK, 3, "RADEON_RB3D_STENCILREFMASK"},
+	{RADEON_SE_VPORT_XSCALE, 6, "RADEON_SE_VPORT_XSCALE"},
+	{RADEON_SE_CNTL, 2, "RADEON_SE_CNTL"},
+	{RADEON_SE_CNTL_STATUS, 1, "RADEON_SE_CNTL_STATUS"},
+	{RADEON_RE_MISC, 1, "RADEON_RE_MISC"},
+	{RADEON_PP_TXFILTER_0, 6, "RADEON_PP_TXFILTER_0"},
+	{RADEON_PP_BORDER_COLOR_0, 1, "RADEON_PP_BORDER_COLOR_0"},
+	{RADEON_PP_TXFILTER_1, 6, "RADEON_PP_TXFILTER_1"},
+	{RADEON_PP_BORDER_COLOR_1, 1, "RADEON_PP_BORDER_COLOR_1"},
+	{RADEON_PP_TXFILTER_2, 6, "RADEON_PP_TXFILTER_2"},
+	{RADEON_PP_BORDER_COLOR_2, 1, "RADEON_PP_BORDER_COLOR_2"},
+	{RADEON_SE_ZBIAS_FACTOR, 2, "RADEON_SE_ZBIAS_FACTOR"},
+	{RADEON_SE_TCL_OUTPUT_VTX_FMT, 11, "RADEON_SE_TCL_OUTPUT_VTX_FMT"},
+	{RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED, 17,
+		    "RADEON_SE_TCL_MATERIAL_EMMISSIVE_RED"},
+	{R200_PP_TXCBLEND_0, 4, "R200_PP_TXCBLEND_0"},
+	{R200_PP_TXCBLEND_1, 4, "R200_PP_TXCBLEND_1"},
+	{R200_PP_TXCBLEND_2, 4, "R200_PP_TXCBLEND_2"},
+	{R200_PP_TXCBLEND_3, 4, "R200_PP_TXCBLEND_3"},
+	{R200_PP_TXCBLEND_4, 4, "R200_PP_TXCBLEND_4"},
+	{R200_PP_TXCBLEND_5, 4, "R200_PP_TXCBLEND_5"},
+	{R200_PP_TXCBLEND_6, 4, "R200_PP_TXCBLEND_6"},
+	{R200_PP_TXCBLEND_7, 4, "R200_PP_TXCBLEND_7"},
+	{R200_SE_TCL_LIGHT_MODEL_CTL_0, 6, "R200_SE_TCL_LIGHT_MODEL_CTL_0"},
+	{R200_PP_TFACTOR_0, 6, "R200_PP_TFACTOR_0"},
+	{R200_SE_VTX_FMT_0, 4, "R200_SE_VTX_FMT_0"},
+	{R200_SE_VAP_CNTL, 1, "R200_SE_VAP_CNTL"},
+	{R200_SE_TCL_MATRIX_SEL_0, 5, "R200_SE_TCL_MATRIX_SEL_0"},
+	{R200_SE_TCL_TEX_PROC_CTL_2, 5, "R200_SE_TCL_TEX_PROC_CTL_2"},
+	{R200_SE_TCL_UCP_VERT_BLEND_CTL, 1, "R200_SE_TCL_UCP_VERT_BLEND_CTL"},
+	{R200_PP_TXFILTER_0, 6, "R200_PP_TXFILTER_0"},
+	{R200_PP_TXFILTER_1, 6, "R200_PP_TXFILTER_1"},
+	{R200_PP_TXFILTER_2, 6, "R200_PP_TXFILTER_2"},
+	{R200_PP_TXFILTER_3, 6, "R200_PP_TXFILTER_3"},
+	{R200_PP_TXFILTER_4, 6, "R200_PP_TXFILTER_4"},
+	{R200_PP_TXFILTER_5, 6, "R200_PP_TXFILTER_5"},
+	{R200_PP_TXOFFSET_0, 1, "R200_PP_TXOFFSET_0"},
+	{R200_PP_TXOFFSET_1, 1, "R200_PP_TXOFFSET_1"},
+	{R200_PP_TXOFFSET_2, 1, "R200_PP_TXOFFSET_2"},
+	{R200_PP_TXOFFSET_3, 1, "R200_PP_TXOFFSET_3"},
+	{R200_PP_TXOFFSET_4, 1, "R200_PP_TXOFFSET_4"},
+	{R200_PP_TXOFFSET_5, 1, "R200_PP_TXOFFSET_5"},
+	{R200_SE_VTE_CNTL, 1, "R200_SE_VTE_CNTL"},
+	{R200_SE_TCL_OUTPUT_VTX_COMP_SEL, 1,
+	 "R200_SE_TCL_OUTPUT_VTX_COMP_SEL"},
+	{R200_PP_TAM_DEBUG3, 1, "R200_PP_TAM_DEBUG3"},
+	{R200_PP_CNTL_X, 1, "R200_PP_CNTL_X"},
+	{R200_RB3D_DEPTHXY_OFFSET, 1, "R200_RB3D_DEPTHXY_OFFSET"},
+	{R200_RE_AUX_SCISSOR_CNTL, 1, "R200_RE_AUX_SCISSOR_CNTL"},
+	{R200_RE_SCISSOR_TL_0, 2, "R200_RE_SCISSOR_TL_0"},
+	{R200_RE_SCISSOR_TL_1, 2, "R200_RE_SCISSOR_TL_1"},
+	{R200_RE_SCISSOR_TL_2, 2, "R200_RE_SCISSOR_TL_2"},
+	{R200_SE_VAP_CNTL_STATUS, 1, "R200_SE_VAP_CNTL_STATUS"},
+	{R200_SE_VTX_STATE_CNTL, 1, "R200_SE_VTX_STATE_CNTL"},
+	{R200_RE_POINTSIZE, 1, "R200_RE_POINTSIZE"},
+	{R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0, 4,
+		    "R200_SE_TCL_INPUT_VTX_VECTOR_ADDR_0"},
+	{R200_PP_CUBIC_FACES_0, 1, "R200_PP_CUBIC_FACES_0"},	/* 61 */
+	{R200_PP_CUBIC_OFFSET_F1_0, 5, "R200_PP_CUBIC_OFFSET_F1_0"}, /* 62 */
+	{R200_PP_CUBIC_FACES_1, 1, "R200_PP_CUBIC_FACES_1"},
+	{R200_PP_CUBIC_OFFSET_F1_1, 5, "R200_PP_CUBIC_OFFSET_F1_1"},
+	{R200_PP_CUBIC_FACES_2, 1, "R200_PP_CUBIC_FACES_2"},
+	{R200_PP_CUBIC_OFFSET_F1_2, 5, "R200_PP_CUBIC_OFFSET_F1_2"},
+	{R200_PP_CUBIC_FACES_3, 1, "R200_PP_CUBIC_FACES_3"},
+	{R200_PP_CUBIC_OFFSET_F1_3, 5, "R200_PP_CUBIC_OFFSET_F1_3"},
+	{R200_PP_CUBIC_FACES_4, 1, "R200_PP_CUBIC_FACES_4"},
+	{R200_PP_CUBIC_OFFSET_F1_4, 5, "R200_PP_CUBIC_OFFSET_F1_4"},
+	{R200_PP_CUBIC_FACES_5, 1, "R200_PP_CUBIC_FACES_5"},
+	{R200_PP_CUBIC_OFFSET_F1_5, 5, "R200_PP_CUBIC_OFFSET_F1_5"},
+	{RADEON_PP_TEX_SIZE_0, 2, "RADEON_PP_TEX_SIZE_0"},
+	{RADEON_PP_TEX_SIZE_1, 2, "RADEON_PP_TEX_SIZE_1"},
+	{RADEON_PP_TEX_SIZE_2, 2, "RADEON_PP_TEX_SIZE_2"},
+	{R200_RB3D_BLENDCOLOR, 3, "R200_RB3D_BLENDCOLOR"},
+	{R200_SE_TCL_POINT_SPRITE_CNTL, 1, "R200_SE_TCL_POINT_SPRITE_CNTL"},
+	{RADEON_PP_CUBIC_FACES_0, 1, "RADEON_PP_CUBIC_FACES_0"},
+	{RADEON_PP_CUBIC_OFFSET_T0_0, 5, "RADEON_PP_CUBIC_OFFSET_T0_0"},
+	{RADEON_PP_CUBIC_FACES_1, 1, "RADEON_PP_CUBIC_FACES_1"},
+	{RADEON_PP_CUBIC_OFFSET_T1_0, 5, "RADEON_PP_CUBIC_OFFSET_T1_0"},
+	{RADEON_PP_CUBIC_FACES_2, 1, "RADEON_PP_CUBIC_FACES_2"},
+	{RADEON_PP_CUBIC_OFFSET_T2_0, 5, "RADEON_PP_CUBIC_OFFSET_T2_0"},
+	{R200_PP_TRI_PERF, 2, "R200_PP_TRI_PERF"},
+	{R200_PP_TXCBLEND_8, 32, "R200_PP_AFS_0"},     /* 85 */
+	{R200_PP_TXCBLEND_0, 32, "R200_PP_AFS_1"},
+	{R200_PP_TFACTOR_0, 8, "R200_ATF_TFACTOR"},
+	{R200_PP_TXFILTER_0, 8, "R200_PP_TXCTLALL_0"},
+	{R200_PP_TXFILTER_1, 8, "R200_PP_TXCTLALL_1"},
+	{R200_PP_TXFILTER_2, 8, "R200_PP_TXCTLALL_2"},
+	{R200_PP_TXFILTER_3, 8, "R200_PP_TXCTLALL_3"},
+	{R200_PP_TXFILTER_4, 8, "R200_PP_TXCTLALL_4"},
+	{R200_PP_TXFILTER_5, 8, "R200_PP_TXCTLALL_5"},
+	{R200_VAP_PVS_CNTL_1, 2, "R200_VAP_PVS_CNTL"},
+};
+
 /* =============================================================
  * State initialization
  */
-
-void radeonPrintDirty( radeonContextPtr rmesa, const char *msg )
+static int cmdpkt( r100ContextPtr rmesa, int id ) 
 {
-   struct radeon_state_atom *l;
-
-   fprintf(stderr, msg);
-   fprintf(stderr, ": ");
+   drm_radeon_cmd_header_t h;
 
-   foreach(l, &rmesa->hw.atomlist) {
-      if (l->dirty || rmesa->hw.all_dirty)
-	 fprintf(stderr, "%s, ", l->name);
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+     return CP_PACKET0(packet[id].start, packet[id].len - 1);
+   } else {
+     h.i = 0;
+     h.packet.cmd_type = RADEON_CMD_PACKET;
+     h.packet.packet_id = id;
    }
-
-   fprintf(stderr, "\n");
-}
-
-static int cmdpkt( int id ) 
-{
-   drm_radeon_cmd_header_t h;
-   h.i = 0;
-   h.packet.cmd_type = RADEON_CMD_PACKET;
-   h.packet.packet_id = id;
    return h.i;
 }
 
@@ -96,17 +197,17 @@ static int cmdscl( int offset, int stride, int count )
    return h.i;
 }
 
-#define CHECK( NM, FLAG )			\
-static GLboolean check_##NM( GLcontext *ctx )	\
-{						\
-   return FLAG;					\
+#define CHECK( NM, FLAG )				\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
+{							\
+   return FLAG ? atom->cmd_size : 0;			\
 }
 
 #define TCL_CHECK( NM, FLAG )				\
-static GLboolean check_##NM( GLcontext *ctx )		\
+static int check_##NM( GLcontext *ctx, struct radeon_state_atom *atom )	\
 {							\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);	\
-   return !rmesa->TclFallback && (FLAG);		\
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);	\
+   return (!rmesa->radeon.TclFallback && (FLAG)) ? atom->cmd_size : 0;	\
 }
 
 
@@ -146,81 +247,429 @@ CHECK( txr0, (ctx->Texture.Unit[0]._ReallyEnabled & TEXTURE_RECT_BIT))
 CHECK( txr1, (ctx->Texture.Unit[1]._ReallyEnabled & TEXTURE_RECT_BIT))
 CHECK( txr2, (ctx->Texture.Unit[2]._ReallyEnabled & TEXTURE_RECT_BIT))
 
+#define OUT_VEC(hdr, data) do {			\
+    drm_radeon_cmd_header_t h;					\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(RADEON_SE_TCL_STATE_FLUSH, 0));		\
+    OUT_BATCH(0);							\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_VECTOR_INDX_REG, 0));		\
+    OUT_BATCH(h.vectors.offset | (h.vectors.stride << RADEON_VEC_INDX_OCTWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_VECTOR_DATA_REG, h.vectors.count - 1));	\
+    OUT_BATCH_TABLE((data), h.vectors.count);				\
+  } while(0)
+
+#define OUT_SCL(hdr, data) do {					\
+    drm_radeon_cmd_header_t h;						\
+    h.i = hdr;								\
+    OUT_BATCH(CP_PACKET0(R200_SE_TCL_SCALAR_INDX_REG, 0));		\
+    OUT_BATCH((h.scalars.offset) | (h.scalars.stride << RADEON_SCAL_INDX_DWORD_STRIDE_SHIFT)); \
+    OUT_BATCH(CP_PACKET0_ONE(R200_SE_TCL_SCALAR_DATA_REG, h.scalars.count - 1));	\
+    OUT_BATCH_TABLE((data), h.scalars.count);				\
+  } while(0)
+
+static void scl_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->cmd_size;
+   
+   dwords += 2;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_SCL(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
 
 
-/* Initialize the context's hardware state.
- */
-void radeonInitState( radeonContextPtr rmesa )
+static void vec_emit(GLcontext *ctx, struct radeon_state_atom *atom)
 {
-   GLcontext *ctx = rmesa->glCtx;
-   GLuint color_fmt, depth_fmt, i;
-   GLint drawPitch, drawOffset;
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 4;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[0], atom->cmd+1);
+   END_BATCH();
+}
 
-   switch ( rmesa->radeonScreen->cpp ) {
-   case 2:
-      color_fmt = RADEON_COLOR_FORMAT_RGB565;
-      break;
-   case 4:
-      color_fmt = RADEON_COLOR_FORMAT_ARGB8888;
-      break;
-   default:
-      fprintf( stderr, "Error: Unsupported pixel depth... exiting\n" );
-      exit( -1 );
+
+static void lit_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->cmd_size;
+
+   dwords += 6;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+   OUT_VEC(atom->cmd[LIT_CMD_0], atom->cmd+1);
+   OUT_SCL(atom->cmd[LIT_CMD_1], atom->cmd+LIT_CMD_1+1);
+   END_BATCH();
+}
+
+static void ctx_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   struct radeon_renderbuffer *rrb;
+   uint32_t cbpitch;
+   uint32_t zbpitch, depth_fmt;
+   uint32_t dwords = atom->cmd_size;
+
+   /* output the first 7 bytes of context */
+   BEGIN_BATCH_NO_AUTOSTATE(dwords + 4);
+   OUT_BATCH_TABLE(atom->cmd, 5);
+
+   rrb = radeon_get_depthbuffer(&r100->radeon);
+   if (!rrb) {
+     OUT_BATCH(0);
+     OUT_BATCH(0);
+   } else {
+     zbpitch = (rrb->pitch / rrb->cpp);
+     if (r100->using_hyperz)
+       zbpitch |= RADEON_DEPTH_HYPERZ;
+
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+     OUT_BATCH(zbpitch);
+     if (rrb->cpp == 4)
+        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+     else
+        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
+   }
+     
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(atom->cmd[CTX_CMD_1]);
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+
+   rrb = radeon_get_colorbuffer(&r100->radeon);
+   if (!rrb || !rrb->bo) {
+      OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+      OUT_BATCH(atom->cmd[CTX_RB3D_COLOROFFSET]);
+   } else {
+      atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
+      if (rrb->cpp == 4)
+         atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
+      else
+         atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
+
+      OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+      OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+   }
+
+   OUT_BATCH(atom->cmd[CTX_CMD_2]);
+
+   if (!rrb || !rrb->bo) {
+     OUT_BATCH(atom->cmd[CTX_RB3D_COLORPITCH]);
+   } else {
+     cbpitch = (rrb->pitch / rrb->cpp);
+     if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= RADEON_COLOR_TILE_ENABLE;
+     OUT_BATCH(cbpitch);
+   }
+
+   END_BATCH();
+}
+
+static void ctx_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   struct radeon_renderbuffer *rrb, *drb;
+   uint32_t cbpitch = 0;
+   uint32_t zbpitch = 0;
+   uint32_t dwords = atom->cmd_size;
+   uint32_t depth_fmt;
+
+   rrb = radeon_get_colorbuffer(&r100->radeon);
+   if (!rrb || !rrb->bo) {
+      fprintf(stderr, "no rrb\n");
+      return;
+   }
+
+   atom->cmd[CTX_RB3D_CNTL] &= ~(0xf << 10);
+   if (rrb->cpp == 4)
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB8888;
+   else switch (rrb->base._ActualFormat) {
+   case GL_RGB5:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_RGB565;
+	break;
+   case GL_RGBA4:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB4444;
+	break;
+   case GL_RGB5_A1:
+	atom->cmd[CTX_RB3D_CNTL] |= RADEON_COLOR_FORMAT_ARGB1555;
+	break;
+   }
+
+   cbpitch = (rrb->pitch / rrb->cpp);
+   if (rrb->bo->flags & RADEON_BO_FLAGS_MACRO_TILE)
+       cbpitch |= R200_COLOR_TILE_ENABLE;
+
+   drb = radeon_get_depthbuffer(&r100->radeon);
+   if (drb) {
+     zbpitch = (drb->pitch / drb->cpp);
+     if (drb->cpp == 4)
+        depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
+     else
+        depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] &= ~RADEON_DEPTH_FORMAT_MASK;
+     atom->cmd[CTX_RB3D_ZSTENCILCNTL] |= depth_fmt;
+     
+   }
+
+   /* output the first 7 bytes of context */
+   dwords = 10;
+   if (drb)
+     dwords += 6;
+   if (rrb)
+     dwords += 6;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   /* In the CS case we need to split this up */
+   OUT_BATCH(CP_PACKET0(packet[0].start, 3));
+   OUT_BATCH_TABLE((atom->cmd + 1), 4);
+
+   if (drb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHOFFSET, 0));
+     OUT_BATCH_RELOC(0, drb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_DEPTHPITCH, 0));
+     OUT_BATCH(zbpitch);
+   }
+
+   OUT_BATCH(CP_PACKET0(RADEON_RB3D_ZSTENCILCNTL, 0));
+   OUT_BATCH(atom->cmd[CTX_RB3D_ZSTENCILCNTL]);
+   OUT_BATCH(CP_PACKET0(RADEON_PP_CNTL, 1));
+   OUT_BATCH(atom->cmd[CTX_PP_CNTL]);
+   OUT_BATCH(atom->cmd[CTX_RB3D_CNTL]);
+
+   if (rrb) {
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLOROFFSET, 0));
+     OUT_BATCH_RELOC(0, rrb->bo, 0, 0, RADEON_GEM_DOMAIN_VRAM, 0);
+
+     OUT_BATCH(CP_PACKET0(RADEON_RB3D_COLORPITCH, 0));
+     OUT_BATCH(cbpitch);
+   }
+
+   // if (atom->cmd_size == CTX_STATE_SIZE_NEWDRM) {
+   //   OUT_BATCH_TABLE((atom->cmd + 14), 4);
+   // }
+
+   END_BATCH();
+   BEGIN_BATCH_NO_AUTOSTATE(4);
+   OUT_BATCH(CP_PACKET0(RADEON_RE_TOP_LEFT, 0));
+   OUT_BATCH(0);
+   OUT_BATCH(CP_PACKET0(RADEON_RE_WIDTH_HEIGHT, 0));
+   if (rrb) {
+       OUT_BATCH(((rrb->base.Width - 1) << RADEON_RE_WIDTH_SHIFT) |
+                 ((rrb->base.Height - 1) << RADEON_RE_HEIGHT_SHIFT));
+   } else {
+       OUT_BATCH(0);
+   }
+   END_BATCH();
+}
+
+static void cube_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = 3;
+   int i = atom->idx, j;
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+
+   if (!(ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT))
+	return;
+
+   if (!t)
+	return;
+
+   if (!t->mt)
+	return;
+
+   BEGIN_BATCH_NO_AUTOSTATE(dwords + (5 * 3));
+   OUT_BATCH_TABLE(atom->cmd, 3);
+   lvl = &t->mt->levels[0];
+   for (j = 0; j < 5; j++) {
+	OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
+			RADEON_GEM_DOMAIN_VRAM, 0, 0);
+   }
+   END_BATCH();
+}
+
+static void cube_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = 2;
+   int i = atom->idx, j;
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+   uint32_t base_reg;
+
+   if (!(ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT))
+	return;
+
+   if (!t)
+	return;
+
+   if (!t->mt)
+	return;
+
+   switch(i) {
+	case 1: base_reg = RADEON_PP_CUBIC_OFFSET_T1_0; break;
+	case 2: base_reg = RADEON_PP_CUBIC_OFFSET_T2_0; break;
+	default:
+	case 0: base_reg = RADEON_PP_CUBIC_OFFSET_T0_0; break;
+   };
+   BEGIN_BATCH_NO_AUTOSTATE(dwords + (5 * 4));
+   OUT_BATCH_TABLE(atom->cmd, 2);
+   lvl = &t->mt->levels[0];
+   for (j = 0; j < 5; j++) {
+	OUT_BATCH(CP_PACKET0(base_reg + (4 * j), 0));
+	OUT_BATCH_RELOC(lvl->faces[j].offset, t->mt->bo, lvl->faces[j].offset,
+			RADEON_GEM_DOMAIN_VRAM, 0, 0);
+   }
+   END_BATCH();
+}
+
+static void tex_emit(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->cmd_size;
+   int i = atom->idx;
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+
+   if (t && t->mt && !t->image_override)
+     dwords += 2;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   OUT_BATCH_TABLE(atom->cmd, 3);
+   if (t && t->mt && !t->image_override) {
+     if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
+   	lvl = &t->mt->levels[0];
+	OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
+			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     } else {
+        OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+     }
+   } else if (!t) {
+     /* workaround for old CS mechanism */
+     OUT_BATCH(r100->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP]);
+     //     OUT_BATCH(r100->radeon.radeonScreen);
+   } else {
+     OUT_BATCH(t->override_offset);
    }
 
-   rmesa->state.color.clear = 0x00000000;
+   OUT_BATCH_TABLE((atom->cmd+4), 5);
+   END_BATCH();
+}
+
+static void tex_emit_cs(GLcontext *ctx, struct radeon_state_atom *atom)
+{
+   r100ContextPtr r100 = R100_CONTEXT(ctx);
+   BATCH_LOCALS(&r100->radeon);
+   uint32_t dwords = atom->cmd_size;
+   int i = atom->idx;
+   radeonTexObj *t = r100->state.texture.unit[i].texobj;
+   radeon_mipmap_level *lvl;
+   int hastexture = 1;
+
+   if (!t)
+	hastexture = 0;
+   else {
+	if (!t->mt && !t->bo)
+		hastexture = 0;
+   }
+   dwords += 1;
+   if (hastexture)
+     dwords += 2;
+   else
+     dwords -= 2;
+   BEGIN_BATCH_NO_AUTOSTATE(dwords);
+
+   OUT_BATCH(CP_PACKET0(RADEON_PP_TXFILTER_0 + (24 * i), 1));
+   OUT_BATCH_TABLE((atom->cmd + 1), 2);
+
+   if (hastexture) {
+     OUT_BATCH(CP_PACKET0(RADEON_PP_TXOFFSET_0 + (24 * i), 0));
+     if (t->mt && !t->image_override) {
+        if ((ctx->Texture.Unit[i]._ReallyEnabled & TEXTURE_CUBE_BIT)) {
+            lvl = &t->mt->levels[0];
+	    OUT_BATCH_RELOC(lvl->faces[5].offset, t->mt->bo, lvl->faces[5].offset,
+			RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+        } else {
+           OUT_BATCH_RELOC(t->tile_bits, t->mt->bo, 0,
+		     RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+        }
+      } else {
+	if (t->bo)
+            OUT_BATCH_RELOC(t->tile_bits, t->bo, 0,
+                            RADEON_GEM_DOMAIN_GTT|RADEON_GEM_DOMAIN_VRAM, 0, 0);
+      }
+   }
+
+   OUT_BATCH(CP_PACKET0(RADEON_PP_TXCBLEND_0 + (i * 24), 1));
+   OUT_BATCH_TABLE((atom->cmd+4), 2);
+   OUT_BATCH(CP_PACKET0(RADEON_PP_BORDER_COLOR_0 + (i * 4), 0));
+   OUT_BATCH((atom->cmd[TEX_PP_BORDER_COLOR]));
+   END_BATCH();
+}
+
+/* Initialize the context's hardware state.
+ */
+void radeonInitState( r100ContextPtr rmesa )
+{
+   GLcontext *ctx = rmesa->radeon.glCtx;
+   GLuint i;
+
+   rmesa->radeon.state.color.clear = 0x00000000;
 
    switch ( ctx->Visual.depthBits ) {
    case 16:
-      rmesa->state.depth.clear = 0x0000ffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffff;
-      depth_fmt = RADEON_DEPTH_FORMAT_16BIT_INT_Z;
-      rmesa->state.stencil.clear = 0x00000000;
+      rmesa->radeon.state.depth.clear = 0x0000ffff;
+      rmesa->radeon.state.stencil.clear = 0x00000000;
       break;
    case 24:
-      rmesa->state.depth.clear = 0x00ffffff;
-      rmesa->state.depth.scale = 1.0 / (GLfloat)0xffffff;
-      depth_fmt = RADEON_DEPTH_FORMAT_24BIT_INT_Z;
-      rmesa->state.stencil.clear = 0xffff0000;
+      rmesa->radeon.state.depth.clear = 0x00ffffff;
+      rmesa->radeon.state.stencil.clear = 0xffff0000;
       break;
    default:
-      fprintf( stderr, "Error: Unsupported depth %d... exiting\n",
-	       ctx->Visual.depthBits );
-      exit( -1 );
+      break;
    }
 
-   /* Only have hw stencil when depth buffer is 24 bits deep */
-   rmesa->state.stencil.hwBuffer = ( ctx->Visual.stencilBits > 0 &&
-				     ctx->Visual.depthBits == 24 );
+   rmesa->radeon.Fallback = 0;
 
-   rmesa->Fallback = 0;
 
-   if ( ctx->Visual.doubleBufferMode && rmesa->sarea->pfCurrentPage == 0 ) {
-      drawOffset = rmesa->radeonScreen->backOffset;
-      drawPitch  = rmesa->radeonScreen->backPitch;
-   } else {
-      drawOffset = rmesa->radeonScreen->frontOffset;
-      drawPitch  = rmesa->radeonScreen->frontPitch;
-   }
+   rmesa->radeon.hw.max_state_size = 0;
 
-   rmesa->hw.max_state_size = 0;
-
-#define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )				\
+#define ALLOC_STATE_IDX( ATOM, CHK, SZ, NM, FLAG, IDX )		\
    do {								\
       rmesa->hw.ATOM.cmd_size = SZ;				\
-      rmesa->hw.ATOM.cmd = (int *)CALLOC(SZ * sizeof(int));	\
-      rmesa->hw.ATOM.lastcmd = (int *)CALLOC(SZ * sizeof(int));	\
-      rmesa->hw.ATOM.name = NM;					\
+      rmesa->hw.ATOM.cmd = (GLuint *)CALLOC(SZ * sizeof(int));	\
+      rmesa->hw.ATOM.lastcmd = (GLuint *)CALLOC(SZ * sizeof(int)); \
+      rmesa->hw.ATOM.name = NM;						\
       rmesa->hw.ATOM.is_tcl = FLAG;					\
       rmesa->hw.ATOM.check = check_##CHK;				\
-      rmesa->hw.ATOM.dirty = GL_TRUE;				\
-      rmesa->hw.max_state_size += SZ * sizeof(int);		\
+      rmesa->hw.ATOM.dirty = GL_TRUE;					\
+      rmesa->hw.ATOM.idx = IDX;					\
+      rmesa->radeon.hw.max_state_size += SZ * sizeof(int);		\
    } while (0)
-      
-      
+
+#define ALLOC_STATE( ATOM, CHK, SZ, NM, FLAG )		\
+   ALLOC_STATE_IDX(ATOM, CHK, SZ, NM, FLAG, 0)
+
    /* Allocate state buffers:
     */
    ALLOC_STATE( ctx, always, CTX_STATE_SIZE, "CTX/context", 0 );
+   if (rmesa->radeon.radeonScreen->kernel_mm)
+     rmesa->hw.ctx.emit = ctx_emit_cs;
+   else
+     rmesa->hw.ctx.emit = ctx_emit;
    ALLOC_STATE( lin, always, LIN_STATE_SIZE, "LIN/line", 0 );
    ALLOC_STATE( msk, always, MSK_STATE_SIZE, "MSK/mask", 0 );
    ALLOC_STATE( vpt, always, VPT_STATE_SIZE, "VPT/viewport", 0 );
@@ -233,20 +682,32 @@ void radeonInitState( radeonContextPtr rmesa )
    ALLOC_STATE( fog, fog, FOG_STATE_SIZE, "FOG/fog", 1 );
    ALLOC_STATE( glt, tcl_lighting, GLT_STATE_SIZE, "GLT/light-global", 1 );
    ALLOC_STATE( eye, tcl_lighting, EYE_STATE_SIZE, "EYE/eye-vector", 1 );
-   ALLOC_STATE( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0 );
-   ALLOC_STATE( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0 );
-   ALLOC_STATE( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0 );
-   if (rmesa->radeonScreen->drmSupportsCubeMapsR100)
+   ALLOC_STATE_IDX( tex[0], tex0, TEX_STATE_SIZE, "TEX/tex-0", 0, 0);
+   ALLOC_STATE_IDX( tex[1], tex1, TEX_STATE_SIZE, "TEX/tex-1", 0, 1);
+   ALLOC_STATE_IDX( tex[2], tex2, TEX_STATE_SIZE, "TEX/tex-2", 0, 2);
+
+   for (i = 0; i < 3; i++) {
+      if (rmesa->radeon.radeonScreen->kernel_mm)
+          rmesa->hw.tex[i].emit = tex_emit_cs;
+      else
+          rmesa->hw.tex[i].emit = tex_emit;
+   }
+   if (rmesa->radeon.radeonScreen->drmSupportsCubeMapsR100)
    {
-      ALLOC_STATE( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
-      ALLOC_STATE( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
-      ALLOC_STATE( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
+      ALLOC_STATE_IDX( cube[0], cube0, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
+      ALLOC_STATE_IDX( cube[1], cube1, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
+      ALLOC_STATE_IDX( cube[2], cube2, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
+      for (i = 0; i < 3; i++)
+          if (rmesa->radeon.radeonScreen->kernel_mm)
+              rmesa->hw.cube[i].emit = cube_emit_cs;
+          else
+              rmesa->hw.cube[i].emit = cube_emit;
    }
    else
    {
-      ALLOC_STATE( cube[0], never, CUBE_STATE_SIZE, "CUBE/cube-0", 0 );
-      ALLOC_STATE( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0 );
-      ALLOC_STATE( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0 );
+      ALLOC_STATE_IDX( cube[0], never, CUBE_STATE_SIZE, "CUBE/cube-0", 0, 0 );
+      ALLOC_STATE_IDX( cube[1], never, CUBE_STATE_SIZE, "CUBE/cube-1", 0, 1 );
+      ALLOC_STATE_IDX( cube[2], never, CUBE_STATE_SIZE, "CUBE/cube-2", 0, 2 );
    }
    ALLOC_STATE( mat[0], tcl, MAT_STATE_SIZE, "MAT/modelproject", 1 );
    ALLOC_STATE( mat[1], tcl_eyespace_or_fog, MAT_STATE_SIZE, "MAT/modelview", 1 );
@@ -268,43 +729,43 @@ void radeonInitState( radeonContextPtr rmesa )
    ALLOC_STATE( lit[5], tcl_lit5, LIT_STATE_SIZE, "LIT/light-5", 1 );
    ALLOC_STATE( lit[6], tcl_lit6, LIT_STATE_SIZE, "LIT/light-6", 1 );
    ALLOC_STATE( lit[7], tcl_lit7, LIT_STATE_SIZE, "LIT/light-7", 1 );
-   ALLOC_STATE( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0 );
-   ALLOC_STATE( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0 );
-   ALLOC_STATE( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0 );
+   ALLOC_STATE_IDX( txr[0], txr0, TXR_STATE_SIZE, "TXR/txr-0", 0, 0 );
+   ALLOC_STATE_IDX( txr[1], txr1, TXR_STATE_SIZE, "TXR/txr-1", 0, 1 );
+   ALLOC_STATE_IDX( txr[2], txr2, TXR_STATE_SIZE, "TXR/txr-2", 0, 2 );
 
    radeonSetUpAtomList( rmesa );
 
    /* Fill in the packet headers:
     */
-   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(RADEON_EMIT_PP_MISC);
-   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(RADEON_EMIT_PP_CNTL);
-   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(RADEON_EMIT_RB3D_COLORPITCH);
-   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(RADEON_EMIT_RE_LINE_PATTERN);
-   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(RADEON_EMIT_SE_LINE_WIDTH);
-   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(RADEON_EMIT_RB3D_STENCILREFMASK);
-   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(RADEON_EMIT_SE_VPORT_XSCALE);
-   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(RADEON_EMIT_SE_CNTL);
-   rmesa->hw.set.cmd[SET_CMD_1] = cmdpkt(RADEON_EMIT_SE_CNTL_STATUS);
-   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(RADEON_EMIT_RE_MISC);
-   rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_0);
-   rmesa->hw.tex[0].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_0);
-   rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_1);
-   rmesa->hw.tex[1].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_1);
-   rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(RADEON_EMIT_PP_TXFILTER_2);
-   rmesa->hw.tex[2].cmd[TEX_CMD_1] = cmdpkt(RADEON_EMIT_PP_BORDER_COLOR_2);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_0);
-   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T0);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_1);
-   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T1);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(RADEON_EMIT_PP_CUBIC_FACES_2);
-   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(RADEON_EMIT_PP_CUBIC_OFFSETS_T2);
-   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(RADEON_EMIT_SE_ZBIAS_FACTOR);
-   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT);
+   rmesa->hw.ctx.cmd[CTX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_MISC);
+   rmesa->hw.ctx.cmd[CTX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CNTL);
+   rmesa->hw.ctx.cmd[CTX_CMD_2] = cmdpkt(rmesa, RADEON_EMIT_RB3D_COLORPITCH);
+   rmesa->hw.lin.cmd[LIN_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_LINE_PATTERN);
+   rmesa->hw.lin.cmd[LIN_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_LINE_WIDTH);
+   rmesa->hw.msk.cmd[MSK_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RB3D_STENCILREFMASK);
+   rmesa->hw.vpt.cmd[VPT_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_VPORT_XSCALE);
+   rmesa->hw.set.cmd[SET_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL);
+   rmesa->hw.set.cmd[SET_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_SE_CNTL_STATUS);
+   rmesa->hw.msc.cmd[MSC_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_RE_MISC);
+   rmesa->hw.tex[0].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_0);
+   rmesa->hw.tex[0].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_0);
+   rmesa->hw.tex[1].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_1);
+   rmesa->hw.tex[1].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_1);
+   rmesa->hw.tex[2].cmd[TEX_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TXFILTER_2);
+   rmesa->hw.tex[2].cmd[TEX_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_BORDER_COLOR_2);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_0);
+   rmesa->hw.cube[0].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T0);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_1);
+   rmesa->hw.cube[1].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T1);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_FACES_2);
+   rmesa->hw.cube[2].cmd[CUBE_CMD_1] = cmdpkt(rmesa, RADEON_EMIT_PP_CUBIC_OFFSETS_T2);
+   rmesa->hw.zbs.cmd[ZBS_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_ZBIAS_FACTOR);
+   rmesa->hw.tcl.cmd[TCL_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_SE_TCL_OUTPUT_VTX_FMT);
    rmesa->hw.mtl.cmd[MTL_CMD_0] = 
-      cmdpkt(RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED);
-   rmesa->hw.txr[0].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_0);
-   rmesa->hw.txr[1].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_1);
-   rmesa->hw.txr[2].cmd[TXR_CMD_0] = cmdpkt(RADEON_EMIT_PP_TEX_SIZE_2);
+      cmdpkt(rmesa, RADEON_EMIT_SE_TCL_MATERIAL_EMMISSIVE_RED);
+   rmesa->hw.txr[0].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_0);
+   rmesa->hw.txr[1].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_1);
+   rmesa->hw.txr[2].cmd[TXR_CMD_0] = cmdpkt(rmesa, RADEON_EMIT_PP_TEX_SIZE_2);
    rmesa->hw.grd.cmd[GRD_CMD_0] = 
       cmdscl( RADEON_SS_VERT_GUARD_CLIP_ADJ_ADDR, 1, 4 );
    rmesa->hw.fog.cmd[FOG_CMD_0] = 
@@ -331,6 +792,22 @@ void radeonInitState( radeonContextPtr rmesa )
 	 cmdvec( RADEON_VS_UCP_ADDR + i, 1, 4 );
    }
 
+   if (rmesa->radeon.radeonScreen->kernel_mm) {
+      rmesa->hw.grd.emit = scl_emit;
+      rmesa->hw.fog.emit = vec_emit;
+      rmesa->hw.glt.emit = vec_emit;
+      rmesa->hw.eye.emit = vec_emit;
+      
+      for (i = 0; i <= 6; i++)
+	 rmesa->hw.mat[i].emit = vec_emit;
+
+      for (i = 0; i < 8; i++)
+	 rmesa->hw.lit[i].emit = lit_emit;
+
+      for (i = 0; i < 6; i++)
+	 rmesa->hw.ucp[i].emit = vec_emit;
+   }
+
    rmesa->last_ReallyEnabled = -1;
 
    /* Initial Harware state:
@@ -352,19 +829,7 @@ void radeonInitState( radeonContextPtr rmesa )
 					    RADEON_SRC_BLEND_GL_ONE |
 					    RADEON_DST_BLEND_GL_ZERO );
 
-   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHOFFSET] =
-      rmesa->radeonScreen->depthOffset + rmesa->radeonScreen->fbLocation;
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] = 
-      ((rmesa->radeonScreen->depthPitch &
-	RADEON_DEPTHPITCH_MASK) |
-       RADEON_DEPTH_ENDIAN_NO_SWAP);
-       
-   if (rmesa->using_hyperz)
-       rmesa->hw.ctx.cmd[CTX_RB3D_DEPTHPITCH] |= RADEON_DEPTH_HYPERZ;
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (depth_fmt |
-					       RADEON_Z_TEST_LESS |
+   rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] = (RADEON_Z_TEST_LESS |
 					       RADEON_STENCIL_TEST_ALWAYS |
 					       RADEON_STENCIL_FAIL_KEEP |
 					       RADEON_STENCIL_ZPASS_KEEP |
@@ -374,7 +839,7 @@ void radeonInitState( radeonContextPtr rmesa )
    if (rmesa->using_hyperz) {
        rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_COMPRESSION_ENABLE |
 						   RADEON_Z_DECOMPRESSION_ENABLE;
-      if (rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
+      if (rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL) {
 	 /* works for q3, but slight rendering errors with glxgears ? */
 /*	 rmesa->hw.ctx.cmd[CTX_RB3D_ZSTENCILCNTL] |= RADEON_Z_HIERARCHY_ENABLE;*/
 	 /* need this otherwise get lots of lockups with q3 ??? */
@@ -386,10 +851,9 @@ void radeonInitState( radeonContextPtr rmesa )
 				     RADEON_ANTI_ALIAS_NONE);
 
    rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] = (RADEON_PLANE_MASK_ENABLE |
-				       color_fmt |
 				       RADEON_ZBLOCK16);
 
-   switch ( driQueryOptioni( &rmesa->optionCache, "dither_mode" ) ) {
+   switch ( driQueryOptioni( &rmesa->radeon.optionCache, "dither_mode" ) ) {
    case DRI_CONF_DITHER_XERRORDIFFRESET:
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_INIT;
       break;
@@ -397,31 +861,18 @@ void radeonInitState( radeonContextPtr rmesa )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_SCALE_DITHER_ENABLE;
       break;
    }
-   if ( driQueryOptioni( &rmesa->optionCache, "round_mode" ) ==
+   if ( driQueryOptioni( &rmesa->radeon.optionCache, "round_mode" ) ==
 	DRI_CONF_ROUND_ROUND )
-      rmesa->state.color.roundEnable = RADEON_ROUND_ENABLE;
+      rmesa->radeon.state.color.roundEnable = RADEON_ROUND_ENABLE;
    else
-      rmesa->state.color.roundEnable = 0;
-   if ( driQueryOptioni (&rmesa->optionCache, "color_reduction" ) ==
+      rmesa->radeon.state.color.roundEnable = 0;
+   if ( driQueryOptioni (&rmesa->radeon.optionCache, "color_reduction" ) ==
 	DRI_CONF_COLOR_REDUCTION_DITHER )
       rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= RADEON_DITHER_ENABLE;
    else
-      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->state.color.roundEnable;
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLOROFFSET] = ((drawOffset +
-					       rmesa->radeonScreen->fbLocation)
-					      & RADEON_COLOROFFSET_MASK);
-
-   rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] = ((drawPitch &
-					      RADEON_COLORPITCH_MASK) |
-					     RADEON_COLOR_ENDIAN_NO_SWAP);
+      rmesa->hw.ctx.cmd[CTX_RB3D_CNTL] |= rmesa->radeon.state.color.roundEnable;
 
 
-   /* (fixed size) sarea is initialized to zero afaics so can omit version check. Phew! */
-   if (rmesa->sarea->tiling_enabled) {
-      rmesa->hw.ctx.cmd[CTX_RB3D_COLORPITCH] |= RADEON_COLOR_TILE_ENABLE;
-   }
-
    rmesa->hw.set.cmd[SET_SE_CNTL] = (RADEON_FFACE_CULL_CCW |
 				     RADEON_BFACE_SOLID |
 				     RADEON_FFACE_SOLID |
@@ -444,7 +895,7 @@ void radeonInitState( radeonContextPtr rmesa )
   					    RADEON_VC_NO_SWAP;
 #endif
 
-   if (!(rmesa->radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
+   if (!(rmesa->radeon.radeonScreen->chip_flags & RADEON_CHIPSET_TCL)) {
      rmesa->hw.set.cmd[SET_SE_CNTL_STATUS] |= RADEON_TCL_BYPASS;
    }
 
@@ -491,8 +942,8 @@ void radeonInitState( radeonContextPtr rmesa )
 	   (2 << RADEON_TXFORMAT_HEIGHT_SHIFT));
 
       /* Initialize the texture offset to the start of the card texture heap */
-      rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+      //      rmesa->hw.tex[i].cmd[TEX_PP_TXOFFSET] =
+      //	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
 
       rmesa->hw.tex[i].cmd[TEX_PP_BORDER_COLOR] = 0;
       rmesa->hw.tex[i].cmd[TEX_PP_TXCBLEND] =  
@@ -513,15 +964,15 @@ void radeonInitState( radeonContextPtr rmesa )
 
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_FACES] = 0;
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_0] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_1] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_2] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_3] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
       rmesa->hw.cube[i].cmd[CUBE_PP_CUBIC_OFFSET_4] =
-	  rmesa->radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
+	  rmesa->radeon.radeonScreen->texOffset[RADEON_LOCAL_TEX_HEAP];
    }
 
    /* Can only add ST1 at the time of doing some multitex but can keep
@@ -613,5 +1064,7 @@ void radeonInitState( radeonContextPtr rmesa )
    rmesa->hw.eye.cmd[EYE_Z] = IEEE_ONE;
    rmesa->hw.eye.cmd[EYE_RESCALE_FACTOR] = IEEE_ONE;
    
-   rmesa->hw.all_dirty = GL_TRUE;
+   rmesa->radeon.hw.all_dirty = GL_TRUE;
+
+   rcommonInitCmdBuf(&rmesa->radeon);
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.c b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
index ebea1fecdca..e31f045991c 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.c
@@ -52,8 +52,6 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_tcl.h"
 
 
-static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
-
 /* R100: xyzw, c0, c1/fog, stq[0..2]  = 4+1+1+3*3 = 15  right? */
 /* R200: xyzw, c0, c1/fog, strq[0..5] = 4+1+1+4*6 = 30 */
 #define RADEON_MAX_TNL_VERTEX_SIZE (15 * sizeof(GLfloat))	/* for mesa _tnl stage */
@@ -64,18 +62,18 @@ static void flush_last_swtcl_prim( radeonContextPtr rmesa  );
 
 #define EMIT_ATTR( ATTR, STYLE, F0 )					\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = (ATTR);	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = (STYLE);	\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = (ATTR);	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = (STYLE);	\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
    fmt_0 |= F0;								\
 } while (0)
 
 #define EMIT_PAD( N )							\
 do {									\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].attrib = 0;		\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].format = EMIT_PAD;	\
-   rmesa->swtcl.vertex_attrs[rmesa->swtcl.vertex_attr_count].offset = (N);		\
-   rmesa->swtcl.vertex_attr_count++;					\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].attrib = 0;		\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].format = EMIT_PAD;	\
+   rmesa->radeon.swtcl.vertex_attrs[rmesa->radeon.swtcl.vertex_attr_count].offset = (N);		\
+   rmesa->radeon.swtcl.vertex_attr_count++;					\
 } while (0)
 
 static GLuint radeon_cp_vc_frmts[3][2] =
@@ -87,7 +85,7 @@ static GLuint radeon_cp_vc_frmts[3][2] =
 
 static void radeonSetVertexFormat( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
    DECLARE_RENDERINPUTS(index_bitset);
@@ -106,7 +104,7 @@ static void radeonSetVertexFormat( GLcontext *ctx )
    }
 
    assert( VB->AttribPtr[VERT_ATTRIB_POS] != NULL );
-   rmesa->swtcl.vertex_attr_count = 0;
+   rmesa->radeon.swtcl.vertex_attr_count = 0;
 
    /* EMIT_ATTR's must be in order as they tell t_vertex.c how to
     * build up a hardware vertex.
@@ -204,33 +202,33 @@ static void radeonSetVertexFormat( GLcontext *ctx )
       }
    }
 
-   if (!RENDERINPUTS_EQUAL( rmesa->tnl_index_bitset, index_bitset ) ||
+   if (!RENDERINPUTS_EQUAL( rmesa->radeon.tnl_index_bitset, index_bitset ) ||
 	fmt_0 != rmesa->swtcl.vertex_format) {
       RADEON_NEWPRIM(rmesa);
       rmesa->swtcl.vertex_format = fmt_0;
-      rmesa->swtcl.vertex_size =
+      rmesa->radeon.swtcl.vertex_size =
 	  _tnl_install_attrs( ctx,
-			      rmesa->swtcl.vertex_attrs, 
-			      rmesa->swtcl.vertex_attr_count,
+			      rmesa->radeon.swtcl.vertex_attrs, 
+			      rmesa->radeon.swtcl.vertex_attr_count,
 			      NULL, 0 );
-      rmesa->swtcl.vertex_size /= 4;
-      RENDERINPUTS_COPY( rmesa->tnl_index_bitset, index_bitset );
+      rmesa->radeon.swtcl.vertex_size /= 4;
+      RENDERINPUTS_COPY( rmesa->radeon.tnl_index_bitset, index_bitset );
       if (RADEON_DEBUG & DEBUG_VERTS)
 	 fprintf( stderr, "%s: vertex_size= %d floats\n",
-		  __FUNCTION__, rmesa->swtcl.vertex_size);
+		  __FUNCTION__, rmesa->radeon.swtcl.vertex_size);
    }
 }
 
 
 static void radeonRenderStart( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
 
    radeonSetVertexFormat( ctx );
    
-   if (rmesa->dma.flush != 0 && 
-       rmesa->dma.flush != flush_last_swtcl_prim)
-      rmesa->dma.flush( rmesa );
+   if (rmesa->radeon.dma.flush != 0 && 
+       rmesa->radeon.dma.flush != rcommon_flush_last_swtcl_prim)
+      rmesa->radeon.dma.flush( ctx );
 }
 
 
@@ -241,7 +239,7 @@ static void radeonRenderStart( GLcontext *ctx )
  */
 void radeonChooseVertexState( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    TNLcontext *tnl = TNL_CONTEXT(ctx);
 
    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
@@ -254,7 +252,7 @@ void radeonChooseVertexState( GLcontext *ctx )
     * rasterization fallback.  As this function will be called again when we
     * leave a rasterization fallback, we can just skip it for now.
     */
-   if (rmesa->Fallback != 0)
+   if (rmesa->radeon.Fallback != 0)
       return;
 
    /* HW perspective divide is a win, but tiny vertex formats are a
@@ -281,80 +279,29 @@ void radeonChooseVertexState( GLcontext *ctx )
    }
 }
 
-
-/* Flush vertices in the current dma region.
- */
-static void flush_last_swtcl_prim( radeonContextPtr rmesa  )
+void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset)
 {
-   if (RADEON_DEBUG & DEBUG_IOCTL)
-      fprintf(stderr, "%s\n", __FUNCTION__);
-
-   rmesa->dma.flush = NULL;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   if (rmesa->dma.current.buf) {
-      struct radeon_dma_region *current = &rmesa->dma.current;
-      GLuint current_offset = (rmesa->radeonScreen->gart_buffer_offset +
-			       current->buf->buf->idx * RADEON_BUFFER_SIZE + 
-			       current->start);
+   rcommonEnsureCmdBufSpace(&rmesa->radeon,
+			    rmesa->radeon.hw.max_state_size + (12*sizeof(int)),
+			    __FUNCTION__);
 
-      assert (!(rmesa->swtcl.hw_primitive & RADEON_CP_VC_CNTL_PRIM_WALK_IND));
 
-      assert (current->start + 
-	      rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	      current->ptr);
+   radeonEmitState(&rmesa->radeon);
+   radeonEmitVertexAOS( rmesa,
+			rmesa->radeon.swtcl.vertex_size,
+			rmesa->radeon.dma.current,
+			current_offset);
 
-      if (rmesa->dma.current.start != rmesa->dma.current.ptr) {
-	 radeonEnsureCmdBufSpace( rmesa, VERT_AOS_BUFSZ +
-			          rmesa->hw.max_state_size + VBUF_BUFSZ );
-
-	 radeonEmitVertexAOS( rmesa,
-			      rmesa->swtcl.vertex_size,
-			      current_offset);
-
-	 radeonEmitVbufPrim( rmesa,
-			     rmesa->swtcl.vertex_format,
-			     rmesa->swtcl.hw_primitive,
-			     rmesa->swtcl.numverts);
-      }
+		      
+   radeonEmitVbufPrim( rmesa,
+		       rmesa->swtcl.vertex_format,
+		       rmesa->radeon.swtcl.hw_primitive,
+		       rmesa->radeon.swtcl.numverts);
 
-      rmesa->swtcl.numverts = 0;
-      current->start = current->ptr;
-   }
 }
 
-
-/* Alloc space in the current dma region.
- */
-static INLINE void *
-radeonAllocDmaLowVerts( radeonContextPtr rmesa, int nverts, int vsize )
-{
-   GLuint bytes = vsize * nverts;
-
-   if ( rmesa->dma.current.ptr + bytes > rmesa->dma.current.end ) 
-      radeonRefillCurrentDmaRegion( rmesa );
-
-   if (!rmesa->dma.flush) {
-      rmesa->glCtx->Driver.NeedFlush |= FLUSH_STORED_VERTICES;
-      rmesa->dma.flush = flush_last_swtcl_prim;
-   }
-
-   assert( vsize == rmesa->swtcl.vertex_size * 4 );
-   assert( rmesa->dma.flush == flush_last_swtcl_prim );
-   assert (rmesa->dma.current.start + 
-	   rmesa->swtcl.numverts * rmesa->swtcl.vertex_size * 4 ==
-	   rmesa->dma.current.ptr);
-
-
-   {
-      GLubyte *head = (GLubyte *)(rmesa->dma.current.address + rmesa->dma.current.ptr);
-      rmesa->dma.current.ptr += bytes;
-      rmesa->swtcl.numverts += nverts;
-      return head;
-   }
-
-}
-
-
 /*
  * Render unclipped vertex buffers by emitting vertices directly to
  * dma buffers.  Use strip/fan hardware primitives where possible.
@@ -387,22 +334,22 @@ static const GLuint hw_prim[GL_POLYGON+1] = {
 };
 
 static INLINE void
-radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
+radeonDmaPrimitive( r100ContextPtr rmesa, GLenum prim )
 {
    RADEON_NEWPRIM( rmesa );
-   rmesa->swtcl.hw_primitive = hw_prim[prim];
-   assert(rmesa->dma.current.ptr == rmesa->dma.current.start);
+   rmesa->radeon.swtcl.hw_primitive = hw_prim[prim];
+   //   assert(rmesa->radeon.dma.current.ptr == rmesa->radeon.dma.current.start);
 }
 
-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
+#define LOCAL_VARS r100ContextPtr rmesa = R100_CONTEXT(ctx)
 #define INIT( prim ) radeonDmaPrimitive( rmesa, prim )
 #define FLUSH()  RADEON_NEWPRIM( rmesa )
-#define GET_CURRENT_VB_MAX_VERTS() \
-  (((int)rmesa->dma.current.end - (int)rmesa->dma.current.ptr) / (rmesa->swtcl.vertex_size*4))
+#define GET_CURRENT_VB_MAX_VERTS()					10\
+//  (((int)rmesa->radeon.dma.current.end - (int)rmesa->radeon.dma.current.ptr) / (rmesa->radeon.swtcl.vertex_size*4))
 #define GET_SUBSEQUENT_VB_MAX_VERTS() \
-  ((RADEON_BUFFER_SIZE) / (rmesa->swtcl.vertex_size*4))
+  ((RADEON_BUFFER_SIZE) / (rmesa->radeon.swtcl.vertex_size*4))
 #define ALLOC_VERTS( nr ) \
-  radeonAllocDmaLowVerts( rmesa, nr, rmesa->swtcl.vertex_size * 4 )
+  rcommonAllocDmaLowVerts( &rmesa->radeon, nr, rmesa->radeon.swtcl.vertex_size * 4 )
 #define EMIT_VERTS( ctx, j, nr, buf ) \
   _tnl_emit_vertices_to_buffer(ctx, j, (j)+(nr), buf)
 
@@ -418,16 +365,13 @@ radeonDmaPrimitive( radeonContextPtr rmesa, GLenum prim )
 static GLboolean radeon_run_render( GLcontext *ctx,
 				    struct tnl_pipeline_stage *stage )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
    tnl_render_func *tab = TAG(render_tab_verts);
    GLuint i;
 
-   if (rmesa->swtcl.indexed_verts.buf) 
-      RELEASE_ELT_VERTS();
-   	
-   if (rmesa->swtcl.RenderIndex != 0 ||   
+   if (rmesa->radeon.swtcl.RenderIndex != 0 ||   
        !radeon_dma_validate_render( ctx, VB ))
       return GL_TRUE;		
 
@@ -496,13 +440,13 @@ static void radeonResetLineStipple( GLcontext *ctx );
 
 #undef LOCAL_VARS
 #undef ALLOC_VERTS
-#define CTX_ARG radeonContextPtr rmesa
-#define GET_VERTEX_DWORDS() rmesa->swtcl.vertex_size
-#define ALLOC_VERTS( n, size ) radeonAllocDmaLowVerts( rmesa, n, (size) * 4 )
+#define CTX_ARG r100ContextPtr rmesa
+#define GET_VERTEX_DWORDS() rmesa->radeon.swtcl.vertex_size
+#define ALLOC_VERTS( n, size ) rcommonAllocDmaLowVerts( &rmesa->radeon, n, (size) * 4 )
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
-   const char *radeonverts = (char *)rmesa->swtcl.verts;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);		\
+   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;
 #define VERT(x) (radeonVertex *)(radeonverts + ((x) * (vertsize) * sizeof(int)))
 #define VERTEX radeonVertex 
 #undef TAG
@@ -560,7 +504,7 @@ static struct {
 #define VERT_Y(_v) _v->v.y
 #define VERT_Z(_v) _v->v.z
 #define AREA_IS_CCW( a ) (a < 0)
-#define GET_VERTEX(e) (rmesa->swtcl.verts + ((e) * rmesa->swtcl.vertex_size * sizeof(int)))
+#define GET_VERTEX(e) (rmesa->radeon.swtcl.verts + ((e) * rmesa->radeon.swtcl.vertex_size * sizeof(int)))
 
 #define VERT_SET_RGBA( v, c )  					\
 do {								\
@@ -606,7 +550,7 @@ do {							\
 #undef INIT
 
 #define LOCAL_VARS(n)							\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);			\
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);			\
    GLuint color[n], spec[n];						\
    GLuint coloroffset = rmesa->swtcl.coloroffset;	\
    GLuint specoffset = rmesa->swtcl.specoffset;			\
@@ -617,7 +561,7 @@ do {							\
  ***********************************************************************/
 
 #define RASTERIZE(x) radeonRasterPrimitive( ctx, reduced_hw_prim[x] )
-#define RENDER_PRIMITIVE rmesa->swtcl.render_primitive
+#define RENDER_PRIMITIVE rmesa->radeon.swtcl.render_primitive
 #undef TAG
 #define TAG(x) x
 #include "tnl_dd/t_dd_unfilled.h"
@@ -673,9 +617,9 @@ static void init_rast_tab( void )
 } while (0)
 #undef LOCAL_VARS
 #define LOCAL_VARS						\
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);		\
-   const GLuint vertsize = rmesa->swtcl.vertex_size;		\
-   const char *radeonverts = (char *)rmesa->swtcl.verts;		\
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);		\
+   const GLuint vertsize = rmesa->radeon.swtcl.vertex_size;		\
+   const char *radeonverts = (char *)rmesa->radeon.swtcl.verts;		\
    const GLuint * const elt = TNL_CONTEXT(ctx)->vb.Elts;	\
    const GLboolean stipple = ctx->Line.StippleFlag;		\
    (void) elt; (void) stipple;
@@ -700,17 +644,17 @@ static void init_rast_tab( void )
 void radeonChooseRenderState( GLcontext *ctx )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint index = 0;
    GLuint flags = ctx->_TriangleCaps;
 
-   if (!rmesa->TclFallback || rmesa->Fallback) 
+   if (!rmesa->radeon.TclFallback || rmesa->radeon.Fallback) 
       return;
 
    if (flags & DD_TRI_LIGHT_TWOSIDE) index |= RADEON_TWOSIDE_BIT;
    if (flags & DD_TRI_UNFILLED)      index |= RADEON_UNFILLED_BIT;
 
-   if (index != rmesa->swtcl.RenderIndex) {
+   if (index != rmesa->radeon.swtcl.RenderIndex) {
       tnl->Driver.Render.Points = rast_tab[index].points;
       tnl->Driver.Render.Line = rast_tab[index].line;
       tnl->Driver.Render.ClippedLine = rast_tab[index].line;
@@ -727,7 +671,7 @@ void radeonChooseRenderState( GLcontext *ctx )
 	 tnl->Driver.Render.ClippedPolygon = _tnl_RenderClippedPolygon;
       }
 
-      rmesa->swtcl.RenderIndex = index;
+      rmesa->radeon.swtcl.RenderIndex = index;
    }
 }
 
@@ -739,18 +683,18 @@ void radeonChooseRenderState( GLcontext *ctx )
 
 static void radeonRasterPrimitive( GLcontext *ctx, GLuint hwprim )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   if (rmesa->swtcl.hw_primitive != hwprim) {
+   if (rmesa->radeon.swtcl.hw_primitive != hwprim) {
       RADEON_NEWPRIM( rmesa );
-      rmesa->swtcl.hw_primitive = hwprim;
+      rmesa->radeon.swtcl.hw_primitive = hwprim;
    }
 }
 
 static void radeonRenderPrimitive( GLcontext *ctx, GLenum prim )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   rmesa->swtcl.render_primitive = prim;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   rmesa->radeon.swtcl.render_primitive = prim;
    if (prim < GL_TRIANGLES || !(ctx->_TriangleCaps & DD_TRI_UNFILLED)) 
       radeonRasterPrimitive( ctx, reduced_hw_prim[prim] );
 }
@@ -761,7 +705,7 @@ static void radeonRenderFinish( GLcontext *ctx )
 
 static void radeonResetLineStipple( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    RADEON_STATECHANGE( rmesa, lin );
 }
 
@@ -795,17 +739,17 @@ static const char *getFallbackString(GLuint bit)
 
 void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->Fallback;
+   GLuint oldfallback = rmesa->radeon.Fallback;
 
    if (mode) {
-      rmesa->Fallback |= bit;
+      rmesa->radeon.Fallback |= bit;
       if (oldfallback == 0) {
-	 RADEON_FIREVERTICES( rmesa );
+	 radeon_firevertices(&rmesa->radeon);
 	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_TRUE );
 	 _swsetup_Wakeup( ctx );
-	 rmesa->swtcl.RenderIndex = ~0;
+	 rmesa->radeon.swtcl.RenderIndex = ~0;
          if (RADEON_DEBUG & DEBUG_FALLBACKS) {
             fprintf(stderr, "Radeon begin rasterization fallback: 0x%x %s\n",
                     bit, getFallbackString(bit));
@@ -813,7 +757,7 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
       }
    }
    else {
-      rmesa->Fallback &= ~bit;
+      rmesa->radeon.Fallback &= ~bit;
       if (oldfallback == bit) {
 	 _swrast_flush( ctx );
 	 tnl->Driver.Render.Start = radeonRenderStart;
@@ -826,14 +770,14 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 
 	 tnl->Driver.Render.ResetLineStipple = radeonResetLineStipple;
 	 TCL_FALLBACK( ctx, RADEON_TCL_FALLBACK_RASTER, GL_FALSE );
-	 if (rmesa->TclFallback) {
-	    /* These are already done if rmesa->TclFallback goes to
+	 if (rmesa->radeon.TclFallback) {
+	    /* These are already done if rmesa->radeon.TclFallback goes to
 	     * zero above. But not if it doesn't (RADEON_NO_TCL for
 	     * example?)
 	     */
 	    _tnl_invalidate_vertex_state( ctx, ~0 );
 	    _tnl_invalidate_vertices( ctx, ~0 );
-	    RENDERINPUTS_ZERO( rmesa->tnl_index_bitset );
+	    RENDERINPUTS_ZERO( rmesa->radeon.tnl_index_bitset );
 	    radeonChooseVertexState( ctx );
 	    radeonChooseRenderState( ctx );
 	 }
@@ -853,7 +797,7 @@ void radeonFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 void radeonInitSwtcl( GLcontext *ctx )
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    static int firsttime = 1;
 
    if (firsttime) {
@@ -872,18 +816,9 @@ void radeonInitSwtcl( GLcontext *ctx )
    _tnl_init_vertices( ctx, ctx->Const.MaxArrayLockSize + 12, 
 		       RADEON_MAX_TNL_VERTEX_SIZE);
    
-   rmesa->swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
-   rmesa->swtcl.RenderIndex = ~0;
-   rmesa->swtcl.render_primitive = GL_TRIANGLES;
-   rmesa->swtcl.hw_primitive = 0;
+   rmesa->radeon.swtcl.verts = (GLubyte *)tnl->clipspace.vertex_buf;
+   rmesa->radeon.swtcl.RenderIndex = ~0;
+   rmesa->radeon.swtcl.render_primitive = GL_TRIANGLES;
+   rmesa->radeon.swtcl.hw_primitive = 0;
 }
 
-
-void radeonDestroySwtcl( GLcontext *ctx )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (rmesa->swtcl.indexed_verts.buf) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
-			      __FUNCTION__ );
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_swtcl.h b/src/mesa/drivers/dri/radeon/radeon_swtcl.h
index e485052ad77..da89158eeb9 100644
--- a/src/mesa/drivers/dri/radeon/radeon_swtcl.h
+++ b/src/mesa/drivers/dri/radeon/radeon_swtcl.h
@@ -40,7 +40,6 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "radeon_context.h"
 
 extern void radeonInitSwtcl( GLcontext *ctx );
-extern void radeonDestroySwtcl( GLcontext *ctx );
 
 extern void radeonChooseRenderState( GLcontext *ctx );
 extern void radeonChooseVertexState( GLcontext *ctx );
@@ -63,5 +62,5 @@ extern void radeon_translate_vertex( GLcontext *ctx,
 
 extern void radeon_print_vertex( GLcontext *ctx, const radeonVertex *v );
 
-
+extern void r100_swtcl_flush(GLcontext *ctx, uint32_t current_offset);
 #endif
diff --git a/src/mesa/drivers/dri/radeon/radeon_tcl.c b/src/mesa/drivers/dri/radeon/radeon_tcl.c
index 779e9ae5df0..df6708f05e3 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tcl.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tcl.c
@@ -42,6 +42,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "tnl/tnl.h"
 #include "tnl/t_pipeline.h"
 
+#include "radeon_common.h"
 #include "radeon_context.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
@@ -104,7 +105,7 @@ static GLboolean discrete_prim[0x10] = {
 };
    
 
-#define LOCAL_VARS radeonContextPtr rmesa = RADEON_CONTEXT(ctx)
+#define LOCAL_VARS r100ContextPtr rmesa = R100_CONTEXT(ctx)
 #define ELT_TYPE  GLushort
 
 #define ELT_INIT(prim, hw_prim) \
@@ -125,7 +126,7 @@ static GLboolean discrete_prim[0x10] = {
 
 #define RESET_STIPPLE() do {			\
    RADEON_STATECHANGE( rmesa, lin );		\
-   radeonEmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);			\
 } while (0)
 
 #define AUTO_STIPPLE( mode )  do {		\
@@ -136,31 +137,29 @@ static GLboolean discrete_prim[0x10] = {
    else						\
       rmesa->hw.lin.cmd[LIN_RE_LINE_PATTERN] &=	\
 	 ~RADEON_LINE_PATTERN_AUTO_RESET;	\
-   radeonEmitState( rmesa );			\
+   radeonEmitState(&rmesa->radeon);		\
 } while (0)
 
 
 
 #define ALLOC_ELTS(nr)	radeonAllocElts( rmesa, nr )
 
-static GLushort *radeonAllocElts( radeonContextPtr rmesa, GLuint nr ) 
+static GLushort *radeonAllocElts( r100ContextPtr rmesa, GLuint nr ) 
 {
-   if (rmesa->dma.flush)
-      rmesa->dma.flush( rmesa );
+      if (rmesa->radeon.dma.flush)
+	 rmesa->radeon.dma.flush( rmesa->radeon.glCtx );
 
-   radeonEnsureCmdBufSpace(rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			   rmesa->hw.max_state_size + ELTS_BUFSZ(nr));
+      rcommonEnsureCmdBufSpace(&rmesa->radeon, rmesa->radeon.hw.max_state_size + ELTS_BUFSZ(nr) + 
+			       AOS_BUFSZ(rmesa->radeon.tcl.aos_count), __FUNCTION__);
 
-   radeonEmitAOS( rmesa,
-		rmesa->tcl.aos_components,
-		rmesa->tcl.nr_aos_components, 0 );
+      radeonEmitAOS( rmesa,
+		     rmesa->radeon.tcl.aos_count, 0 );
 
-   return radeonAllocEltsOpenEnded( rmesa,
-				    rmesa->tcl.vertex_format, 
-				    rmesa->tcl.hw_primitive, nr );
+      return radeonAllocEltsOpenEnded( rmesa, rmesa->tcl.vertex_format,
+				       rmesa->tcl.hw_primitive, nr );
 }
 
-#define CLOSE_ELTS()  RADEON_NEWPRIM( rmesa )
+#define CLOSE_ELTS() if (0)  RADEON_NEWPRIM( rmesa )
 
 
 
@@ -174,15 +173,15 @@ static void radeonEmitPrim( GLcontext *ctx,
 		       GLuint start, 
 		       GLuint count)	
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT( ctx );
+   r100ContextPtr rmesa = R100_CONTEXT( ctx );
    radeonTclPrimitive( ctx, prim, hwprim );
    
-   radeonEnsureCmdBufSpace( rmesa, AOS_BUFSZ(rmesa->tcl.nr_aos_components) +
-			    rmesa->hw.max_state_size + VBUF_BUFSZ );
+   rcommonEnsureCmdBufSpace( &rmesa->radeon,
+			     AOS_BUFSZ(rmesa->radeon.tcl.aos_count) +
+			     rmesa->radeon.hw.max_state_size + VBUF_BUFSZ, __FUNCTION__ );
 
    radeonEmitAOS( rmesa,
-		  rmesa->tcl.aos_components,
-		  rmesa->tcl.nr_aos_components,
+		  rmesa->radeon.tcl.aos_count,
 		  start );
    
    /* Why couldn't this packet have taken an offset param?
@@ -254,7 +253,7 @@ void radeonTclPrimitive( GLcontext *ctx,
 			 GLenum prim,
 			 int hw_prim )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint se_cntl;
    GLuint newprim = hw_prim | RADEON_CP_VC_CNTL_TCL_ENABLE;
 
@@ -371,7 +370,7 @@ radeonComputeFogBlendFactor( GLcontext *ctx, GLfloat fogcoord )
 static GLboolean radeon_run_tcl_render( GLcontext *ctx,
 					struct tnl_pipeline_stage *stage )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    struct vertex_buffer *VB = &tnl->vb;
    GLuint inputs = VERT_BIT_POS | VERT_BIT_COLOR0;
@@ -379,7 +378,7 @@ static GLboolean radeon_run_tcl_render( GLcontext *ctx,
 
    /* TODO: separate this from the swtnl pipeline 
     */
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       return GL_TRUE;	/* fallback to software t&l */
 
    if (VB->Count == 0)
@@ -461,7 +460,7 @@ const struct tnl_pipeline_stage _radeon_tcl_stage =
 
 static void transition_to_swtnl( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    GLuint se_cntl;
 
@@ -490,7 +489,7 @@ static void transition_to_swtnl( GLcontext *ctx )
 
 static void transition_to_hwtnl( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
 
@@ -509,15 +508,15 @@ static void transition_to_hwtnl( GLcontext *ctx )
 
    tnl->Driver.NotifyMaterialChange = radeonUpdateMaterial;
 
-   if ( rmesa->dma.flush )			
-      rmesa->dma.flush( rmesa );	
+   if ( rmesa->radeon.dma.flush )			
+      rmesa->radeon.dma.flush( rmesa->radeon.glCtx );	
 
-   rmesa->dma.flush = NULL;
+   rmesa->radeon.dma.flush = NULL;
    rmesa->swtcl.vertex_format = 0;
    
-   if (rmesa->swtcl.indexed_verts.buf) 
-      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
-			      __FUNCTION__ );
+   //   if (rmesa->swtcl.indexed_verts.buf) 
+   //      radeonReleaseDmaRegion( rmesa, &rmesa->swtcl.indexed_verts, 
+   //			      __FUNCTION__ );
 
    if (RADEON_DEBUG & DEBUG_FALLBACKS) 
       fprintf(stderr, "Radeon end tcl fallback\n");
@@ -550,11 +549,11 @@ static char *getFallbackString(GLuint bit)
 
 void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   GLuint oldfallback = rmesa->TclFallback;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   GLuint oldfallback = rmesa->radeon.TclFallback;
 
    if (mode) {
-      rmesa->TclFallback |= bit;
+      rmesa->radeon.TclFallback |= bit;
       if (oldfallback == 0) {
 	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
 	    fprintf(stderr, "Radeon begin tcl fallback %s\n",
@@ -563,7 +562,7 @@ void radeonTclFallback( GLcontext *ctx, GLuint bit, GLboolean mode )
       }
    }
    else {
-      rmesa->TclFallback &= ~bit;
+      rmesa->radeon.TclFallback &= ~bit;
       if (oldfallback == bit) {
 	 if (RADEON_DEBUG & DEBUG_FALLBACKS) 
 	    fprintf(stderr, "Radeon end tcl fallback %s\n",
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.c b/src/mesa/drivers/dri/radeon/radeon_tex.c
index f2b6deb9c04..2549d5cb5cb 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.c
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.c
@@ -44,6 +44,7 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/texobj.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
 #include "radeon_swtcl.h"
@@ -170,10 +171,13 @@ static void radeonSetTexFilter( radeonTexObjPtr t, GLenum minf, GLenum magf )
 {
    GLuint anisotropy = (t->pp_txfilter & RADEON_MAX_ANISO_MASK);
 
+   /* Force revalidation to account for switches from/to mipmapping. */
+   t->validated = GL_FALSE;
+
    t->pp_txfilter &= ~(RADEON_MIN_FILTER_MASK | RADEON_MAG_FILTER_MASK);
 
    /* r100 chips can't handle mipmaps/aniso for cubemap/volume textures */
-   if ( t->base.tObj->Target == GL_TEXTURE_CUBE_MAP ) {
+   if ( t->base.Target == GL_TEXTURE_CUBE_MAP ) {
       switch ( minf ) {
       case GL_NEAREST:
       case GL_NEAREST_MIPMAP_NEAREST:
@@ -249,433 +253,13 @@ static void radeonSetTexBorderColor( radeonTexObjPtr t, const GLfloat color[4] )
    t->pp_border_color = radeonPackColor( 4, c[0], c[1], c[2], c[3] );
 }
 
-
-/**
- * Allocate space for and load the mesa images into the texture memory block.
- * This will happen before drawing with a new texture, or drawing with a
- * texture after it was swapped out or teximaged again.
- */
-
-static radeonTexObjPtr radeonAllocTexObj( struct gl_texture_object *texObj )
-{
-   radeonTexObjPtr t;
-
-   t = CALLOC_STRUCT( radeon_tex_obj );
-   texObj->DriverData = t;
-   if ( t != NULL ) {
-      if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
-	 fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)texObj, (void *)t );
-      }
-
-      /* Initialize non-image-dependent parts of the state:
-       */
-      t->base.tObj = texObj;
-      t->border_fallback = GL_FALSE;
-
-      t->pp_txfilter = RADEON_BORDER_MODE_OGL;
-      t->pp_txformat = (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
-			RADEON_TXFORMAT_PERSPECTIVE_ENABLE);
-
-      make_empty_list( & t->base );
-
-      radeonSetTexWrap( t, texObj->WrapS, texObj->WrapT );
-      radeonSetTexMaxAnisotropy( t, texObj->MaxAnisotropy );
-      radeonSetTexFilter( t, texObj->MinFilter, texObj->MagFilter );
-      radeonSetTexBorderColor( t, texObj->BorderColor );
-   }
-
-   return t;
-}
-
-
-static const struct gl_texture_format *
-radeonChooseTextureFormat( GLcontext *ctx, GLint internalFormat,
-                           GLenum format, GLenum type )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   const GLboolean do32bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32 );
-   const GLboolean force16bpt =
-       ( rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16 );
-   (void) format;
-
-   switch ( internalFormat ) {
-   case 4:
-   case GL_RGBA:
-   case GL_COMPRESSED_RGBA:
-      switch ( type ) {
-      case GL_UNSIGNED_INT_10_10_10_2:
-      case GL_UNSIGNED_INT_2_10_10_10_REV:
-	 return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      default:
-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_argb4444;
-      }
-
-   case 3:
-   case GL_RGB:
-   case GL_COMPRESSED_RGB:
-      switch ( type ) {
-      case GL_UNSIGNED_SHORT_4_4_4_4:
-      case GL_UNSIGNED_SHORT_4_4_4_4_REV:
-	 return _dri_texformat_argb4444;
-      case GL_UNSIGNED_SHORT_5_5_5_1:
-      case GL_UNSIGNED_SHORT_1_5_5_5_REV:
-	 return _dri_texformat_argb1555;
-      case GL_UNSIGNED_SHORT_5_6_5:
-      case GL_UNSIGNED_SHORT_5_6_5_REV:
-	 return _dri_texformat_rgb565;
-      default:
-         return do32bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-      }
-
-   case GL_RGBA8:
-   case GL_RGB10_A2:
-   case GL_RGBA12:
-   case GL_RGBA16:
-      return !force16bpt ?
-	  _dri_texformat_argb8888 : _dri_texformat_argb4444;
-
-   case GL_RGBA4:
-   case GL_RGBA2:
-      return _dri_texformat_argb4444;
-
-   case GL_RGB5_A1:
-      return _dri_texformat_argb1555;
-
-   case GL_RGB8:
-   case GL_RGB10:
-   case GL_RGB12:
-   case GL_RGB16:
-      return !force16bpt ? _dri_texformat_argb8888 : _dri_texformat_rgb565;
-
-   case GL_RGB5:
-   case GL_RGB4:
-   case GL_R3_G3_B2:
-      return _dri_texformat_rgb565;
-
-   case GL_ALPHA:
-   case GL_ALPHA4:
-   case GL_ALPHA8:
-   case GL_ALPHA12:
-   case GL_ALPHA16:
-   case GL_COMPRESSED_ALPHA:
-      return _dri_texformat_a8;
-
-   case 1:
-   case GL_LUMINANCE:
-   case GL_LUMINANCE4:
-   case GL_LUMINANCE8:
-   case GL_LUMINANCE12:
-   case GL_LUMINANCE16:
-   case GL_COMPRESSED_LUMINANCE:
-      return _dri_texformat_l8;
-
-   case 2:
-   case GL_LUMINANCE_ALPHA:
-   case GL_LUMINANCE4_ALPHA4:
-   case GL_LUMINANCE6_ALPHA2:
-   case GL_LUMINANCE8_ALPHA8:
-   case GL_LUMINANCE12_ALPHA4:
-   case GL_LUMINANCE12_ALPHA12:
-   case GL_LUMINANCE16_ALPHA16:
-   case GL_COMPRESSED_LUMINANCE_ALPHA:
-      return _dri_texformat_al88;
-
-   case GL_INTENSITY:
-   case GL_INTENSITY4:
-   case GL_INTENSITY8:
-   case GL_INTENSITY12:
-   case GL_INTENSITY16:
-   case GL_COMPRESSED_INTENSITY:
-      return _dri_texformat_i8;
-
-   case GL_YCBCR_MESA:
-      if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
-          type == GL_UNSIGNED_BYTE)
-         return &_mesa_texformat_ycbcr;
-      else
-         return &_mesa_texformat_ycbcr_rev;
-
-   case GL_RGB_S3TC:
-   case GL_RGB4_S3TC:
-   case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgb_dxt1;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
-      return &_mesa_texformat_rgba_dxt1;
-
-   case GL_RGBA_S3TC:
-   case GL_RGBA4_S3TC:
-   case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
-      return &_mesa_texformat_rgba_dxt3;
-
-   case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
-      return &_mesa_texformat_rgba_dxt5;
-
-   default:
-      _mesa_problem(ctx, "unexpected texture format in %s", __FUNCTION__);
-      return NULL;
-   }
-
-   return NULL; /* never get here */
-}
-
-
-static void radeonTexImage1D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage1D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_teximage1d(ctx, target, level, internalFormat,
-                          width, border, format, type, pixels,
-                          &ctx->Unpack, texObj, texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-
-
-static void radeonTexSubImage1D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset,
-                                 GLsizei width,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage1D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage1d(ctx, target, level, xoffset, width,
-			     format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[0] |= (1 << level);
-}
-
-
-static void radeonTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLenum format, GLenum type, const GLvoid *pixels,
-                              const struct gl_pixelstore_attrib *packing,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage2D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_teximage2d(ctx, target, level, internalFormat,
-                          width, height, border, format, type, pixels,
-                          &ctx->Unpack, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-static void radeonTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format, GLenum type,
-                                 const GLvoid *pixels,
-                                 const struct gl_pixelstore_attrib *packing,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-			     height, format, type, pixels, packing, texObj,
-			     texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-static void radeonCompressedTexImage2D( GLcontext *ctx, GLenum target, GLint level,
-                              GLint internalFormat,
-                              GLint width, GLint height, GLint border,
-                              GLsizei imageSize, const GLvoid *data,
-                              struct gl_texture_object *texObj,
-                              struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   if ( t != NULL ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexImage2D");
-         return;
-      }
-   }
-
-   /* Note, this will call ChooseTextureFormat */
-   _mesa_store_compressed_teximage2d(ctx, target, level, internalFormat, width,
-                                 height, border, imageSize, data, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
-
-static void radeonCompressedTexSubImage2D( GLcontext *ctx, GLenum target, GLint level,
-                                 GLint xoffset, GLint yoffset,
-                                 GLsizei width, GLsizei height,
-                                 GLenum format,
-                                 GLsizei imageSize, const GLvoid *data,
-                                 struct gl_texture_object *texObj,
-                                 struct gl_texture_image *texImage )
-{
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
-   GLuint face;
-
-
-   /* which cube face or ordinary 2D image */
-   switch (target) {
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
-   case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
-   case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
-      face = (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
-      ASSERT(face < 6);
-      break;
-   default:
-      face = 0;
-   }
-
-   assert( t ); /* this _should_ be true */
-   if ( t ) {
-      driSwapOutTextureObject( t );
-   }
-   else {
-      t = (driTextureObject *) radeonAllocTexObj( texObj );
-      if (!t) {
-         _mesa_error(ctx, GL_OUT_OF_MEMORY, "glCompressedTexSubImage2D");
-         return;
-      }
-   }
-
-   _mesa_store_compressed_texsubimage2d(ctx, target, level, xoffset, yoffset, width,
-                                 height, format, imageSize, data, texObj, texImage);
-
-   t->dirty_images[face] |= (1 << level);
-}
-
 #define SCALED_FLOAT_TO_BYTE( x, scale ) \
 		(((GLuint)((255.0F / scale) * (x))) / 2)
 
 static void radeonTexEnv( GLcontext *ctx, GLenum target,
 			  GLenum pname, const GLfloat *param )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint unit = ctx->Texture.CurrentUnit;
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
 
@@ -706,7 +290,7 @@ static void radeonTexEnv( GLcontext *ctx, GLenum target,
        * functions, one mapping [-1.0,0.0] to [-128,0] and one mapping
        * [0.0,4.0] to [0,127].
        */
-      min = driQueryOptionb (&rmesa->optionCache, "no_neg_lod_bias") ?
+      min = driQueryOptionb (&rmesa->radeon.optionCache, "no_neg_lod_bias") ?
 	  0.0 : -1.0;
       bias = CLAMP( *param, min, 4.0 );
       if ( bias == 0 ) {
@@ -739,7 +323,7 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
 				struct gl_texture_object *texObj,
 				GLenum pname, const GLfloat *params )
 {
-   radeonTexObjPtr t = (radeonTexObjPtr) texObj->DriverData;
+   radeonTexObj* t = radeon_tex_obj(texObj);
 
    if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
       fprintf( stderr, "%s( %s )\n", __FUNCTION__,
@@ -767,57 +351,51 @@ static void radeonTexParameter( GLcontext *ctx, GLenum target,
    case GL_TEXTURE_MAX_LEVEL:
    case GL_TEXTURE_MIN_LOD:
    case GL_TEXTURE_MAX_LOD:
+
       /* This isn't the most efficient solution but there doesn't appear to
        * be a nice alternative.  Since there's no LOD clamping,
        * we just have to rely on loading the right subset of mipmap levels
        * to simulate a clamped LOD.
        */
-      driSwapOutTextureObject( (driTextureObject *) t );
+      if (t->mt) {
+         radeon_miptree_unreference(t->mt);
+	 t->mt = 0;
+	 t->validated = GL_FALSE;
+      }
       break;
 
    default:
       return;
    }
-
-   /* Mark this texobj as dirty (one bit per tex unit)
-    */
-   t->dirty_state = TEX_ALL;
-}
-
-
-static void radeonBindTexture( GLcontext *ctx, GLenum target,
-			       struct gl_texture_object *texObj )
-{
-   if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
-      fprintf( stderr, "%s( %p ) unit=%d\n", __FUNCTION__, (void *)texObj,
-	       ctx->Texture.CurrentUnit );
-   }
-
-   assert( (target != GL_TEXTURE_1D && target != GL_TEXTURE_2D &&
-            target != GL_TEXTURE_RECTANGLE_NV && target != GL_TEXTURE_CUBE_MAP) ||
-           (texObj->DriverData != NULL) );
 }
 
-
 static void radeonDeleteTexture( GLcontext *ctx,
 				 struct gl_texture_object *texObj )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   driTextureObject * t = (driTextureObject *) texObj->DriverData;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   radeonTexObj* t = radeon_tex_obj(texObj);
+   int i;
 
    if ( RADEON_DEBUG & (DEBUG_STATE|DEBUG_TEXTURE) ) {
       fprintf( stderr, "%s( %p (target = %s) )\n", __FUNCTION__, (void *)texObj,
 	       _mesa_lookup_enum_by_nr( texObj->Target ) );
    }
 
-   if ( t != NULL ) {
-      if ( rmesa ) {
-         RADEON_FIREVERTICES( rmesa );
-      }
-
-      driDestroyTextureObject( t );
+   if ( rmesa ) {
+     radeon_firevertices(&rmesa->radeon);
+     for ( i = 0 ; i < rmesa->radeon.glCtx->Const.MaxTextureUnits ; i++ ) {
+       if ( t == rmesa->state.texture.unit[i].texobj ) {
+	 rmesa->state.texture.unit[i].texobj = NULL;
+	 rmesa->hw.tex[i].dirty = GL_FALSE;
+	 rmesa->hw.cube[i].dirty = GL_FALSE;
+       }
+     }
    }
 
+   if (t->mt) {
+      radeon_miptree_unreference(t->mt);
+      t->mt = 0;
+   }
    /* Free mipmap images and the texture object itself */
    _mesa_delete_texture_object(ctx, texObj);
 }
@@ -837,7 +415,7 @@ static void radeonTexGen( GLcontext *ctx,
 			  GLenum pname,
 			  const GLfloat *params )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLuint unit = ctx->Texture.CurrentUnit;
    rmesa->recheck_texgen[unit] = GL_TRUE;
 }
@@ -851,29 +429,40 @@ static void radeonTexGen( GLcontext *ctx,
 static struct gl_texture_object *
 radeonNewTextureObject( GLcontext *ctx, GLuint name, GLenum target )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_object *obj;
-   obj = _mesa_new_texture_object(ctx, name, target);
-   if (!obj)
-      return NULL;
-   obj->MaxAnisotropy = rmesa->initialMaxAnisotropy;
-   radeonAllocTexObj( obj );
-   return obj;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   radeonTexObj* t = CALLOC_STRUCT(radeon_tex_obj);
+
+   _mesa_initialize_texture_object(&t->base, name, target);
+   t->base.MaxAnisotropy = rmesa->radeon.initialMaxAnisotropy;
+
+   t->border_fallback = GL_FALSE;
+
+   t->pp_txfilter = RADEON_BORDER_MODE_OGL;
+   t->pp_txformat = (RADEON_TXFORMAT_ENDIAN_NO_SWAP |
+		     RADEON_TXFORMAT_PERSPECTIVE_ENABLE);
+   
+   radeonSetTexWrap( t, t->base.WrapS, t->base.WrapT );
+   radeonSetTexMaxAnisotropy( t, t->base.MaxAnisotropy );
+   radeonSetTexFilter( t, t->base.MinFilter, t->base.MagFilter );
+   radeonSetTexBorderColor( t, t->base.BorderColor );
+   return &t->base;
 }
 
 
+
 void radeonInitTextureFuncs( struct dd_function_table *functions )
 {
-   functions->ChooseTextureFormat	= radeonChooseTextureFormat;
+   functions->ChooseTextureFormat	= radeonChooseTextureFormat_mesa;
    functions->TexImage1D		= radeonTexImage1D;
    functions->TexImage2D		= radeonTexImage2D;
    functions->TexSubImage1D		= radeonTexSubImage1D;
    functions->TexSubImage2D		= radeonTexSubImage2D;
+   functions->GetTexImage               = radeonGetTexImage;
+   functions->GetCompressedTexImage     = radeonGetCompressedTexImage;
 
    functions->NewTextureObject		= radeonNewTextureObject;
-   functions->BindTexture		= radeonBindTexture;
+   //   functions->BindTexture		= radeonBindTexture;
    functions->DeleteTexture		= radeonDeleteTexture;
-   functions->IsTextureResident		= driIsTextureResident;
 
    functions->TexEnv			= radeonTexEnv;
    functions->TexParameter		= radeonTexParameter;
@@ -882,5 +471,12 @@ void radeonInitTextureFuncs( struct dd_function_table *functions )
    functions->CompressedTexImage2D	= radeonCompressedTexImage2D;
    functions->CompressedTexSubImage2D	= radeonCompressedTexSubImage2D;
 
+   functions->GenerateMipmap = radeonGenerateMipmap;
+
+   functions->NewTextureImage = radeonNewTextureImage;
+   functions->FreeTexImageData = radeonFreeTexImageData;
+   functions->MapTexture = radeonMapTexture;
+   functions->UnmapTexture = radeonUnmapTexture;
+
    driInitTextureFormats();
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_tex.h b/src/mesa/drivers/dri/radeon/radeon_tex.h
index 80008808289..a4aaddc74fa 100644
--- a/src/mesa/drivers/dri/radeon/radeon_tex.h
+++ b/src/mesa/drivers/dri/radeon/radeon_tex.h
@@ -41,12 +41,16 @@ extern void radeonSetTexOffset(__DRIcontext *pDRICtx, GLint texname,
                                unsigned long long offset, GLint depth,
                                GLuint pitch);
 
+extern void radeonSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv);
+extern void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+			       __DRIdrawable *dPriv);
+
 extern void radeonUpdateTextureState( GLcontext *ctx );
 
-extern int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t,
+extern int radeonUploadTexImages( r100ContextPtr rmesa, radeonTexObjPtr t,
 				  GLuint face );
 
-extern void radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t );
+extern void radeonDestroyTexObj( r100ContextPtr rmesa, radeonTexObjPtr t );
 
 extern void radeonInitTextureFuncs( struct dd_function_table *functions );
 
diff --git a/src/mesa/drivers/dri/radeon/radeon_texmem.c b/src/mesa/drivers/dri/radeon/radeon_texmem.c
deleted file mode 100644
index 5f7bbe6a4cb..00000000000
--- a/src/mesa/drivers/dri/radeon/radeon_texmem.c
+++ /dev/null
@@ -1,404 +0,0 @@
-/**************************************************************************
-
-Copyright 2000, 2001 ATI Technologies Inc., Ontario, Canada, and
-                     VA Linux Systems Inc., Fremont, California.
-
-All Rights Reserved.
-
-Permission is hereby granted, free of charge, to any person obtaining
-a copy of this software and associated documentation files (the
-"Software"), to deal in the Software without restriction, including
-without limitation on the rights to use, copy, modify, merge, publish,
-distribute, sub license, and/or sell copies of the Software, and to
-permit persons to whom the Software is furnished to do so, subject to
-the following conditions:
-
-The above copyright notice and this permission notice (including the
-next paragraph) shall be included in all copies or substantial
-portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
-EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
-MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
-NON-INFRINGEMENT. IN NO EVENT SHALL ATI, VA LINUX SYSTEMS AND/OR THEIR
-SUPPLIERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
-IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
-IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
-
-**************************************************************************/
-
-/*
- * Authors:
- *   Kevin E. Martin <[email protected]>
- *   Gareth Hughes <[email protected]>
- *
- */
-#include <errno.h> 
-
-#include "main/glheader.h"
-#include "main/imports.h"
-#include "main/context.h"
-#include "main/macros.h"
-
-#include "radeon_context.h"
-#include "radeon_ioctl.h"
-#include "radeon_tex.h"
-
-#include <unistd.h>  /* for usleep() */
-
-
-/**
- * Destroy any device-dependent state associated with the texture.  This may
- * include NULLing out hardware state that points to the texture.
- */
-void
-radeonDestroyTexObj( radeonContextPtr rmesa, radeonTexObjPtr t )
-{
-   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p )\n", __FUNCTION__, (void *)t, (void *)t->base.tObj );
-   }
-
-   if ( rmesa != NULL ) {
-      unsigned   i;
-
-
-      for ( i = 0 ; i < rmesa->glCtx->Const.MaxTextureUnits ; i++ ) {
-	 if ( t == rmesa->state.texture.unit[i].texobj ) {
-	    rmesa->state.texture.unit[i].texobj = NULL;
-	 }
-      }
-   }
-}
-
-
-/* ------------------------------------------------------------
- * Texture image conversions
- */
-
-
-static void radeonUploadRectSubImage( radeonContextPtr rmesa,
-				      radeonTexObjPtr t, 
-				      struct gl_texture_image *texImage,
-				      GLint x, GLint y, 
-				      GLint width, GLint height )
-{
-   const struct gl_texture_format *texFormat = texImage->TexFormat;
-   int blit_format, dstPitch, done;
-
-   switch ( texFormat->TexelBytes ) {
-   case 1:
-      blit_format = RADEON_GMC_DST_8BPP_CI;
-      break;
-   case 2:
-      blit_format = RADEON_GMC_DST_16BPP;
-      break;
-   case 4:
-      blit_format = RADEON_GMC_DST_32BPP;
-      break;
-   default:
-      fprintf( stderr, "radeonUploadRectSubImage: unknown blit_format (texelbytes=%d)\n", 
-      	       texFormat->TexelBytes);
-      return;
-   }
-
-   t->image[0][0].data = texImage->Data;
-
-   /* Currently don't need to cope with small pitches.
-    */
-   width = texImage->Width;
-   height = texImage->Height;
-   dstPitch = t->pp_txpitch + 32;
-
-   {	/* FIXME: prefer GART-texturing if possible */
-      /* Data not in GART memory, or bad pitch.
-       */
-      for (done = 0; done < height ; ) {
-	 struct radeon_dma_region region;
-	 int lines = MIN2( height - done, RADEON_BUFFER_SIZE / dstPitch );
-	 int src_pitch;
-	 char *tex;
-
-         src_pitch = texImage->RowStride * texFormat->TexelBytes;
-
-	 tex = (char *)texImage->Data + done * src_pitch;
-
-	 memset(&region, 0, sizeof(region));
-	 radeonAllocDmaRegion( rmesa, &region, lines * dstPitch, 1024 );
-
-	 /* Copy texdata to dma:
-	  */
-	 if (0)
-	    fprintf(stderr, "%s: src_pitch %d dst_pitch %d\n",
-		    __FUNCTION__, src_pitch, dstPitch);
-
-	 if (src_pitch == dstPitch) {
-	    memcpy( region.address + region.start, tex, lines * src_pitch );
-	 } 
-	 else {
-	    char *buf = region.address + region.start;
-	    int i;
-	    for (i = 0 ; i < lines ; i++) {
-	       memcpy( buf, tex, src_pitch );
-	       buf += dstPitch;
-	       tex += src_pitch;
-	    }
-	 }
-
-	 radeonEmitWait( rmesa, RADEON_WAIT_3D );
-
-	 
-
-	 /* Blit to framebuffer
-	  */
-	 radeonEmitBlit( rmesa,
-		       blit_format,
-		       dstPitch, GET_START( &region ),
-		       dstPitch, t->bufAddr,
-		       0, 0,
-		       0, done,
-		       width, lines );
-	 
-	 radeonEmitWait( rmesa, RADEON_WAIT_2D );
-
-	 radeonReleaseDmaRegion( rmesa, &region, __FUNCTION__ );
-	 done += lines;
-      }
-   }
-}
-
-
-/**
- * Upload the texture image associated with texture \a t at the specified
- * level at the address relative to \a start.
- */
-static void uploadSubImage( radeonContextPtr rmesa, radeonTexObjPtr t, 
-			    GLint hwlevel,
-			    GLint x, GLint y, GLint width, GLint height,
-			    GLuint face )
-{
-   struct gl_texture_image *texImage = NULL;
-   GLuint offset;
-   GLint imageWidth, imageHeight;
-   GLint ret;
-   drm_radeon_texture_t tex;
-   drm_radeon_tex_image_t tmp;
-   const int level = hwlevel + t->base.firstLevel;
-
-   if ( RADEON_DEBUG & DEBUG_TEXTURE ) {
-      fprintf( stderr, "%s( %p, %p ) level/width/height/face = %d/%d/%d/%u\n", 
-	       __FUNCTION__, (void *)t, (void *)t->base.tObj, level, width, height, face );
-   }
-
-   ASSERT(face < 6);
-
-   /* Ensure we have a valid texture to upload */
-   if ( ( hwlevel < 0 ) || ( hwlevel >= RADEON_MAX_TEXTURE_LEVELS ) ) {
-      _mesa_problem(NULL, "bad texture level in %s", __FUNCTION__);
-      return;
-   }
-
-   texImage = t->base.tObj->Image[face][level];
-
-   if ( !texImage ) {
-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: texImage %d is NULL!\n", __FUNCTION__, level );
-      return;
-   }
-   if ( !texImage->Data ) {
-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is NULL!\n", __FUNCTION__ );
-      return;
-   }
-
-
-   if (t->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-      assert(level == 0);
-      assert(hwlevel == 0);
-      if ( RADEON_DEBUG & DEBUG_TEXTURE )
-	 fprintf( stderr, "%s: image data is rectangular\n", __FUNCTION__);
-      radeonUploadRectSubImage( rmesa, t, texImage, x, y, width, height );
-      return;
-   }
-
-   imageWidth = texImage->Width;
-   imageHeight = texImage->Height;
-
-   offset = t->bufAddr + t->base.totalSize * face / 6;
-
-   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      GLint imageX = 0;
-      GLint imageY = 0;
-      GLint blitX = t->image[face][hwlevel].x;
-      GLint blitY = t->image[face][hwlevel].y;
-      GLint blitWidth = t->image[face][hwlevel].width;
-      GLint blitHeight = t->image[face][hwlevel].height;
-      fprintf( stderr, "   upload image: %d,%d at %d,%d\n",
-	       imageWidth, imageHeight, imageX, imageY );
-      fprintf( stderr, "   upload  blit: %d,%d at %d,%d\n",
-	       blitWidth, blitHeight, blitX, blitY );
-      fprintf( stderr, "       blit ofs: 0x%07x level: %d/%d\n",
-	       (GLuint)offset, hwlevel, level );
-   }
-
-   t->image[face][hwlevel].data = texImage->Data;
-
-   /* Init the DRM_RADEON_TEXTURE command / drm_radeon_texture_t struct.
-    * NOTE: we're always use a 1KB-wide blit and I8 texture format.
-    * We used to use 1, 2 and 4-byte texels and used to use the texture
-    * width to dictate the blit width - but that won't work for compressed
-    * textures. (Brian)
-    * NOTE: can't do that with texture tiling. (sroland)
-    */
-   tex.offset = offset;
-   tex.image = &tmp;
-   /* copy (x,y,width,height,data) */
-   memcpy( &tmp, &t->image[face][hwlevel], sizeof(drm_radeon_tex_image_t) );
-
-   if (texImage->TexFormat->TexelBytes) {
-      /* use multi-byte upload scheme */
-      tex.height = imageHeight;
-      tex.width = imageWidth;
-      tex.format = t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK;
-      tex.pitch = MAX2((texImage->Width * texImage->TexFormat->TexelBytes) / 64, 1);
-      tex.offset += tmp.x & ~1023;
-      tmp.x = tmp.x % 1024;
-      if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
-	 /* need something like "tiled coordinates" ? */
-	 tmp.y = tmp.x / (tex.pitch * 128) * 2;
-	 tmp.x = tmp.x % (tex.pitch * 128) / 2 / texImage->TexFormat->TexelBytes;
-	 tex.pitch |= RADEON_DST_TILE_MICRO >> 22;
-      }
-      else {
-	 tmp.x = tmp.x >> (texImage->TexFormat->TexelBytes >> 1);
-      }
-      if ((t->tile_bits & RADEON_TXO_MACRO_TILE) &&
-	 (texImage->Width * texImage->TexFormat->TexelBytes >= 256)) {
-	 /* radeon switches off macro tiling for small textures/mipmaps it seems */
-	 tex.pitch |= RADEON_DST_TILE_MACRO >> 22;
-      }
-   }
-   else {
-      /* In case of for instance 8x8 texture (2x2 dxt blocks), padding after the first two blocks is
-         needed (only with dxt1 since 2 dxt3/dxt5 blocks already use 32 Byte). */
-      /* set tex.height to 1/4 since 1 "macropixel" (dxt-block) has 4 real pixels. Needed
-         so the kernel module reads the right amount of data. */
-      tex.format = RADEON_TXFORMAT_I8; /* any 1-byte texel format */
-      tex.pitch = (BLIT_WIDTH_BYTES / 64);
-      tex.height = (imageHeight + 3) / 4;
-      tex.width = (imageWidth + 3) / 4;
-      switch (t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) {
-      case RADEON_TXFORMAT_DXT1:
-         tex.width *= 8;
-         break;
-      case RADEON_TXFORMAT_DXT23:
-      case RADEON_TXFORMAT_DXT45:
-         tex.width *= 16;
-         break;
-      }
-   }
-
-   LOCK_HARDWARE( rmesa );
-   do {
-      ret = drmCommandWriteRead( rmesa->dri.fd, DRM_RADEON_TEXTURE,
-                                 &tex, sizeof(drm_radeon_texture_t) );
-   } while ( ret == -EAGAIN );
-
-   UNLOCK_HARDWARE( rmesa );
-
-   if ( ret ) {
-      fprintf( stderr, "DRM_RADEON_TEXTURE: return = %d\n", ret );
-      fprintf( stderr, "   offset=0x%08x\n",
-	       offset );
-      fprintf( stderr, "   image width=%d height=%d\n",
-	       imageWidth, imageHeight );
-      fprintf( stderr, "    blit width=%d height=%d data=%p\n",
-	       t->image[face][hwlevel].width, t->image[face][hwlevel].height,
-	       t->image[face][hwlevel].data );
-      exit( 1 );
-   }
-}
-
-
-/**
- * Upload the texture images associated with texture \a t.  This might
- * require the allocation of texture memory.
- * 
- * \param rmesa Context pointer
- * \param t Texture to be uploaded
- * \param face Cube map face to be uploaded.  Zero for non-cube maps.
- */
-
-int radeonUploadTexImages( radeonContextPtr rmesa, radeonTexObjPtr t, GLuint face )
-{
-   int numLevels;
-
-   if ( !t || t->base.totalSize == 0 || t->image_override )
-      return 0;
-
-   if ( RADEON_DEBUG & (DEBUG_TEXTURE|DEBUG_IOCTL) ) {
-      fprintf( stderr, "%s( %p, %p ) sz=%d lvls=%d-%d\n", __FUNCTION__,
-	       (void *)rmesa->glCtx, (void *)t->base.tObj, t->base.totalSize,
-	       t->base.firstLevel, t->base.lastLevel );
-   }
-
-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   if (RADEON_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      radeonFinish( rmesa->glCtx );
-   }
-
-   LOCK_HARDWARE( rmesa );
-
-   if ( t->base.memBlock == NULL ) {
-      int heap;
-
-      heap = driAllocateTexture( rmesa->texture_heaps, rmesa->nr_heaps,
-				 (driTextureObject *) t );
-      if ( heap == -1 ) {
-	 UNLOCK_HARDWARE( rmesa );
-	 return -1;
-      }
-
-      /* Set the base offset of the texture image */
-      t->bufAddr = rmesa->radeonScreen->texOffset[heap] 
-	   + t->base.memBlock->ofs;
-      t->pp_txoffset = t->bufAddr;
-
-      if (!(t->base.tObj->Image[0][0]->IsClientData)) {
-	 /* hope it's safe to add that here... */
-	 t->pp_txoffset |= t->tile_bits;
-      }
-
-      /* Mark this texobj as dirty on all units:
-       */
-      t->dirty_state = TEX_ALL;
-   }
-
-
-   /* Let the world know we've used this memory recently.
-    */
-   driUpdateTextureLRU( (driTextureObject *) t );
-   UNLOCK_HARDWARE( rmesa );
-
-
-   /* Upload any images that are new */
-   if (t->base.dirty_images[face]) {
-      int i;
-      for ( i = 0 ; i < numLevels ; i++ ) {
-         if ( (t->base.dirty_images[face] & (1 << (i+t->base.firstLevel))) != 0 ) {
-            uploadSubImage( rmesa, t, i, 0, 0, t->image[face][i].width,
-			    t->image[face][i].height, face );
-         }
-      }
-      t->base.dirty_images[face] = 0;
-   }
-
-   if (RADEON_DEBUG & DEBUG_SYNC) {
-      fprintf(stderr, "%s: Syncing\n", __FUNCTION__ );
-      radeonFinish( rmesa->glCtx );
-   }
-
-   return 0;
-}
diff --git a/src/mesa/drivers/dri/radeon/radeon_texstate.c b/src/mesa/drivers/dri/radeon/radeon_texstate.c
index b165205c093..c29105d7b85 100644
--- a/src/mesa/drivers/dri/radeon/radeon_texstate.c
+++ b/src/mesa/drivers/dri/radeon/radeon_texstate.c
@@ -39,10 +39,12 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #include "main/context.h"
 #include "main/macros.h"
 #include "main/texformat.h"
+#include "main/teximage.h"
 #include "main/texobj.h"
 #include "main/enums.h"
 
 #include "radeon_context.h"
+#include "radeon_mipmap_tree.h"
 #include "radeon_state.h"
 #include "radeon_ioctl.h"
 #include "radeon_swtcl.h"
@@ -75,10 +77,11 @@ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 #define VALID_FORMAT(f) ( ((f) <= MESA_FORMAT_RGBA_DXT5) \
 			     && (tx_table[f].format != 0xffffffff) )
 
-static const struct {
+struct tx_table {
    GLuint format, filter;
-}
-tx_table[] =
+};
+
+static const struct tx_table tx_table[] =
 {
    _ALPHA(RGBA8888),
    _ALPHA_REV(RGBA8888),
@@ -111,252 +114,6 @@ tx_table[] =
 #undef _ALPHA
 #undef _INVALID
 
-/**
- * This function computes the number of bytes of storage needed for
- * the given texture object (all mipmap levels, all cube faces).
- * The \c image[face][level].x/y/width/height parameters for upload/blitting
- * are computed here.  \c pp_txfilter, \c pp_txformat, etc. will be set here
- * too.
- * 
- * \param rmesa Context pointer
- * \param tObj GL texture object whose images are to be posted to
- *                 hardware state.
- */
-static void radeonSetTexImages( radeonContextPtr rmesa,
-				struct gl_texture_object *tObj )
-{
-   radeonTexObjPtr t = (radeonTexObjPtr)tObj->DriverData;
-   const struct gl_texture_image *baseImage = tObj->Image[0][tObj->BaseLevel];
-   GLint curOffset, blitWidth;
-   GLint i, texelBytes;
-   GLint numLevels;
-   GLint log2Width, log2Height, log2Depth;
-
-   /* Set the hardware texture format
-    */
-   if ( !t->image_override ) {
-      t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
-                          RADEON_TXFORMAT_ALPHA_IN_MAP);
-      t->pp_txfilter &= ~RADEON_YUV_TO_RGB;
-
-      if ( VALID_FORMAT( baseImage->TexFormat->MesaFormat ) ) {
-         t->pp_txformat |= tx_table[ baseImage->TexFormat->MesaFormat ].format;
-         t->pp_txfilter |= tx_table[ baseImage->TexFormat->MesaFormat ].filter;
-      }
-      else {
-         _mesa_problem(NULL, "unexpected texture format in %s", __FUNCTION__);
-         return;
-      }
-   }
-
-   texelBytes = baseImage->TexFormat->TexelBytes;
-
-   /* Compute which mipmap levels we really want to send to the hardware.
-    */
-
-   if (tObj->Target != GL_TEXTURE_CUBE_MAP)
-      driCalculateTextureFirstLastLevel( (driTextureObject *) t );
-   else {
-      /* r100 can't handle mipmaps for cube/3d textures, so don't waste
-         memory for them */
-      t->base.firstLevel = t->base.lastLevel = tObj->BaseLevel;
-   }
-   log2Width  = tObj->Image[0][t->base.firstLevel]->WidthLog2;
-   log2Height = tObj->Image[0][t->base.firstLevel]->HeightLog2;
-   log2Depth  = tObj->Image[0][t->base.firstLevel]->DepthLog2;
-
-   numLevels = t->base.lastLevel - t->base.firstLevel + 1;
-
-   assert(numLevels <= RADEON_MAX_TEXTURE_LEVELS);
-
-   /* Calculate mipmap offsets and dimensions for blitting (uploading)
-    * The idea is that we lay out the mipmap levels within a block of
-    * memory organized as a rectangle of width BLIT_WIDTH_BYTES.
-    */
-   curOffset = 0;
-   blitWidth = BLIT_WIDTH_BYTES;
-   t->tile_bits = 0;
-
-   /* figure out if this texture is suitable for tiling. */
-   if (texelBytes && (tObj->Target != GL_TEXTURE_RECTANGLE_NV)) {
-      if (rmesa->texmicrotile && (baseImage->Height > 1)) {
-	 /* allow 32 (bytes) x 1 mip (which will use two times the space
-	    the non-tiled version would use) max if base texture is large enough */
-	 if ((numLevels == 1) ||
-	   (((baseImage->Width * texelBytes / baseImage->Height) <= 32) &&
-	       (baseImage->Width * texelBytes > 64)) ||
-	    ((baseImage->Width * texelBytes / baseImage->Height) <= 16)) {
-	    /* R100 has two microtile bits (only the txoffset reg, not the blitter)
-	       weird: X2 + OPT: 32bit correct, 16bit completely hosed
-		      X2: 32bit correct, 16bit correct
-		      OPT: 32bit large mips correct, small mips hosed, 16bit completely hosed */
-	    t->tile_bits |= RADEON_TXO_MICRO_TILE_X2 /*| RADEON_TXO_MICRO_TILE_OPT*/;
-	 }
-      }
-      if ((baseImage->Width * texelBytes >= 256) && (baseImage->Height >= 16)) {
-	 /* R100 disables macro tiling only if mip width is smaller than 256 bytes, and not
-	    in the case if height is smaller than 16 (not 100% sure), as does the r200,
-	    so need to disable macro tiling in that case */
-	 if ((numLevels == 1) || ((baseImage->Width * texelBytes / baseImage->Height) <= 4)) {
-	    t->tile_bits |= RADEON_TXO_MACRO_TILE;
-	 }
-      }
-   }
-
-   for (i = 0; i < numLevels; i++) {
-      const struct gl_texture_image *texImage;
-      GLuint size;
-
-      texImage = tObj->Image[0][i + t->base.firstLevel];
-      if ( !texImage )
-	 break;
-
-      /* find image size in bytes */
-      if (texImage->IsCompressed) {
-      /* need to calculate the size AFTER padding even though the texture is
-         submitted without padding.
-         Only handle pot textures currently - don't know if npot is even possible,
-         size calculation would certainly need (trivial) adjustments.
-         Align (and later pad) to 32byte, not sure what that 64byte blit width is
-         good for? */
-         if ((t->pp_txformat & RADEON_TXFORMAT_FORMAT_MASK) == RADEON_TXFORMAT_DXT1) {
-            /* RGB_DXT1/RGBA_DXT1, 8 bytes per block */
-            if ((texImage->Width + 3) < 8) /* width one block */
-               size = texImage->CompressedSize * 4;
-            else if ((texImage->Width + 3) < 16)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-         }
-         else /* DXT3/5, 16 bytes per block */
-            if ((texImage->Width + 3) < 8)
-               size = texImage->CompressedSize * 2;
-            else size = texImage->CompressedSize;
-      }
-      else if (tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
-	 size = ((texImage->Width * texelBytes + 63) & ~63) * texImage->Height;
-      }
-      else if (t->tile_bits & RADEON_TXO_MICRO_TILE_X2) {
-	 /* tile pattern is 16 bytes x2. mipmaps stay 32 byte aligned,
-	    though the actual offset may be different (if texture is less than
-	    32 bytes width) to the untiled case */
-	 int w = (texImage->Width * texelBytes * 2 + 31) & ~31;
-	 size = (w * ((texImage->Height + 1) / 2)) * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      else {
-	 int w = (texImage->Width * texelBytes + 31) & ~31;
-	 size = w * texImage->Height * texImage->Depth;
-	 blitWidth = MAX2(texImage->Width, 64 / texelBytes);
-      }
-      assert(size > 0);
-
-      /* Align to 32-byte offset.  It is faster to do this unconditionally
-       * (no branch penalty).
-       */
-
-      curOffset = (curOffset + 0x1f) & ~0x1f;
-
-      if (texelBytes) {
-	 t->image[0][i].x = curOffset; /* fix x and y coords up later together with offset */
-	 t->image[0][i].y = 0;
-	 t->image[0][i].width = MIN2(size / texelBytes, blitWidth);
-	 t->image[0][i].height = (size / texelBytes) / t->image[0][i].width;
-      }
-      else {
-         t->image[0][i].x = curOffset % BLIT_WIDTH_BYTES;
-         t->image[0][i].y = curOffset / BLIT_WIDTH_BYTES;
-         t->image[0][i].width  = MIN2(size, BLIT_WIDTH_BYTES);
-         t->image[0][i].height = size / t->image[0][i].width;     
-      }
-
-#if 0
-      /* for debugging only and only  applicable to non-rectangle targets */
-      assert(size % t->image[0][i].width == 0);
-      assert(t->image[0][i].x == 0
-             || (size < BLIT_WIDTH_BYTES && t->image[0][i].height == 1));
-#endif
-
-      if (0)
-         fprintf(stderr,
-                 "level %d: %dx%d x=%d y=%d w=%d h=%d size=%d at %d\n",
-                 i, texImage->Width, texImage->Height,
-                 t->image[0][i].x, t->image[0][i].y,
-                 t->image[0][i].width, t->image[0][i].height, size, curOffset);
-
-      curOffset += size;
-
-   }
-
-   /* Align the total size of texture memory block.
-    */
-   t->base.totalSize = (curOffset + RADEON_OFFSET_MASK) & ~RADEON_OFFSET_MASK;
-
-   /* Setup remaining cube face blits, if needed */
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      const GLuint faceSize = t->base.totalSize;
-      GLuint face;
-      /* reuse face 0 x/y/width/height - just update the offset when uploading */
-      for (face = 1; face < 6; face++) {
-         for (i = 0; i < numLevels; i++) {
-            t->image[face][i].x =  t->image[0][i].x;
-            t->image[face][i].y =  t->image[0][i].y;
-            t->image[face][i].width  = t->image[0][i].width;
-            t->image[face][i].height = t->image[0][i].height;
-         }
-      }
-      t->base.totalSize = 6 * faceSize; /* total texmem needed */
-   }
-
-   /* Hardware state:
-    */
-   t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
-   t->pp_txfilter |= (numLevels - 1) << RADEON_MAX_MIP_LEVEL_SHIFT;
-
-   t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
-		       RADEON_TXFORMAT_HEIGHT_MASK |
-                       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
-                       RADEON_TXFORMAT_F5_WIDTH_MASK |
-                       RADEON_TXFORMAT_F5_HEIGHT_MASK);
-   t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
-		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
-
-   if (tObj->Target == GL_TEXTURE_CUBE_MAP) {
-      assert(log2Width == log2Height);
-      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
-                         (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
-                         (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
-      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
-                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
-                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
-                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
-                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
-   }
-
-   t->pp_txsize = (((tObj->Image[0][t->base.firstLevel]->Width - 1) << 0) |
-                   ((tObj->Image[0][t->base.firstLevel]->Height - 1) << 16));
-
-   /* Only need to round to nearest 32 for textures, but the blitter
-    * requires 64-byte aligned pitches, and we may/may not need the
-    * blitter.   NPOT only!
-    */
-   if ( !t->image_override ) {
-      if (baseImage->IsCompressed)
-         t->pp_txpitch = (tObj->Image[0][t->base.firstLevel]->Width + 63) & ~(63);
-      else
-         t->pp_txpitch = ((tObj->Image[0][t->base.firstLevel]->Width * texelBytes) + 63) & ~(63);
-      t->pp_txpitch -= 32;
-   }
-
-   t->dirty_state = TEX_ALL;
-
-   /* FYI: radeonUploadTexImages( rmesa, t ); used to be called here */
-}
-
-
-
 /* ================================================================
  * Texture combine functions
  */
@@ -503,7 +260,7 @@ do {							\
 
 static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    const struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
    GLuint color_combine, alpha_combine;
    const GLuint color_combine0 = RADEON_COLOR_ARG_A_ZERO | RADEON_COLOR_ARG_B_ZERO
@@ -846,22 +603,21 @@ static GLboolean radeonUpdateTextureEnv( GLcontext *ctx, int unit )
 void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
                         unsigned long long offset, GLint depth, GLuint pitch)
 {
-	radeonContextPtr rmesa = pDRICtx->driverPrivate;
+	r100ContextPtr rmesa = pDRICtx->driverPrivate;
 	struct gl_texture_object *tObj =
-	    _mesa_lookup_texture(rmesa->glCtx, texname);
-	radeonTexObjPtr t;
+	    _mesa_lookup_texture(rmesa->radeon.glCtx, texname);
+	radeonTexObjPtr t = radeon_tex_obj(tObj);
 
 	if (tObj == NULL)
 		return;
 
-	t = (radeonTexObjPtr) tObj->DriverData;
-
 	t->image_override = GL_TRUE;
 
 	if (!offset)
 		return;
-
-	t->pp_txoffset = offset;
+	
+	t->bo = NULL;
+	t->override_offset = offset;
 	t->pp_txpitch = pitch - 32;
 
 	switch (depth) {
@@ -881,6 +637,125 @@ void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
 	}
 }
 
+void radeonSetTexBuffer2(__DRIcontext *pDRICtx, GLint target, GLint glx_texture_format,
+			 __DRIdrawable *dPriv)
+{
+	struct gl_texture_unit *texUnit;
+	struct gl_texture_object *texObj;
+	struct gl_texture_image *texImage;
+	struct radeon_renderbuffer *rb;
+	radeon_texture_image *rImage;
+	radeonContextPtr radeon;
+	r100ContextPtr rmesa;
+	struct radeon_framebuffer *rfb;
+	radeonTexObjPtr t;
+	uint32_t pitch_val;
+	uint32_t internalFormat, type, format;
+
+	type = GL_BGRA;
+	format = GL_UNSIGNED_BYTE;
+	internalFormat = (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT ? 3 : 4);
+
+	radeon = pDRICtx->driverPrivate;
+	rmesa = pDRICtx->driverPrivate;
+
+	rfb = dPriv->driverPrivate;
+        texUnit = &radeon->glCtx->Texture.Unit[radeon->glCtx->Texture.CurrentUnit];
+	texObj = _mesa_select_tex_object(radeon->glCtx, texUnit, target);
+        texImage = _mesa_get_tex_image(radeon->glCtx, texObj, target, 0);
+
+	rImage = get_radeon_texture_image(texImage);
+	t = radeon_tex_obj(texObj);
+        if (t == NULL) {
+    	    return;
+    	}
+
+	radeon_update_renderbuffers(pDRICtx, dPriv);
+	/* back & depth buffer are useless free them right away */
+	rb = (void*)rfb->base.Attachment[BUFFER_DEPTH].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+        rb->bo = NULL;
+	}
+	rb = (void*)rfb->base.Attachment[BUFFER_BACK_LEFT].Renderbuffer;
+	if (rb && rb->bo) {
+		radeon_bo_unref(rb->bo);
+		rb->bo = NULL;
+	}
+	rb = rfb->color_rb[0];
+	if (rb->bo == NULL) {
+		/* Failed to BO for the buffer */
+		return;
+	}
+	
+	_mesa_lock_texture(radeon->glCtx, texObj);
+	if (t->bo) {
+		radeon_bo_unref(t->bo);
+		t->bo = NULL;
+	}
+	if (rImage->bo) {
+		radeon_bo_unref(rImage->bo);
+		rImage->bo = NULL;
+	}
+	if (t->mt) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = NULL;
+	}
+	if (rImage->mt) {
+		radeon_miptree_unreference(rImage->mt);
+		rImage->mt = NULL;
+	}
+	_mesa_init_teximage_fields(radeon->glCtx, target, texImage,
+				   rb->base.Width, rb->base.Height, 1, 0, rb->cpp);
+	texImage->RowStride = rb->pitch / rb->cpp;
+	texImage->TexFormat = radeonChooseTextureFormat(radeon->glCtx,
+							internalFormat,
+							type, format, 0);
+	rImage->bo = rb->bo;
+	radeon_bo_ref(rImage->bo);
+	t->bo = rb->bo;
+	radeon_bo_ref(t->bo);
+	t->tile_bits = 0;
+	t->image_override = GL_TRUE;
+	t->override_offset = 0;
+	t->pp_txpitch &= (1 << 13) -1;
+	pitch_val = rb->pitch;
+	switch (rb->cpp) {
+	case 4:
+		if (glx_texture_format == GLX_TEXTURE_FORMAT_RGB_EXT)
+			t->pp_txformat = tx_table[MESA_FORMAT_RGB888].format;
+		else
+			t->pp_txformat = tx_table[MESA_FORMAT_ARGB8888].format;
+		t->pp_txfilter |= tx_table[MESA_FORMAT_ARGB8888].filter;
+		break;
+	case 3:
+	default:
+		t->pp_txformat = tx_table[MESA_FORMAT_RGB888].format;
+		t->pp_txfilter |= tx_table[MESA_FORMAT_RGB888].filter;
+		break;
+	case 2:
+		t->pp_txformat = tx_table[MESA_FORMAT_RGB565].format;
+		t->pp_txfilter |= tx_table[MESA_FORMAT_RGB565].filter;
+		break;
+	}
+        t->pp_txsize = ((rb->base.Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		   | ((rb->base.Height - 1) << RADEON_TEX_VSIZE_SHIFT);
+        t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
+	t->pp_txpitch = pitch_val;
+        t->pp_txpitch -= 32;
+
+	t->validated = GL_TRUE;
+	_mesa_unlock_texture(radeon->glCtx, texObj);
+	return;
+}
+
+
+void radeonSetTexBuffer(__DRIcontext *pDRICtx, GLint target, __DRIdrawable *dPriv)
+{
+        radeonSetTexBuffer2(pDRICtx, target, GLX_TEXTURE_FORMAT_RGBA_EXT, dPriv);
+}
+
+
 #define TEXOBJ_TXFILTER_MASK (RADEON_MAX_MIP_LEVEL_MASK |	\
 			      RADEON_MIN_FILTER_MASK | 		\
 			      RADEON_MAG_FILTER_MASK |		\
@@ -901,12 +776,53 @@ void radeonSetTexOffset(__DRIcontext * pDRICtx, GLint texname,
                               RADEON_TXFORMAT_NON_POWER2)
 
 
-static void import_tex_obj_state( radeonContextPtr rmesa,
+static void disable_tex_obj_state( r100ContextPtr rmesa, 
+				   int unit )
+{
+   RADEON_STATECHANGE( rmesa, tex[unit] );
+
+   RADEON_STATECHANGE( rmesa, tcl );
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
+					     RADEON_Q_BIT(unit));
+   
+   if (rmesa->radeon.TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
+     TCL_FALLBACK( rmesa->radeon.glCtx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
+     rmesa->recheck_texgen[unit] = GL_TRUE;
+   }
+
+   if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
+     /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
+	cubic_map bit on unit 2 when the unit is disabled, otherwise every
+	2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
+	units, better be safe than sorry though).*/
+     RADEON_STATECHANGE( rmesa, tex[unit] );
+     rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
+   }
+
+   {
+      GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
+      GLuint tmp = rmesa->TexGenEnabled;
+
+      rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
+      rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
+      rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
+      rmesa->TexGenNeedNormals[unit] = 0;
+      rmesa->TexGenEnabled |= 
+	(RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
+
+      if (tmp != rmesa->TexGenEnabled) {
+	rmesa->recheck_texgen[unit] = GL_TRUE;
+	rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
+      }
+   }
+}
+
+static void import_tex_obj_state( r100ContextPtr rmesa,
 				  int unit,
 				  radeonTexObjPtr texobj )
 {
 /* do not use RADEON_DB_STATE to avoid stale texture caches */
-   int *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
+   uint32_t *cmd = &rmesa->hw.tex[unit].cmd[TEX_CMD_0];
    GLuint se_coord_fmt = rmesa->hw.set.cmd[SET_SE_COORDFMT];
 
    RADEON_STATECHANGE( rmesa, tex[unit] );
@@ -915,10 +831,9 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
    cmd[TEX_PP_TXFILTER] |= texobj->pp_txfilter & TEXOBJ_TXFILTER_MASK;
    cmd[TEX_PP_TXFORMAT] &= ~TEXOBJ_TXFORMAT_MASK;
    cmd[TEX_PP_TXFORMAT] |= texobj->pp_txformat & TEXOBJ_TXFORMAT_MASK;
-   cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset;
    cmd[TEX_PP_BORDER_COLOR] = texobj->pp_border_color;
 
-   if (texobj->base.tObj->Target == GL_TEXTURE_RECTANGLE_NV) {
+   if (texobj->base.Target == GL_TEXTURE_RECTANGLE_NV) {
       GLuint *txr_cmd = RADEON_DB_STATE( txr[unit] );
       txr_cmd[TXR_PP_TEX_SIZE] = texobj->pp_txsize; /* NPOT only! */
       txr_cmd[TXR_PP_TEX_PITCH] = texobj->pp_txpitch; /* NPOT only! */
@@ -928,22 +843,12 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
    else {
       se_coord_fmt &= ~(RADEON_VTX_ST0_NONPARAMETRIC << unit);
 
-      if (texobj->base.tObj->Target == GL_TEXTURE_CUBE_MAP) {
-	 int *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
-	 GLuint bytesPerFace = texobj->base.totalSize / 6;
-	 ASSERT(texobj->base.totalSize % 6 == 0);
+      if (texobj->base.Target == GL_TEXTURE_CUBE_MAP) {
+	 uint32_t *cube_cmd = &rmesa->hw.cube[unit].cmd[CUBE_CMD_0];
 
 	 RADEON_STATECHANGE( rmesa, cube[unit] );
 	 cube_cmd[CUBE_PP_CUBIC_FACES] = texobj->pp_cubic_faces;
-	 /* dont know if this setup conforms to OpenGL.. 
-	  * at least it matches the behavior of mesa software renderer
-	  */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_0] = texobj->pp_txoffset; /* right */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_1] = texobj->pp_txoffset + 1 * bytesPerFace; /* left */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_2] = texobj->pp_txoffset + 2 * bytesPerFace; /* top */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_3] = texobj->pp_txoffset + 3 * bytesPerFace; /* bottom */
-	 cube_cmd[CUBE_PP_CUBIC_OFFSET_4] = texobj->pp_txoffset + 4 * bytesPerFace; /* front */
-	 cmd[TEX_PP_TXOFFSET] = texobj->pp_txoffset + 5 * bytesPerFace; /* back */
+	 /* state filled out in the cube_emit */
       }
    }
 
@@ -952,13 +857,11 @@ static void import_tex_obj_state( radeonContextPtr rmesa,
       rmesa->hw.set.cmd[SET_SE_COORDFMT] = se_coord_fmt;
    }
 
-   texobj->dirty_state &= ~(1<<unit);
+   rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
 }
 
 
-
-
-static void set_texgen_matrix( radeonContextPtr rmesa, 
+static void set_texgen_matrix( r100ContextPtr rmesa, 
 			       GLuint unit,
 			       const GLfloat *s_plane,
 			       const GLfloat *t_plane,
@@ -986,14 +889,14 @@ static void set_texgen_matrix( radeonContextPtr rmesa,
    rmesa->TexGenMatrix[unit].m[15] = q_plane[3];
 
    rmesa->TexGenEnabled |= RADEON_TEXMAT_0_ENABLE << unit;
-   rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+   rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
 }
 
 /* Returns GL_FALSE if fallback required.
  */
 static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
    GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
    GLuint tmp = rmesa->TexGenEnabled;
@@ -1094,283 +997,189 @@ static GLboolean radeon_validate_texgen( GLcontext *ctx, GLuint unit )
    }
 
    if (tmp != rmesa->TexGenEnabled) {
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
    }
 
    return GL_TRUE;
 }
 
-
-static void disable_tex( GLcontext *ctx, int unit )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-
-   if (rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit)) {
-      /* Texture unit disabled */
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
-
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= ~(1UL << unit);
-	 rmesa->state.texture.unit[unit].texobj = NULL;
-      }
-
-      RADEON_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= 
-	  ~((RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit);
-
-      RADEON_STATECHANGE( rmesa, tcl );
-      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] &= ~(RADEON_ST_BIT(unit) |
-						RADEON_Q_BIT(unit));
-
-      if (rmesa->TclFallback & (RADEON_TCL_FALLBACK_TEXGEN_0<<unit)) {
-	 TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), GL_FALSE);
-	 rmesa->recheck_texgen[unit] = GL_TRUE;
-      }
-
-      if (rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] & RADEON_TXFORMAT_CUBIC_MAP_ENABLE) {
-      /* this seems to be a genuine (r100 only?) hw bug. Need to remove the
-         cubic_map bit on unit 2 when the unit is disabled, otherwise every
-	 2nd (2d) mipmap on unit 0 will be broken (may not be needed for other
-	 units, better be safe than sorry though).*/
-	 RADEON_STATECHANGE( rmesa, tex[unit] );
-	 rmesa->hw.tex[unit].cmd[TEX_PP_TXFORMAT] &= ~RADEON_TXFORMAT_CUBIC_MAP_ENABLE;
-      }
-
-      {
-	 GLuint inputshift = RADEON_TEXGEN_0_INPUT_SHIFT + unit*4;
-	 GLuint tmp = rmesa->TexGenEnabled;
-
-	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenEnabled &= ~(RADEON_TEXMAT_0_ENABLE<<unit);
-	 rmesa->TexGenEnabled &= ~(RADEON_TEXGEN_INPUT_MASK<<inputshift);
-	 rmesa->TexGenNeedNormals[unit] = 0;
-	 rmesa->TexGenEnabled |= 
-	     (RADEON_TEXGEN_INPUT_TEXCOORD_0+unit) << inputshift;
-
-	 if (tmp != rmesa->TexGenEnabled) {
-	    rmesa->recheck_texgen[unit] = GL_TRUE;
-	    rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
-	 }
-      }
-   }
-}
-
-static GLboolean enable_tex_2d( GLcontext *ctx, int unit )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
-
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
-   }
-
-   ASSERT(tObj->Target == GL_TEXTURE_2D || tObj->Target == GL_TEXTURE_1D);
-
-   if ( t->base.dirty_images[0] ) {
-      RADEON_FIREVERTICES( rmesa );
-      radeonSetTexImages( rmesa, tObj );
-      radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock && !t->image_override ) 
-	return GL_FALSE;
-   }
-
-   return GL_TRUE;
-}
-
-static GLboolean enable_tex_cube( GLcontext *ctx, int unit )
+/**
+ * Compute the cached hardware register values for the given texture object.
+ *
+ * \param rmesa Context pointer
+ * \param t the r300 texture object
+ */
+static GLboolean setup_hardware_state(r100ContextPtr rmesa, radeonTexObj *t, int unit)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
-   GLuint face;
+   const struct gl_texture_image *firstImage;
+   GLint log2Width, log2Height, log2Depth, texelBytes;
 
-   /* Need to load the 2d images associated with this unit.
-    */
-   if (t->pp_txformat & RADEON_TXFORMAT_NON_POWER2) {
-      t->pp_txformat &= ~RADEON_TXFORMAT_NON_POWER2;
-      for (face = 0; face < 6; face++)
-         t->base.dirty_images[face] = ~0;
+   if ( t->bo ) {
+	return GL_TRUE;
    }
 
-   ASSERT(tObj->Target == GL_TEXTURE_CUBE_MAP);
+   firstImage = t->base.Image[0][t->mt->firstLevel];   
 
-   if ( t->base.dirty_images[0] || t->base.dirty_images[1] ||
-        t->base.dirty_images[2] || t->base.dirty_images[3] ||
-        t->base.dirty_images[4] || t->base.dirty_images[5] ) {
-      /* flush */
-      RADEON_FIREVERTICES( rmesa );
-      /* layout memory space, once for all faces */
-      radeonSetTexImages( rmesa, tObj );
+   if (firstImage->Border > 0) {
+      fprintf(stderr, "%s: border\n", __FUNCTION__);
+      return GL_FALSE;
    }
 
-   /* upload (per face) */
-   for (face = 0; face < 6; face++) {
-      if (t->base.dirty_images[face]) {
-         radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, face );
+   log2Width  = firstImage->WidthLog2;
+   log2Height = firstImage->HeightLog2;
+   log2Depth  = firstImage->DepthLog2;
+   texelBytes = firstImage->TexFormat->TexelBytes;
+
+   if (!t->image_override) {
+      if (VALID_FORMAT(firstImage->TexFormat->MesaFormat)) {
+	const struct tx_table *table = tx_table;
+
+	 t->pp_txformat &= ~(RADEON_TXFORMAT_FORMAT_MASK |
+			     RADEON_TXFORMAT_ALPHA_IN_MAP);
+	 t->pp_txfilter &= ~RADEON_YUV_TO_RGB;	 
+	 
+	 t->pp_txformat |= table[ firstImage->TexFormat->MesaFormat ].format;
+	 t->pp_txfilter |= table[ firstImage->TexFormat->MesaFormat ].filter;
+      } else {
+	 _mesa_problem(NULL, "unexpected texture format in %s",
+		       __FUNCTION__);
+	 return GL_FALSE;
       }
    }
-      
-   if ( !t->base.memBlock ) {
-      /* texmem alloc failed, use s/w fallback */
-      return GL_FALSE;
+   
+   t->pp_txfilter &= ~RADEON_MAX_MIP_LEVEL_MASK;
+   t->pp_txfilter |= (t->mt->lastLevel - t->mt->firstLevel) << RADEON_MAX_MIP_LEVEL_SHIFT;
+	
+   t->pp_txformat &= ~(RADEON_TXFORMAT_WIDTH_MASK |
+		       RADEON_TXFORMAT_HEIGHT_MASK |
+		       RADEON_TXFORMAT_CUBIC_MAP_ENABLE |
+		       RADEON_TXFORMAT_F5_WIDTH_MASK |
+		       RADEON_TXFORMAT_F5_HEIGHT_MASK);
+   t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_WIDTH_SHIFT) |
+		      (log2Height << RADEON_TXFORMAT_HEIGHT_SHIFT));
+   
+   t->tile_bits = 0;
+   
+   if (t->base.Target == GL_TEXTURE_CUBE_MAP) {
+      ASSERT(log2Width == log2Height);
+      t->pp_txformat |= ((log2Width << RADEON_TXFORMAT_F5_WIDTH_SHIFT) |
+			 (log2Height << RADEON_TXFORMAT_F5_HEIGHT_SHIFT) |
+			 /* don't think we need this bit, if it exists at all - fglrx does not set it */
+			 (RADEON_TXFORMAT_CUBIC_MAP_ENABLE));
+      t->pp_cubic_faces = ((log2Width << RADEON_FACE_WIDTH_1_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_1_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_2_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_2_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_3_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_3_SHIFT) |
+                           (log2Width << RADEON_FACE_WIDTH_4_SHIFT) |
+                           (log2Height << RADEON_FACE_HEIGHT_4_SHIFT));
    }
 
-   return GL_TRUE;
-}
-
-static GLboolean enable_tex_rect( GLcontext *ctx, int unit )
-{
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
+   t->pp_txsize = (((firstImage->Width - 1) << RADEON_TEX_USIZE_SHIFT)
+		   | ((firstImage->Height - 1) << RADEON_TEX_VSIZE_SHIFT));
 
-   if (!(t->pp_txformat & RADEON_TXFORMAT_NON_POWER2)) {
-      t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
-      t->base.dirty_images[0] = ~0;
+   if ( !t->image_override ) {
+      if (firstImage->IsCompressed)
+         t->pp_txpitch = (firstImage->Width + 63) & ~(63);
+      else
+         t->pp_txpitch = ((firstImage->Width * texelBytes) + 63) & ~(63);
+      t->pp_txpitch -= 32;
    }
 
-   ASSERT(tObj->Target == GL_TEXTURE_RECTANGLE_NV);
-
-   if ( t->base.dirty_images[0] ) {
-      RADEON_FIREVERTICES( rmesa );
-      radeonSetTexImages( rmesa, tObj );
-      radeonUploadTexImages( rmesa, (radeonTexObjPtr) tObj->DriverData, 0 );
-      if ( !t->base.memBlock &&
-           !t->image_override /* && !rmesa->prefer_gart_client_texturing  FIXME */ ) {
-	 fprintf(stderr, "%s: upload failed\n", __FUNCTION__);
-	 return GL_FALSE;
-      }
+   if (t->base.Target == GL_TEXTURE_RECTANGLE_NV) {
+      t->pp_txformat |= RADEON_TXFORMAT_NON_POWER2;
    }
 
    return GL_TRUE;
 }
 
-
-static GLboolean update_tex_common( GLcontext *ctx, int unit )
+static GLboolean radeon_validate_texture(GLcontext *ctx, struct gl_texture_object *texObj, int unit)
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
-   struct gl_texture_object *tObj = texUnit->_Current;
-   radeonTexObjPtr t = (radeonTexObjPtr) tObj->DriverData;
-   GLenum format;
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
+   radeonTexObj *t = radeon_tex_obj(texObj);
+   int ret;
 
-   /* Fallback if there's a texture border */
-   if ( tObj->Image[0][tObj->BaseLevel]->Border > 0 ) {
-      fprintf(stderr, "%s: border\n", __FUNCTION__);
+   if (!radeon_validate_texture_miptree(ctx, texObj))
       return GL_FALSE;
-   }
+
+   ret = setup_hardware_state(rmesa, t, unit);
+   if (ret == GL_FALSE)
+     return GL_FALSE;
+
    /* yuv conversion only works in first unit */
    if (unit != 0 && (t->pp_txfilter & RADEON_YUV_TO_RGB))
       return GL_FALSE;
 
-   /* Update state if this is a different texture object to last
-    * time.
-    */
-   if ( rmesa->state.texture.unit[unit].texobj != t ) {
-      if ( rmesa->state.texture.unit[unit].texobj != NULL ) {
-	 /* The old texture is no longer bound to this texture unit.
-	  * Mark it as such.
-	  */
-
-	 rmesa->state.texture.unit[unit].texobj->base.bound &= 
-	     ~(1UL << unit);
-      }
-
-      rmesa->state.texture.unit[unit].texobj = t;
-      t->base.bound |= (1UL << unit);
-      t->dirty_state |= 1<<unit;
-      driUpdateTextureLRU( (driTextureObject *) t ); /* XXX: should be locked! */
-   }
+   RADEON_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
+     (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
 
+   RADEON_STATECHANGE( rmesa, tcl );
+   rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
 
-   /* Newly enabled?
-    */
-   if ( !(rmesa->hw.ctx.cmd[CTX_PP_CNTL] & (RADEON_TEX_0_ENABLE<<unit))) {
-      RADEON_STATECHANGE( rmesa, ctx );
-      rmesa->hw.ctx.cmd[CTX_PP_CNTL] |= 
-	  (RADEON_TEX_0_ENABLE | RADEON_TEX_BLEND_0_ENABLE) << unit;
+   rmesa->recheck_texgen[unit] = GL_TRUE;
 
-      RADEON_STATECHANGE( rmesa, tcl );
-
-      rmesa->hw.tcl.cmd[TCL_OUTPUT_VTXFMT] |= RADEON_ST_BIT(unit);
-
-      rmesa->recheck_texgen[unit] = GL_TRUE;
-   }
-
-   if (t->dirty_state & (1<<unit)) {
-      import_tex_obj_state( rmesa, unit, t );
-      /* may need to update texture matrix (for texrect adjustments) */
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
-   }
+   import_tex_obj_state( rmesa, unit, t );
 
    if (rmesa->recheck_texgen[unit]) {
       GLboolean fallback = !radeon_validate_texgen( ctx, unit );
       TCL_FALLBACK( ctx, (RADEON_TCL_FALLBACK_TEXGEN_0<<unit), fallback);
       rmesa->recheck_texgen[unit] = 0;
-      rmesa->NewGLState |= _NEW_TEXTURE_MATRIX;
+      rmesa->radeon.NewGLState |= _NEW_TEXTURE_MATRIX;
    }
 
-   format = tObj->Image[0][tObj->BaseLevel]->_BaseFormat;
-   if ( rmesa->state.texture.unit[unit].format != format ||
-	rmesa->state.texture.unit[unit].envMode != texUnit->EnvMode ) {
-      rmesa->state.texture.unit[unit].format = format;
-      rmesa->state.texture.unit[unit].envMode = texUnit->EnvMode;
-      if ( ! radeonUpdateTextureEnv( ctx, unit ) ) {
-	 return GL_FALSE;
-      }
+   if ( ! radeonUpdateTextureEnv( ctx, unit ) ) {
+     return GL_FALSE;
    }
-
    FALLBACK( rmesa, RADEON_FALLBACK_BORDER_MODE, t->border_fallback );
+
+   t->validated = GL_TRUE;
    return !t->border_fallback;
 }
 
-
-
 static GLboolean radeonUpdateTextureUnit( GLcontext *ctx, int unit )
 {
-   struct gl_texture_unit *texUnit = &ctx->Texture.Unit[unit];
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
 
-   if ( texUnit->_ReallyEnabled & (TEXTURE_RECT_BIT) ) {
-      return (enable_tex_rect( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( texUnit->_ReallyEnabled & (TEXTURE_1D_BIT | TEXTURE_2D_BIT) ) {
-      return (enable_tex_2d( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
-   }
-   else if ( texUnit->_ReallyEnabled & (TEXTURE_CUBE_BIT) ) {
-      return (enable_tex_cube( ctx, unit ) &&
-	      update_tex_common( ctx, unit ));
+   if (ctx->Texture.Unit[unit]._ReallyEnabled & TEXTURE_3D_BIT) {
+     rmesa->state.texture.unit[unit].texobj = NULL;
+     return GL_FALSE;
    }
-   else if ( texUnit->_ReallyEnabled ) {
-      return GL_FALSE;
+
+   if (!ctx->Texture.Unit[unit]._ReallyEnabled) {
+     /* disable the unit */
+     disable_tex_obj_state(rmesa, unit);
+     rmesa->state.texture.unit[unit].texobj = NULL;
+     return GL_TRUE;
    }
-   else {
-      disable_tex( ctx, unit );
-      return GL_TRUE;
+
+   if (!radeon_validate_texture(ctx, ctx->Texture.Unit[unit]._Current, unit)) {
+    _mesa_warning(ctx,
+		  "failed to validate texture for unit %d.\n",
+		  unit);
+     rmesa->state.texture.unit[unit].texobj = NULL;
+     return GL_FALSE;
    }
+   rmesa->state.texture.unit[unit].texobj = radeon_tex_obj(ctx->Texture.Unit[unit]._Current);
+   return GL_TRUE;
 }
 
 void radeonUpdateTextureState( GLcontext *ctx )
 {
-   radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+   r100ContextPtr rmesa = R100_CONTEXT(ctx);
    GLboolean ok;
 
+   /* set the ctx all textures off */
+   RADEON_STATECHANGE( rmesa, ctx );
+   rmesa->hw.ctx.cmd[CTX_PP_CNTL] &= ~((RADEON_TEX_ENABLE_MASK) | (RADEON_TEX_BLEND_ENABLE_MASK));
+
    ok = (radeonUpdateTextureUnit( ctx, 0 ) &&
 	 radeonUpdateTextureUnit( ctx, 1 ) &&
 	 radeonUpdateTextureUnit( ctx, 2 ));
 
    FALLBACK( rmesa, RADEON_FALLBACK_TEXTURE, !ok );
 
-   if (rmesa->TclFallback)
+   if (rmesa->radeon.TclFallback)
       radeonChooseVertexState( ctx );
 }
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.c b/src/mesa/drivers/dri/radeon/radeon_texture.c
new file mode 100644
index 00000000000..ad501c454ce
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.c
@@ -0,0 +1,1033 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ * Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+ *
+ * The Weather Channel (TM) funded Tungsten Graphics to develop the
+ * initial release of the Radeon 8500 driver under the XFree86 license.
+ * This notice must be preserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "main/glheader.h"
+#include "main/imports.h"
+#include "main/context.h"
+#include "main/convolve.h"
+#include "main/mipmap.h"
+#include "main/texcompress.h"
+#include "main/texformat.h"
+#include "main/texstore.h"
+#include "main/teximage.h"
+#include "main/texobj.h"
+#include "main/texgetimage.h"
+
+#include "xmlpool.h"		/* for symbolic values of enum-type options */
+
+#include "radeon_common.h"
+
+#include "radeon_mipmap_tree.h"
+
+
+static void copy_rows(void* dst, GLuint dststride, const void* src, GLuint srcstride,
+	GLuint numrows, GLuint rowsize)
+{
+	assert(rowsize <= dststride);
+	assert(rowsize <= srcstride);
+
+	if (rowsize == srcstride && rowsize == dststride) {
+		memcpy(dst, src, numrows*rowsize);
+	} else {
+		GLuint i;
+		for(i = 0; i < numrows; ++i) {
+			memcpy(dst, src, rowsize);
+			dst += dststride;
+			src += srcstride;
+		}
+	}
+}
+
+/* textures */
+/**
+ * Allocate an empty texture image object.
+ */
+struct gl_texture_image *radeonNewTextureImage(GLcontext *ctx)
+{
+	return CALLOC(sizeof(radeon_texture_image));
+}
+
+/**
+ * Free memory associated with this texture image.
+ */
+void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage)
+{
+	radeon_texture_image* image = get_radeon_texture_image(timage);
+
+	if (image->mt) {
+		radeon_miptree_unreference(image->mt);
+		image->mt = 0;
+		assert(!image->base.Data);
+	} else {
+		_mesa_free_texture_image_data(ctx, timage);
+	}
+	if (image->bo) {
+		radeon_bo_unref(image->bo);
+		image->bo = NULL;
+	}
+	if (timage->Data) {
+		_mesa_free_texmemory(timage->Data);
+		timage->Data = NULL;
+	}
+}
+
+/* Set Data pointer and additional data for mapped texture image */
+static void teximage_set_map_data(radeon_texture_image *image)
+{
+	radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+
+	image->base.Data = image->mt->bo->ptr + lvl->faces[image->mtface].offset;
+	image->base.RowStride = lvl->rowstride / image->mt->bpp;
+}
+
+
+/**
+ * Map a single texture image for glTexImage and friends.
+ */
+void radeon_teximage_map(radeon_texture_image *image, GLboolean write_enable)
+{
+	if (image->mt) {
+		assert(!image->base.Data);
+
+		radeon_bo_map(image->mt->bo, write_enable);
+		teximage_set_map_data(image);
+	}
+}
+
+
+void radeon_teximage_unmap(radeon_texture_image *image)
+{
+	if (image->mt) {
+		assert(image->base.Data);
+
+		image->base.Data = 0;
+		radeon_bo_unmap(image->mt->bo);
+	}
+}
+
+static void map_override(GLcontext *ctx, radeonTexObj *t)
+{
+	radeon_texture_image *img = get_radeon_texture_image(t->base.Image[0][0]);
+
+	radeon_bo_map(t->bo, GL_FALSE);
+
+	img->base.Data = t->bo->ptr;
+	_mesa_set_fetch_functions(&img->base, 2);
+}
+
+static void unmap_override(GLcontext *ctx, radeonTexObj *t)
+{
+	radeon_texture_image *img = get_radeon_texture_image(t->base.Image[0][0]);
+
+	radeon_bo_unmap(t->bo);
+
+	img->base.Data = NULL;
+}
+
+/**
+ * Map a validated texture for reading during software rendering.
+ */
+void radeonMapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
+{
+	radeonTexObj* t = radeon_tex_obj(texObj);
+	int face, level;
+
+	if (!radeon_validate_texture_miptree(ctx, texObj))
+	  return;
+
+	/* for r100 3D sw fallbacks don't have mt */
+	if (t->image_override && t->bo)
+		map_override(ctx, t);
+
+	if (!t->mt)
+		return;
+
+	radeon_bo_map(t->mt->bo, GL_FALSE);
+	for(face = 0; face < t->mt->faces; ++face) {
+		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+			teximage_set_map_data(get_radeon_texture_image(texObj->Image[face][level]));
+	}
+}
+
+void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj)
+{
+	radeonTexObj* t = radeon_tex_obj(texObj);
+	int face, level;
+
+	if (t->image_override && t->bo)
+		unmap_override(ctx, t);
+	/* for r100 3D sw fallbacks don't have mt */
+	if (!t->mt)
+	  return;
+
+	for(face = 0; face < t->mt->faces; ++face) {
+		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level)
+			texObj->Image[face][level]->Data = 0;
+	}
+	radeon_bo_unmap(t->mt->bo);
+}
+
+GLuint radeon_face_for_target(GLenum target)
+{
+	switch (target) {
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_X:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_X:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Y:
+	case GL_TEXTURE_CUBE_MAP_POSITIVE_Z:
+	case GL_TEXTURE_CUBE_MAP_NEGATIVE_Z:
+		return (GLuint) target - (GLuint) GL_TEXTURE_CUBE_MAP_POSITIVE_X;
+	default:
+		return 0;
+	}
+}
+
+/**
+ * Wraps Mesa's implementation to ensure that the base level image is mapped.
+ *
+ * This relies on internal details of _mesa_generate_mipmap, in particular
+ * the fact that the memory for recreated texture images is always freed.
+ */
+static void radeon_generate_mipmap(GLcontext *ctx, GLenum target,
+				   struct gl_texture_object *texObj)
+{
+	radeonTexObj* t = radeon_tex_obj(texObj);
+	GLuint nr_faces = (t->base.Target == GL_TEXTURE_CUBE_MAP) ? 6 : 1;
+	int i, face;
+
+
+	_mesa_generate_mipmap(ctx, target, texObj);
+
+	for (face = 0; face < nr_faces; face++) {
+		for (i = texObj->BaseLevel + 1; i < texObj->MaxLevel; i++) {
+			radeon_texture_image *image;
+
+			image = get_radeon_texture_image(texObj->Image[face][i]);
+
+			if (image == NULL)
+				break;
+
+			image->mtlevel = i;
+			image->mtface = face;
+
+			radeon_miptree_unreference(image->mt);
+			image->mt = NULL;
+		}
+	}
+	
+}
+
+void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj)
+{
+	GLuint face = radeon_face_for_target(target);
+	radeon_texture_image *baseimage = get_radeon_texture_image(texObj->Image[face][texObj->BaseLevel]);
+
+	radeon_teximage_map(baseimage, GL_FALSE);
+	radeon_generate_mipmap(ctx, target, texObj);
+	radeon_teximage_unmap(baseimage);
+}
+
+
+/* try to find a format which will only need a memcopy */
+static const struct gl_texture_format *radeonChoose8888TexFormat(radeonContextPtr rmesa,
+								 GLenum srcFormat,
+								 GLenum srcType, GLboolean fbo)
+{
+	const GLuint ui = 1;
+	const GLubyte littleEndian = *((const GLubyte *)&ui);
+
+	/* r100 can only do this */
+	if (IS_R100_CLASS(rmesa->radeonScreen) || fbo)
+	  return _dri_texformat_argb8888;
+
+	if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
+	    (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
+	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
+	    (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && littleEndian)) {
+		return &_mesa_texformat_rgba8888;
+	} else if ((srcFormat == GL_RGBA && srcType == GL_UNSIGNED_INT_8_8_8_8_REV) ||
+		   (srcFormat == GL_RGBA && srcType == GL_UNSIGNED_BYTE && littleEndian) ||
+		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_INT_8_8_8_8) ||
+		   (srcFormat == GL_ABGR_EXT && srcType == GL_UNSIGNED_BYTE && !littleEndian)) {
+		return &_mesa_texformat_rgba8888_rev;
+	} else if (IS_R200_CLASS(rmesa->radeonScreen)) {
+		return _dri_texformat_argb8888;
+	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && !littleEndian) ||
+					    srcType == GL_UNSIGNED_INT_8_8_8_8)) {
+		return &_mesa_texformat_argb8888_rev;
+	} else if (srcFormat == GL_BGRA && ((srcType == GL_UNSIGNED_BYTE && littleEndian) ||
+					    srcType == GL_UNSIGNED_INT_8_8_8_8_REV)) {
+		return &_mesa_texformat_argb8888;
+	} else
+		return _dri_texformat_argb8888;
+}
+
+const struct gl_texture_format *radeonChooseTextureFormat_mesa(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format,
+							  GLenum type)
+{
+	return radeonChooseTextureFormat(ctx, internalFormat, format,
+					 type, 0);
+}
+
+const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format,
+							  GLenum type, GLboolean fbo)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	const GLboolean do32bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_32);
+	const GLboolean force16bpt =
+	    (rmesa->texture_depth == DRI_CONF_TEXTURE_DEPTH_FORCE_16);
+	(void)format;
+
+#if 0
+	fprintf(stderr, "InternalFormat=%s(%d) type=%s format=%s\n",
+		_mesa_lookup_enum_by_nr(internalFormat), internalFormat,
+		_mesa_lookup_enum_by_nr(type), _mesa_lookup_enum_by_nr(format));
+	fprintf(stderr, "do32bpt=%d force16bpt=%d\n", do32bpt, force16bpt);
+#endif
+
+	switch (internalFormat) {
+	case 4:
+	case GL_RGBA:
+	case GL_COMPRESSED_RGBA:
+		switch (type) {
+		case GL_UNSIGNED_INT_10_10_10_2:
+		case GL_UNSIGNED_INT_2_10_10_10_REV:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		default:
+			return do32bpt ? radeonChoose8888TexFormat(rmesa, format, type, fbo) :
+			    _dri_texformat_argb4444;
+		}
+
+	case 3:
+	case GL_RGB:
+	case GL_COMPRESSED_RGB:
+		switch (type) {
+		case GL_UNSIGNED_SHORT_4_4_4_4:
+		case GL_UNSIGNED_SHORT_4_4_4_4_REV:
+			return _dri_texformat_argb4444;
+		case GL_UNSIGNED_SHORT_5_5_5_1:
+		case GL_UNSIGNED_SHORT_1_5_5_5_REV:
+			return _dri_texformat_argb1555;
+		case GL_UNSIGNED_SHORT_5_6_5:
+		case GL_UNSIGNED_SHORT_5_6_5_REV:
+			return _dri_texformat_rgb565;
+		default:
+			return do32bpt ? _dri_texformat_argb8888 :
+			    _dri_texformat_rgb565;
+		}
+
+	case GL_RGBA8:
+	case GL_RGB10_A2:
+	case GL_RGBA12:
+	case GL_RGBA16:
+		return !force16bpt ?
+			radeonChoose8888TexFormat(rmesa, format, type, fbo) :
+			_dri_texformat_argb4444;
+
+	case GL_RGBA4:
+	case GL_RGBA2:
+		return _dri_texformat_argb4444;
+
+	case GL_RGB5_A1:
+		return _dri_texformat_argb1555;
+
+	case GL_RGB8:
+	case GL_RGB10:
+	case GL_RGB12:
+	case GL_RGB16:
+		return !force16bpt ? _dri_texformat_argb8888 :
+		    _dri_texformat_rgb565;
+
+	case GL_RGB5:
+	case GL_RGB4:
+	case GL_R3_G3_B2:
+		return _dri_texformat_rgb565;
+
+	case GL_ALPHA:
+	case GL_ALPHA4:
+	case GL_ALPHA8:
+	case GL_ALPHA12:
+	case GL_ALPHA16:
+	case GL_COMPRESSED_ALPHA:
+		/* r200: can't use a8 format since interpreting hw I8 as a8 would result
+		   in wrong rgb values (same as alpha value instead of 0). */
+		if (IS_R200_CLASS(rmesa->radeonScreen))
+			return _dri_texformat_al88;
+		else
+			return _dri_texformat_a8;
+	case 1:
+	case GL_LUMINANCE:
+	case GL_LUMINANCE4:
+	case GL_LUMINANCE8:
+	case GL_LUMINANCE12:
+	case GL_LUMINANCE16:
+	case GL_COMPRESSED_LUMINANCE:
+		return _dri_texformat_l8;
+
+	case 2:
+	case GL_LUMINANCE_ALPHA:
+	case GL_LUMINANCE4_ALPHA4:
+	case GL_LUMINANCE6_ALPHA2:
+	case GL_LUMINANCE8_ALPHA8:
+	case GL_LUMINANCE12_ALPHA4:
+	case GL_LUMINANCE12_ALPHA12:
+	case GL_LUMINANCE16_ALPHA16:
+	case GL_COMPRESSED_LUMINANCE_ALPHA:
+		return _dri_texformat_al88;
+
+	case GL_INTENSITY:
+	case GL_INTENSITY4:
+	case GL_INTENSITY8:
+	case GL_INTENSITY12:
+	case GL_INTENSITY16:
+	case GL_COMPRESSED_INTENSITY:
+		return _dri_texformat_i8;
+
+	case GL_YCBCR_MESA:
+		if (type == GL_UNSIGNED_SHORT_8_8_APPLE ||
+		    type == GL_UNSIGNED_BYTE)
+			return &_mesa_texformat_ycbcr;
+		else
+			return &_mesa_texformat_ycbcr_rev;
+
+	case GL_RGB_S3TC:
+	case GL_RGB4_S3TC:
+	case GL_COMPRESSED_RGB_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgb_dxt1;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT1_EXT:
+		return &_mesa_texformat_rgba_dxt1;
+
+	case GL_RGBA_S3TC:
+	case GL_RGBA4_S3TC:
+	case GL_COMPRESSED_RGBA_S3TC_DXT3_EXT:
+		return &_mesa_texformat_rgba_dxt3;
+
+	case GL_COMPRESSED_RGBA_S3TC_DXT5_EXT:
+		return &_mesa_texformat_rgba_dxt5;
+
+	case GL_ALPHA16F_ARB:
+		return &_mesa_texformat_alpha_float16;
+	case GL_ALPHA32F_ARB:
+		return &_mesa_texformat_alpha_float32;
+	case GL_LUMINANCE16F_ARB:
+		return &_mesa_texformat_luminance_float16;
+	case GL_LUMINANCE32F_ARB:
+		return &_mesa_texformat_luminance_float32;
+	case GL_LUMINANCE_ALPHA16F_ARB:
+		return &_mesa_texformat_luminance_alpha_float16;
+	case GL_LUMINANCE_ALPHA32F_ARB:
+		return &_mesa_texformat_luminance_alpha_float32;
+	case GL_INTENSITY16F_ARB:
+		return &_mesa_texformat_intensity_float16;
+	case GL_INTENSITY32F_ARB:
+		return &_mesa_texformat_intensity_float32;
+	case GL_RGB16F_ARB:
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGB32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+	case GL_RGBA16F_ARB:
+		return &_mesa_texformat_rgba_float16;
+	case GL_RGBA32F_ARB:
+		return &_mesa_texformat_rgba_float32;
+
+	case GL_DEPTH_COMPONENT:
+	case GL_DEPTH_COMPONENT16:
+	case GL_DEPTH_COMPONENT24:
+	case GL_DEPTH_COMPONENT32:
+	case GL_DEPTH_STENCIL_EXT:
+	case GL_DEPTH24_STENCIL8_EXT:
+		return &_mesa_texformat_s8_z24;
+
+	/* EXT_texture_sRGB */
+	case GL_SRGB:
+	case GL_SRGB8:
+	case GL_SRGB_ALPHA:
+	case GL_SRGB8_ALPHA8:
+	case GL_COMPRESSED_SRGB:
+	case GL_COMPRESSED_SRGB_ALPHA:
+		return &_mesa_texformat_srgba8;
+
+	case GL_SLUMINANCE:
+	case GL_SLUMINANCE8:
+	case GL_COMPRESSED_SLUMINANCE:
+		return &_mesa_texformat_sl8;
+
+	case GL_SLUMINANCE_ALPHA:
+	case GL_SLUMINANCE8_ALPHA8:
+	case GL_COMPRESSED_SLUMINANCE_ALPHA:
+		return &_mesa_texformat_sla8;
+
+	default:
+		_mesa_problem(ctx,
+			      "unexpected internalFormat 0x%x in %s",
+			      (int)internalFormat, __func__);
+		return NULL;
+	}
+
+	return NULL;		/* never get here */
+}
+
+/**
+ * All glTexImage calls go through this function.
+ */
+static void radeon_teximage(
+	GLcontext *ctx, int dims,
+	GLenum target, GLint level,
+	GLint internalFormat,
+	GLint width, GLint height, GLint depth,
+	GLsizei imageSize,
+	GLenum format, GLenum type, const GLvoid * pixels,
+	const struct gl_pixelstore_attrib *packing,
+	struct gl_texture_object *texObj,
+	struct gl_texture_image *texImage,
+	int compressed)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj* t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+	GLuint dstRowStride;
+	GLint postConvWidth = width;
+	GLint postConvHeight = height;
+	GLuint texelBytes;
+	GLuint face = radeon_face_for_target(target);
+
+	radeon_firevertices(rmesa);
+
+	t->validated = GL_FALSE;
+
+	if (ctx->_ImageTransferState & IMAGE_CONVOLUTION_BIT) {
+	       _mesa_adjust_image_for_convolution(ctx, dims, &postConvWidth,
+						  &postConvHeight);
+	}
+
+	/* Choose and fill in the texture format for this image */
+	texImage->TexFormat = radeonChooseTextureFormat(ctx, internalFormat, format, type, 0);
+	_mesa_set_fetch_functions(texImage, dims);
+
+	if (texImage->TexFormat->TexelBytes == 0) {
+		texelBytes = 0;
+		texImage->IsCompressed = GL_TRUE;
+		texImage->CompressedSize =
+			ctx->Driver.CompressedTextureSize(ctx, texImage->Width,
+					   texImage->Height, texImage->Depth,
+					   texImage->TexFormat->MesaFormat);
+	} else {
+		texImage->IsCompressed = GL_FALSE;
+		texImage->CompressedSize = 0;
+
+		texelBytes = texImage->TexFormat->TexelBytes;
+		/* Minimum pitch of 32 bytes */
+		if (postConvWidth * texelBytes < 32) {
+		  postConvWidth = 32 / texelBytes;
+		  texImage->RowStride = postConvWidth;
+		}
+		if (!image->mt) {      
+			assert(texImage->RowStride == postConvWidth);
+		}
+	}
+
+	/* Allocate memory for image */
+	radeonFreeTexImageData(ctx, texImage); /* Mesa core only clears texImage->Data but not image->mt */
+
+	if (t->mt &&
+	    t->mt->firstLevel == level &&
+	    t->mt->lastLevel == level &&
+	    t->mt->target != GL_TEXTURE_CUBE_MAP_ARB &&
+	    !radeon_miptree_matches_image(t->mt, texImage, face, level)) {
+	  radeon_miptree_unreference(t->mt);
+	  t->mt = NULL;
+	}
+
+	if (!t->mt)
+		radeon_try_alloc_miptree(rmesa, t, texImage, face, level);
+	if (t->mt && radeon_miptree_matches_image(t->mt, texImage, face, level)) {
+		radeon_mipmap_level *lvl;
+		image->mt = t->mt;
+		image->mtlevel = level - t->mt->firstLevel;
+		image->mtface = face;
+		radeon_miptree_reference(t->mt);
+		lvl = &image->mt->levels[image->mtlevel];
+		dstRowStride = lvl->rowstride;
+	} else {
+		int size;
+		if (texImage->IsCompressed) {
+			size = texImage->CompressedSize;
+		} else {
+			size = texImage->Width * texImage->Height * texImage->Depth * texImage->TexFormat->TexelBytes;
+		}
+		texImage->Data = _mesa_alloc_texmemory(size);
+	}
+
+	/* Upload texture image; note that the spec allows pixels to be NULL */
+	if (compressed) {
+		pixels = _mesa_validate_pbo_compressed_teximage(
+			ctx, imageSize, pixels, packing, "glCompressedTexImage");
+	} else {
+		pixels = _mesa_validate_pbo_teximage(
+			ctx, dims, width, height, depth,
+			format, type, pixels, packing, "glTexImage");
+	}
+
+	if (pixels) {
+		radeon_teximage_map(image, GL_TRUE);
+
+		if (compressed) {
+			memcpy(texImage->Data, pixels, imageSize);
+		} else {
+			GLuint dstRowStride;
+			GLuint *dstImageOffsets;
+
+			if (image->mt) {
+				radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+				dstRowStride = lvl->rowstride;
+			} else {
+				dstRowStride = texImage->Width * texImage->TexFormat->TexelBytes;
+			}
+
+			if (dims == 3) {
+				int i;
+
+				dstImageOffsets = _mesa_malloc(depth * sizeof(GLuint)) ;
+				if (!dstImageOffsets)
+					_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
+
+				for (i = 0; i < depth; ++i) {
+					dstImageOffsets[i] = dstRowStride/texImage->TexFormat->TexelBytes * height * i;
+				}
+			} else {
+				dstImageOffsets = texImage->ImageOffsets;
+			}
+
+			if (!texImage->TexFormat->StoreImage(ctx, dims,
+						texImage->_BaseFormat,
+						texImage->TexFormat,
+						texImage->Data, 0, 0, 0, /* dstX/Y/Zoffset */
+						dstRowStride,
+						dstImageOffsets,
+						width, height, depth,
+						format, type, pixels, packing))
+				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexImage");
+
+			if (dims == 3)
+				_mesa_free(dstImageOffsets);
+		}
+
+		/* SGIS_generate_mipmap */
+		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+			radeon_generate_mipmap(ctx, target, texObj);
+		}
+	}
+
+	_mesa_unmap_teximage_pbo(ctx, packing);
+
+	if (pixels)
+	  radeon_teximage_unmap(image);
+
+
+}
+
+void radeonTexImage1D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage)
+{
+	radeon_teximage(ctx, 1, target, level, internalFormat, width, 1, 1,
+		0, format, type, pixels, packing, texObj, texImage, 0);
+}
+
+void radeonTexImage2D(GLcontext * ctx, GLenum target, GLint level,
+			   GLint internalFormat,
+			   GLint width, GLint height, GLint border,
+			   GLenum format, GLenum type, const GLvoid * pixels,
+			   const struct gl_pixelstore_attrib *packing,
+			   struct gl_texture_object *texObj,
+			   struct gl_texture_image *texImage)
+
+{
+	radeon_teximage(ctx, 2, target, level, internalFormat, width, height, 1,
+		0, format, type, pixels, packing, texObj, texImage, 0);
+}
+
+void radeonCompressedTexImage2D(GLcontext * ctx, GLenum target,
+				     GLint level, GLint internalFormat,
+				     GLint width, GLint height, GLint border,
+				     GLsizei imageSize, const GLvoid * data,
+				     struct gl_texture_object *texObj,
+				     struct gl_texture_image *texImage)
+{
+	radeon_teximage(ctx, 2, target, level, internalFormat, width, height, 1,
+		imageSize, 0, 0, data, &ctx->Unpack, texObj, texImage, 1);
+}
+
+void radeonTexImage3D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint height, GLint depth,
+		      GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage)
+{
+	radeon_teximage(ctx, 3, target, level, internalFormat, width, height, depth,
+		0, format, type, pixels, packing, texObj, texImage, 0);
+}
+
+/**
+ * Update a subregion of the given texture image.
+ */
+static void radeon_texsubimage(GLcontext* ctx, int dims, GLenum target, int level,
+		GLint xoffset, GLint yoffset, GLint zoffset,
+		GLsizei width, GLsizei height, GLsizei depth,
+		GLsizei imageSize,
+		GLenum format, GLenum type,
+		const GLvoid * pixels,
+		const struct gl_pixelstore_attrib *packing,
+		struct gl_texture_object *texObj,
+		struct gl_texture_image *texImage,
+		int compressed)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj* t = radeon_tex_obj(texObj);
+	radeon_texture_image* image = get_radeon_texture_image(texImage);
+
+	radeon_firevertices(rmesa);
+
+	t->validated = GL_FALSE;
+	if (compressed) {
+		pixels = _mesa_validate_pbo_compressed_teximage(
+			ctx, imageSize, pixels, packing, "glCompressedTexImage");
+	} else {
+		pixels = _mesa_validate_pbo_teximage(ctx, dims,
+			width, height, depth, format, type, pixels, packing, "glTexSubImage1D");
+	}
+
+	if (pixels) {
+		GLint dstRowStride;
+		radeon_teximage_map(image, GL_TRUE);
+
+		if (image->mt) {
+			radeon_mipmap_level *lvl = &image->mt->levels[image->mtlevel];
+			dstRowStride = lvl->rowstride;
+		} else {
+			dstRowStride = texImage->RowStride * texImage->TexFormat->TexelBytes;
+		}
+
+		if (compressed) {
+			uint32_t srcRowStride, bytesPerRow, rows; 
+			dstRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, texImage->Width);
+			srcRowStride = _mesa_compressed_row_stride(texImage->TexFormat->MesaFormat, width);
+			bytesPerRow = srcRowStride;
+			rows = height / 4;
+
+			copy_rows(texImage->Data, dstRowStride,  image->base.Data, srcRowStride, rows,
+				  bytesPerRow);
+			
+		} else {
+			if (!texImage->TexFormat->StoreImage(ctx, dims, texImage->_BaseFormat,
+							     texImage->TexFormat, texImage->Data,
+							     xoffset, yoffset, zoffset,
+							     dstRowStride,
+							     texImage->ImageOffsets,
+							     width, height, depth,
+							     format, type, pixels, packing))
+				_mesa_error(ctx, GL_OUT_OF_MEMORY, "glTexSubImage");
+		}
+
+		/* GL_SGIS_generate_mipmap */
+		if (level == texObj->BaseLevel && texObj->GenerateMipmap) {
+			radeon_generate_mipmap(ctx, target, texObj);
+		}
+	}
+
+	radeon_teximage_unmap(image);
+
+	_mesa_unmap_teximage_pbo(ctx, packing);
+
+
+}
+
+void radeonTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset,
+			 GLsizei width,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage)
+{
+	radeon_texsubimage(ctx, 1, target, level, xoffset, 0, 0, width, 1, 1, 0,
+		format, type, pixels, packing, texObj, texImage, 0);
+}
+
+void radeonTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset, GLint yoffset,
+			 GLsizei width, GLsizei height,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage)
+{
+	radeon_texsubimage(ctx, 2, target, level, xoffset, yoffset, 0, width, height, 1,
+			   0, format, type, pixels, packing, texObj, texImage,
+			   0);
+}
+
+void radeonCompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+				   GLint level, GLint xoffset,
+				   GLint yoffset, GLsizei width,
+				   GLsizei height, GLenum format,
+				   GLsizei imageSize, const GLvoid * data,
+				   struct gl_texture_object *texObj,
+				   struct gl_texture_image *texImage)
+{
+	radeon_texsubimage(ctx, 2, target, level, xoffset, yoffset, 0, width, height, 1,
+		imageSize, format, 0, data, &ctx->Unpack, texObj, texImage, 1);
+}
+
+
+void radeonTexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset, GLint yoffset, GLint zoffset,
+			 GLsizei width, GLsizei height, GLsizei depth,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage)
+{
+	radeon_texsubimage(ctx, 3, target, level, xoffset, yoffset, zoffset, width, height, depth, 0,
+		format, type, pixels, packing, texObj, texImage, 0);
+}
+
+
+
+/**
+ * Ensure that the given image is stored in the given miptree from now on.
+ */
+static void migrate_image_to_miptree(radeon_mipmap_tree *mt, radeon_texture_image *image, int face, int level)
+{
+	radeon_mipmap_level *dstlvl = &mt->levels[level - mt->firstLevel];
+	unsigned char *dest;
+
+	assert(image->mt != mt);
+	assert(dstlvl->width == image->base.Width);
+	assert(dstlvl->height == image->base.Height);
+	assert(dstlvl->depth == image->base.Depth);
+
+
+	radeon_bo_map(mt->bo, GL_TRUE);
+	dest = mt->bo->ptr + dstlvl->faces[face].offset;
+
+	if (image->mt) {
+		/* Format etc. should match, so we really just need a memcpy().
+		 * In fact, that memcpy() could be done by the hardware in many
+		 * cases, provided that we have a proper memory manager.
+		 */
+		radeon_mipmap_level *srclvl = &image->mt->levels[image->mtlevel-image->mt->firstLevel];
+
+		assert(srclvl->size == dstlvl->size);
+		assert(srclvl->rowstride == dstlvl->rowstride);
+
+		radeon_bo_map(image->mt->bo, GL_FALSE);
+
+		memcpy(dest,
+			image->mt->bo->ptr + srclvl->faces[face].offset,
+			dstlvl->size);
+		radeon_bo_unmap(image->mt->bo);
+
+		radeon_miptree_unreference(image->mt);
+	} else {
+		uint32_t srcrowstride;
+		uint32_t height;
+		/* need to confirm this value is correct */
+		if (mt->compressed) {
+			height = image->base.Height / 4;
+			srcrowstride = image->base.RowStride * mt->bpp;
+		} else {
+			height = image->base.Height * image->base.Depth;
+			srcrowstride = image->base.Width * image->base.TexFormat->TexelBytes;
+		}
+
+//		if (mt->tilebits)
+//			WARN_ONCE("%s: tiling not supported yet", __FUNCTION__);
+
+		copy_rows(dest, dstlvl->rowstride, image->base.Data, srcrowstride,
+			  height, srcrowstride);
+
+		_mesa_free_texmemory(image->base.Data);
+		image->base.Data = 0;
+	}
+
+	radeon_bo_unmap(mt->bo);
+
+	image->mt = mt;
+	image->mtface = face;
+	image->mtlevel = level;
+	radeon_miptree_reference(image->mt);
+}
+
+int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj)
+{
+	radeonContextPtr rmesa = RADEON_CONTEXT(ctx);
+	radeonTexObj *t = radeon_tex_obj(texObj);
+	radeon_texture_image *baseimage = get_radeon_texture_image(texObj->Image[0][texObj->BaseLevel]);
+	int face, level;
+
+	if (t->validated || t->image_override)
+		return GL_TRUE;
+
+	if (RADEON_DEBUG & DEBUG_TEXTURE)
+		fprintf(stderr, "%s: Validating texture %p now\n", __FUNCTION__, texObj);
+
+	if (baseimage->base.Border > 0)
+		return GL_FALSE;
+
+	/* Ensure a matching miptree exists.
+	 *
+	 * Differing mipmap trees can result when the app uses TexImage to
+	 * change texture dimensions.
+	 *
+	 * Prefer to use base image's miptree if it
+	 * exists, since that most likely contains more valid data (remember
+	 * that the base level is usually significantly larger than the rest
+	 * of the miptree, so cubemaps are the only possible exception).
+	 */
+	if (baseimage->mt &&
+	    baseimage->mt != t->mt &&
+	    radeon_miptree_matches_texture(baseimage->mt, &t->base)) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = baseimage->mt;
+		radeon_miptree_reference(t->mt);
+	} else if (t->mt && !radeon_miptree_matches_texture(t->mt, &t->base)) {
+		radeon_miptree_unreference(t->mt);
+		t->mt = 0;
+	}
+
+	if (!t->mt) {
+		if (RADEON_DEBUG & DEBUG_TEXTURE)
+			fprintf(stderr, " Allocate new miptree\n");
+		radeon_try_alloc_miptree(rmesa, t, &baseimage->base, 0, texObj->BaseLevel);
+		if (!t->mt) {
+			_mesa_problem(ctx, "r300_validate_texture failed to alloc miptree");
+			return GL_FALSE;
+		}
+	}
+
+	/* Ensure all images are stored in the single main miptree */
+	for(face = 0; face < t->mt->faces; ++face) {
+		for(level = t->mt->firstLevel; level <= t->mt->lastLevel; ++level) {
+			radeon_texture_image *image = get_radeon_texture_image(texObj->Image[face][level]);
+			if (RADEON_DEBUG & DEBUG_TEXTURE)
+				fprintf(stderr, " face %i, level %i... %p vs %p ", face, level, t->mt, image->mt);
+			if (t->mt == image->mt) {
+				if (RADEON_DEBUG & DEBUG_TEXTURE)
+					fprintf(stderr, "OK\n");
+
+				continue;
+			}
+
+			if (RADEON_DEBUG & DEBUG_TEXTURE)
+				fprintf(stderr, "migrating\n");
+			migrate_image_to_miptree(t->mt, image, face, level);
+		}
+	}
+
+	return GL_TRUE;
+}
+
+
+/**
+ * Need to map texture image into memory before copying image data,
+ * then unmap it.
+ */
+static void
+radeon_get_tex_image(GLcontext * ctx, GLenum target, GLint level,
+		     GLenum format, GLenum type, GLvoid * pixels,
+		     struct gl_texture_object *texObj,
+		     struct gl_texture_image *texImage, int compressed)
+{
+	radeon_texture_image *image = get_radeon_texture_image(texImage);
+
+	if (image->mt) {
+		/* Map the texture image read-only */
+		radeon_teximage_map(image, GL_FALSE);
+	} else {
+		/* Image hasn't been uploaded to a miptree yet */
+		assert(image->base.Data);
+	}
+
+	if (compressed) {
+		_mesa_get_compressed_teximage(ctx, target, level, pixels,
+					      texObj, texImage);
+	} else {
+		_mesa_get_teximage(ctx, target, level, format, type, pixels,
+				   texObj, texImage);
+	}
+     
+	if (image->mt) {
+		radeon_teximage_unmap(image);
+	}
+}
+
+void
+radeonGetTexImage(GLcontext * ctx, GLenum target, GLint level,
+		  GLenum format, GLenum type, GLvoid * pixels,
+		  struct gl_texture_object *texObj,
+		  struct gl_texture_image *texImage)
+{
+	radeon_get_tex_image(ctx, target, level, format, type, pixels,
+			     texObj, texImage, 0);
+}
+
+void
+radeonGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
+			    GLvoid *pixels,
+			    struct gl_texture_object *texObj,
+			    struct gl_texture_image *texImage)
+{
+	radeon_get_tex_image(ctx, target, level, 0, 0, pixels,
+			     texObj, texImage, 1);
+}
diff --git a/src/mesa/drivers/dri/radeon/radeon_texture.h b/src/mesa/drivers/dri/radeon/radeon_texture.h
new file mode 100644
index 00000000000..888a55ba911
--- /dev/null
+++ b/src/mesa/drivers/dri/radeon/radeon_texture.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (C) 2008 Nicolai Haehnle.
+ * Copyright (C) The Weather Channel, Inc.  2002.  All Rights Reserved.
+ *
+ * The Weather Channel (TM) funded Tungsten Graphics to develop the
+ * initial release of the Radeon 8500 driver under the XFree86 license.
+ * This notice must be preserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#ifndef RADEON_TEXTURE_H
+#define RADEON_TEXTURE_H
+struct gl_texture_image *radeonNewTextureImage(GLcontext *ctx);
+void radeonFreeTexImageData(GLcontext *ctx, struct gl_texture_image *timage);
+
+void radeon_teximage_map(radeon_texture_image *image, GLboolean write_enable);
+void radeon_teximage_unmap(radeon_texture_image *image);
+void radeonMapTexture(GLcontext *ctx, struct gl_texture_object *texObj);
+void radeonUnmapTexture(GLcontext *ctx, struct gl_texture_object *texObj);
+void radeonGenerateMipmap(GLcontext* ctx, GLenum target, struct gl_texture_object *texObj);
+int radeon_validate_texture_miptree(GLcontext * ctx, struct gl_texture_object *texObj);
+GLuint radeon_face_for_target(GLenum target);
+const struct gl_texture_format *radeonChooseTextureFormat_mesa(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format,
+							  GLenum type);
+const struct gl_texture_format *radeonChooseTextureFormat(GLcontext * ctx,
+							  GLint internalFormat,
+							  GLenum format,
+							  GLenum type, GLboolean fbo);
+
+void radeonTexImage1D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage);
+void radeonTexImage2D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint height, GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage);
+void radeonCompressedTexImage2D(GLcontext * ctx, GLenum target,
+				GLint level, GLint internalFormat,
+				GLint width, GLint height, GLint border,
+				GLsizei imageSize, const GLvoid * data,
+				struct gl_texture_object *texObj,
+				struct gl_texture_image *texImage);
+void radeonTexImage3D(GLcontext * ctx, GLenum target, GLint level,
+		      GLint internalFormat,
+		      GLint width, GLint height, GLint depth,
+		      GLint border,
+		      GLenum format, GLenum type, const GLvoid * pixels,
+		      const struct gl_pixelstore_attrib *packing,
+		      struct gl_texture_object *texObj,
+		      struct gl_texture_image *texImage);
+void radeonTexSubImage1D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset,
+			 GLsizei width,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage);
+void radeonTexSubImage2D(GLcontext * ctx, GLenum target, GLint level,
+				GLint xoffset, GLint yoffset,
+				GLsizei width, GLsizei height,
+				GLenum format, GLenum type,
+				const GLvoid * pixels,
+				const struct gl_pixelstore_attrib *packing,
+				struct gl_texture_object *texObj,
+				struct gl_texture_image *texImage);
+void radeonCompressedTexSubImage2D(GLcontext * ctx, GLenum target,
+				   GLint level, GLint xoffset,
+				   GLint yoffset, GLsizei width,
+				   GLsizei height, GLenum format,
+				   GLsizei imageSize, const GLvoid * data,
+				   struct gl_texture_object *texObj,
+				   struct gl_texture_image *texImage);
+
+void radeonTexSubImage3D(GLcontext * ctx, GLenum target, GLint level,
+			 GLint xoffset, GLint yoffset, GLint zoffset,
+			 GLsizei width, GLsizei height, GLsizei depth,
+			 GLenum format, GLenum type,
+			 const GLvoid * pixels,
+			 const struct gl_pixelstore_attrib *packing,
+			 struct gl_texture_object *texObj,
+			 struct gl_texture_image *texImage);
+
+void radeonGetTexImage(GLcontext * ctx, GLenum target, GLint level,
+		       GLenum format, GLenum type, GLvoid * pixels,
+		       struct gl_texture_object *texObj,
+		       struct gl_texture_image *texImage);
+void radeonGetCompressedTexImage(GLcontext *ctx, GLenum target, GLint level,
+				 GLvoid *pixels,
+				 struct gl_texture_object *texObj,
+				 struct gl_texture_image *texImage);
+
+#endif
diff --git a/src/mesa/drivers/dri/radeon/server/radeon_reg.h b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
index ae2ccdf292a..866807462a4 100644
--- a/src/mesa/drivers/dri/radeon/server/radeon_reg.h
+++ b/src/mesa/drivers/dri/radeon/server/radeon_reg.h
@@ -2031,6 +2031,9 @@
 #define RADEON_CP_PACKET3_3D_DRAW_INDX              0xC0002A00
 #define RADEON_CP_PACKET3_LOAD_PALETTE              0xC0002C00
 #define RADEON_CP_PACKET3_3D_LOAD_VBPNTR            0xC0002F00
+#define R200_CP_CMD_3D_DRAW_VBUF_2      0xC0003400
+#define R200_CP_CMD_3D_DRAW_IMMD_2      0xC0003500
+#define R200_CP_CMD_3D_DRAW_INDX_2      0xC0003600
 #define RADEON_CP_PACKET3_CNTL_PAINT                0xC0009100
 #define RADEON_CP_PACKET3_CNTL_BITBLT               0xC0009200
 #define RADEON_CP_PACKET3_CNTL_SMALLTEXT            0xC0009300
diff --git a/src/mesa/drivers/x11/xm_dd.c b/src/mesa/drivers/x11/xm_dd.c
index 305df548fa2..9a01465bdf9 100644
--- a/src/mesa/drivers/x11/xm_dd.c
+++ b/src/mesa/drivers/x11/xm_dd.c
@@ -912,8 +912,9 @@ xmesa_update_state( GLcontext *ctx, GLbitfield new_state )
    /*
     * GL_DITHER, GL_READ/DRAW_BUFFER, buffer binding state, etc. effect
     * renderbuffer span/clear funcs.
+    * Check _NEW_COLOR to detect dither enable/disable.
     */
-   if (new_state & (_NEW_COLOR | _NEW_PIXEL | _NEW_BUFFERS)) {
+   if (new_state & (_NEW_COLOR | _NEW_BUFFERS)) {
       XMesaBuffer xmbuf = XMESA_BUFFER(ctx->DrawBuffer);
       struct xmesa_renderbuffer *front_xrb, *back_xrb;