78 files changed, 2352 insertions, 381 deletions
diff --git a/common.py b/common.py
index b44f20e8216..1d0c6a71fa5 100644
--- a/common.py
+++ b/common.py
@@ -15,6 +15,8 @@ import SCons.Script.SConscript
 # Defaults
 
 host_platform = _platform.system().lower()
+if host_platform.startswith('cygwin'):
+    host_platform = 'cygwin'
 
 # Search sys.argv[] for a "platform=foo" argument since we don't have
 # an 'env' variable at this point.
@@ -81,7 +83,7 @@ def AddOptions(opts):
 	opts.Add(EnumOption('machine', 'use machine-specific assembly code', default_machine,
 											 allowed_values=('generic', 'ppc', 'x86', 'x86_64')))
 	opts.Add(EnumOption('platform', 'target platform', host_platform,
-											 allowed_values=('linux', 'cell', 'windows', 'winddk', 'wince', 'darwin', 'embedded', 'cygwin_nt-5.1', 'cygwin_nt-6.1', 'sunos5', 'freebsd8')))
+											 allowed_values=('linux', 'cell', 'windows', 'winddk', 'wince', 'darwin', 'embedded', 'cygwin', 'sunos5', 'freebsd8')))
 	opts.Add('toolchain', 'compiler toolchain', default_toolchain)
 	opts.Add(BoolOption('gles', 'EXPERIMENTAL: enable OpenGL ES support', 'no'))
 	opts.Add(BoolOption('llvm', 'use LLVM', default_llvm))
diff --git a/docs/GL3.txt b/docs/GL3.txt
index 9ff25a95297..0c53bc42d54 100644
--- a/docs/GL3.txt
+++ b/docs/GL3.txt
@@ -27,7 +27,7 @@ Non-normalized Integer texture/framebuffer formats    ~50% done
 1D/2D Texture arrays                                  core Mesa, swrast done
 Packed depth/stencil formats                          DONE
 Per-buffer blend and masks (GL_EXT_draw_buffers2)     DONE
-GL_EXT_texture_compression_rgtc                       not started
+GL_EXT_texture_compression_rgtc                       DONE (swrast, gallium r600)
 Red and red/green texture formats                     DONE (swrast, i965, gallium)
 Transform feedback (GL_EXT_transform_feedback)        ~50% done
    glBindFragDataLocation, glGetFragDataLocation,
diff --git a/docs/MESA_multithread_makecurrent.spec b/docs/MESA_multithread_makecurrent.spec
new file mode 100644
index 00000000000..5065c2fc0a3
--- /dev/null
+++ b/docs/MESA_multithread_makecurrent.spec
@@ -0,0 +1,158 @@
+Name
+
+    MESA_multithread_makecurrent
+
+Name Strings
+
+    GLX_MESA_multithread_makecurrent
+
+Contact
+
+    Eric Anholt ([email protected])
+
+Status
+
+    Not shipping.
+
+Version
+
+    Last Modified Date:  21 February 2011
+
+Number
+
+    TBD
+
+Dependencies
+
+    OpenGL 1.0 or later is required.
+    GLX 1.3 or later is required.
+
+Overview
+
+    The GLX context setup encourages multithreaded applications to
+    create a context per thread which each operate on their own
+    objects in parallel, and leaves synchronization for write access
+    to shared objects up to the application.
+
+    For some applications, maintaining per-thread contexts and
+    ensuring that the glFlush happens in one thread before another
+    thread starts working on that object is difficult.  For them,
+    using the same context across multiple threads and protecting its
+    usage with a mutex is both higher performance and easier to
+    implement.  This extension gives those applications that option by
+    relaxing the context binding requirements.
+
+    This new behavior matches the requirements of AGL, while providing
+    a feature not specified in WGL.
+
+IP Status
+
+    Open-source; freely implementable.
+
+Issues
+
+    None.
+
+New Procedures and Functions
+
+    None.
+
+New Tokens
+
+    None.
+
+Changes to Chapter 2 of the GLX 1.3 Specification (GLX Operation)
+
+    Replace the following sentence from section 2.2 Rendering Contexts:
+	In addition, a rendering context can be current for only one
+	thread at a time.
+    with:
+	In addition, an indirect rendering context can be current for
+	only one thread at a time.  A direct rendering context may be
+	current to multiple threads, with synchronization of access to
+	the context through the GL managed by the application through
+	mutexes.
+
+Changes to Chapter 3 of the GLX 1.3 Specification (Functions and Errors)
+
+    Replace the following sentence from section 3.3.7 Rendering Contexts:
+	If ctx is current to some other thread, then
+	glXMakeContextCurrent will generate a BadAccess error.
+    with:
+	If ctx is an indirect context current to some other thread,
+	then glXMakeContextCurrent will generate a BadAccess error.
+
+    Replace the following sentence from section 3.5 Rendering Contexts:
+	If ctx is current to some other thread, then
+	glXMakeCurrent will generate a BadAccess error.
+    with:
+	If ctx is an indirect context current to some other thread,
+	then glXMakeCurrent will generate a BadAccess error.
+
+GLX Protocol
+
+    None.  The GLX extension only extends to direct rendering contexts.
+
+Errors
+
+    None.
+
+New State
+
+    None.
+
+Issues
+
+    (1) What happens if the app binds a context/drawable in multiple
+	threads, then binds a different context/drawable in one of them?
+
+    As with binding a new context from the current thread, the old
+    context's refcount is reduced and the new context's refcount is
+    increased.
+
+    (2) What happens if the app binds a context/drawable in multiple
+	threads, then binds None/None in one of them?
+
+    The GLX context is unreferenced from that thread, and the other
+    threads retain their GLX context binding.
+
+    (3) What happens if the app binds a context/drawable in 7 threads,
+	then destroys the context in one of them?
+
+    As with GLX context destruction previously, the XID is destroyed
+    but the context remains usable by threads that have the context
+    current.
+
+    (4) What happens if the app binds a new drawable/readable with
+        glXMakeCurrent() when it is already bound to another thread?
+
+    The context becomes bound to the new drawable/readable, and
+    further rendering in either thread will use the new
+    drawable/readable.
+
+    (5) What requirements should be placed on the user managing contexts
+        from multiple threads?
+
+    The intention is to allow multithreaded access to the GL at the
+    minimal performance cost, so requiring that the GL do general
+    synchronization (beyond that already required by context sharing)
+    is not an option, and synchronizing of GL's access to the GL
+    context between multiple threads is left to the application to do
+    across GL calls.  However, it would be unfortunate for a library
+    doing multithread_makecurrent to require that other libraries
+    share in synchronization for binding of their own contexts, so the
+    refcounting of the contexts is required to be threadsafe.
+
+    (6) Does this apply to indirect contexts?
+
+    This was ignored in the initial revision of the spec.  Behavior
+    for indirect contexts is left as-is.
+
+Revision History
+
+    20 November 2009 Eric Anholt - initial specification
+    22 November 2009 Eric Anholt - added issues from Ian Romanick.
+    3 February 2011 Eric Anholt - updated with resolution to issues 1-3
+    3 February 2011 Eric Anholt - added issues 4, 5
+    21 February 2011 Eric Anholt - Include glXMakeCurrent() sentence
+    along with glXMakeContextCurrent() for removal.
diff --git a/docs/relnotes-7.11.html b/docs/relnotes-7.11.html
index 6c6622ed3f4..4b1730b17ec 100644
--- a/docs/relnotes-7.11.html
+++ b/docs/relnotes-7.11.html
@@ -38,6 +38,7 @@ tbd
 <ul>
 <li>GL_ARB_draw_instanced extension (gallium drivers, swrast)
 <li>GL_ARB_instanced_arrays extension (gallium drivers)
+<li>GL_ARB_texture_compression_rgtc (gallium r600, swrast)
 <li>GL_ARB_draw_buffers_blend (gallium)
 <li>GL_EXT_texture_sRGB_decode (gallium drivers, swrast, i965)
 </ul>
diff --git a/scons/gallium.py b/scons/gallium.py
index 9118257ac05..112f6c89dca 100755
--- a/scons/gallium.py
+++ b/scons/gallium.py
@@ -195,6 +195,8 @@ def generate(env):
     # Determine whether we are cross compiling; in particular, whether we need
     # to compile code generators with a different compiler as the target code.
     host_platform = _platform.system().lower()
+    if host_platform.startswith('cygwin'):
+        host_platform = 'cygwin'
     host_machine = os.environ.get('PROCESSOR_ARCHITEW6432', os.environ.get('PROCESSOR_ARCHITECTURE', _platform.machine()))
     host_machine = {
         'x86': 'x86',
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index e685f4b73f0..1fec3adf5b1 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -832,14 +832,14 @@ lp_build_sample_mipmap(struct lp_build_sample_context *bld,
                        LLVMValueRef *colors_out)
 {
    LLVMBuilderRef builder = bld->gallivm->builder;
-   LLVMValueRef size0;
-   LLVMValueRef size1;
-   LLVMValueRef row_stride0_vec;
-   LLVMValueRef row_stride1_vec;
-   LLVMValueRef img_stride0_vec;
-   LLVMValueRef img_stride1_vec;
-   LLVMValueRef data_ptr0;
-   LLVMValueRef data_ptr1;
+   LLVMValueRef size0 = NULL;
+   LLVMValueRef size1 = NULL;
+   LLVMValueRef row_stride0_vec = NULL;
+   LLVMValueRef row_stride1_vec = NULL;
+   LLVMValueRef img_stride0_vec = NULL;
+   LLVMValueRef img_stride1_vec = NULL;
+   LLVMValueRef data_ptr0 = NULL;
+   LLVMValueRef data_ptr1 = NULL;
    LLVMValueRef colors0[4], colors1[4];
    unsigned chan;
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_ureg.h b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
index b8d193f3f89..9d5553f0ea0 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_ureg.h
+++ b/src/gallium/auxiliary/tgsi/tgsi_ureg.h
@@ -43,24 +43,24 @@ struct ureg_program;
  */
 struct ureg_src
 {
-   unsigned File        : 4;  /* TGSI_FILE_ */
-   unsigned SwizzleX    : 2;  /* TGSI_SWIZZLE_ */
-   unsigned SwizzleY    : 2;  /* TGSI_SWIZZLE_ */
-   unsigned SwizzleZ    : 2;  /* TGSI_SWIZZLE_ */
-   unsigned SwizzleW    : 2;  /* TGSI_SWIZZLE_ */
-   unsigned Indirect    : 1;  /* BOOL */
-   unsigned DimIndirect : 1;  /* BOOL */
-   unsigned Dimension   : 1;  /* BOOL */
-   unsigned Absolute    : 1;  /* BOOL */
-   unsigned Negate      : 1;  /* BOOL */
-   int      Index       : 16; /* SINT */
+   unsigned File             : 4;  /* TGSI_FILE_ */
+   unsigned SwizzleX         : 2;  /* TGSI_SWIZZLE_ */
+   unsigned SwizzleY         : 2;  /* TGSI_SWIZZLE_ */
+   unsigned SwizzleZ         : 2;  /* TGSI_SWIZZLE_ */
+   unsigned SwizzleW         : 2;  /* TGSI_SWIZZLE_ */
+   unsigned Indirect         : 1;  /* BOOL */
+   unsigned DimIndirect      : 1;  /* BOOL */
+   unsigned Dimension        : 1;  /* BOOL */
+   unsigned Absolute         : 1;  /* BOOL */
+   unsigned Negate           : 1;  /* BOOL */
    unsigned IndirectFile     : 4;  /* TGSI_FILE_ */
-   int      IndirectIndex    : 16; /* SINT */
    unsigned IndirectSwizzle  : 2;  /* TGSI_SWIZZLE_ */
-   int      DimensionIndex   : 16; /* SINT */
    unsigned DimIndFile       : 4;  /* TGSI_FILE_ */
-   int      DimIndIndex      : 16; /* SINT */
    unsigned DimIndSwizzle    : 2;  /* TGSI_SWIZZLE_ */
+   int      Index            : 16; /* SINT */
+   int      IndirectIndex    : 16; /* SINT */
+   int      DimensionIndex   : 16; /* SINT */
+   int      DimIndIndex      : 16; /* SINT */
 };
 
 /* Very similar to a tgsi_dst_register, removing unsupported fields
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index 3b6342ad8d1..4f1b0e71934 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -67,7 +67,7 @@ struct gen_mipmap_state
    struct pipe_vertex_element velem[2];
 
    void *vs;
-   void *fs1d, *fs2d, *fs3d, *fsCube;
+   void *fs1d, *fs2d, *fs3d, *fsCube, *fs1da, *fs2da;
 
    struct pipe_resource *vbuf;  /**< quad vertices */
    unsigned vbuf_slot;
@@ -1321,6 +1321,13 @@ util_create_gen_mipmap(struct pipe_context *pipe,
                                                TGSI_INTERPOLATE_LINEAR);
    ctx->fsCube = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_CUBE,
                                                TGSI_INTERPOLATE_LINEAR);
+   if (pipe->screen->get_param(pipe->screen, PIPE_CAP_ARRAY_TEXTURES)) {
+      ctx->fs1da = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_1D_ARRAY,
+                                                 TGSI_INTERPOLATE_LINEAR);
+      ctx->fs2da = util_make_fragment_tex_shader(pipe, TGSI_TEXTURE_2D_ARRAY,
+                                                 TGSI_INTERPOLATE_LINEAR);
+   }
+
 
    /* vertex data that doesn't change */
    for (i = 0; i < 4; i++) {
@@ -1390,8 +1397,25 @@ set_vertex_data(struct gen_mipmap_state *ctx,
       util_map_texcoords2d_onto_cubemap(layer, &st[0][0], 2,
                                         &ctx->vertices[0][1][0], 8);
    }
-   else {
-      /* 1D/2D/3D */
+   else if (tex_target == PIPE_TEXTURE_1D_ARRAY) {
+      /* 1D texture array  */
+      ctx->vertices[0][1][0] = 0.0f; /*s*/
+      ctx->vertices[0][1][1] = r; /*t*/
+      ctx->vertices[0][1][2] = 0.0f;    /*r*/
+
+      ctx->vertices[1][1][0] = 1.0f;
+      ctx->vertices[1][1][1] = r;
+      ctx->vertices[1][1][2] = 0.0f;
+
+      ctx->vertices[2][1][0] = 1.0f;
+      ctx->vertices[2][1][1] = r;
+      ctx->vertices[2][1][2] = 0.0f;
+
+      ctx->vertices[3][1][0] = 0.0f;
+      ctx->vertices[3][1][1] = r;
+      ctx->vertices[3][1][2] = 0.0f;
+   } else {
+      /* 1D/2D/3D/2D array */
       ctx->vertices[0][1][0] = 0.0f; /*s*/
       ctx->vertices[0][1][1] = 0.0f; /*t*/
       ctx->vertices[0][1][2] = r;    /*r*/
@@ -1427,6 +1451,10 @@ util_destroy_gen_mipmap(struct gen_mipmap_state *ctx)
 {
    struct pipe_context *pipe = ctx->pipe;
 
+   if (ctx->fs2da)
+      pipe->delete_fs_state(pipe, ctx->fs2da);
+   if (ctx->fs1da)
+      pipe->delete_fs_state(pipe, ctx->fs1da);
    pipe->delete_fs_state(pipe, ctx->fsCube);
    pipe->delete_fs_state(pipe, ctx->fs3d);
    pipe->delete_fs_state(pipe, ctx->fs2d);
@@ -1499,7 +1527,11 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
       fs = ctx->fsCube;
       break;
    case PIPE_TEXTURE_1D_ARRAY:
+      fs = ctx->fs1da;
+      break;
    case PIPE_TEXTURE_2D_ARRAY:
+      fs = ctx->fs2da;
+      break;
    default:
       assert(0);
       fs = ctx->fs2d;
@@ -1555,6 +1587,8 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
 
       if (pt->target == PIPE_TEXTURE_3D)
          nr_layers = u_minify(pt->depth0, dstLevel);
+      else if (pt->target == PIPE_TEXTURE_2D_ARRAY || pt->target == PIPE_TEXTURE_1D_ARRAY)
+	 nr_layers = pt->array_size;
       else
          nr_layers = 1;
 
@@ -1564,11 +1598,12 @@ util_gen_mipmap(struct gen_mipmap_state *ctx,
             /* in theory with geom shaders and driver with full layer support
                could do that in one go. */
             layer = i;
-            offset = 1.0f / (float)(nr_layers * 2);
             /* XXX hmm really? */
             rcoord = (float)layer / (float)nr_layers + 1.0f / (float)(nr_layers * 2);
-         }
-         else
+         } else if (pt->target == PIPE_TEXTURE_2D_ARRAY || pt->target == PIPE_TEXTURE_1D_ARRAY) {
+	    layer = i;
+	    rcoord = (float)layer;
+	 } else
             layer = face;
 
          memset(&surf_templ, 0, sizeof(surf_templ));
diff --git a/src/gallium/auxiliary/util/u_vbuf_mgr.c b/src/gallium/auxiliary/util/u_vbuf_mgr.c
index dec8dd717e8..3cf8ee0831d 100644
--- a/src/gallium/auxiliary/util/u_vbuf_mgr.c
+++ b/src/gallium/auxiliary/util/u_vbuf_mgr.c
@@ -531,7 +531,10 @@ static void u_vbuf_upload_buffers(struct u_vbuf_mgr_priv *mgr,
          unsigned first, size;
          boolean flushed;
 
-         if (vb->stride) {
+         if (mgr->ve->ve[i].instance_divisor) {
+            first = 0;
+            size = vb->buffer->width0;
+         } else if (vb->stride) {
             first = vb->stride * min_index;
             size = vb->stride * count;
          } else {
diff --git a/src/gallium/drivers/i915/i915_batchbuffer.h b/src/gallium/drivers/i915/i915_batchbuffer.h
index d92b2ccb31e..b4a91dabb37 100644
--- a/src/gallium/drivers/i915/i915_batchbuffer.h
+++ b/src/gallium/drivers/i915/i915_batchbuffer.h
@@ -75,6 +75,14 @@ i915_winsys_batchbuffer_write(struct i915_winsys_batchbuffer *batch,
    batch->ptr += size;
 }
 
+static INLINE boolean
+i915_winsys_validate_buffers(struct i915_winsys_batchbuffer *batch,
+			     struct i915_winsys_buffer **buffers,
+			     int num_of_buffers)
+{
+   return batch->iws->validate_buffers(batch, buffers, num_of_buffers);
+}
+
 static INLINE int
 i915_winsys_batchbuffer_reloc(struct i915_winsys_batchbuffer *batch,
                               struct i915_winsys_buffer *buffer,
diff --git a/src/gallium/drivers/i915/i915_blit.c b/src/gallium/drivers/i915/i915_blit.c
index 97c25665156..f885417f8ed 100644
--- a/src/gallium/drivers/i915/i915_blit.c
+++ b/src/gallium/drivers/i915/i915_blit.c
@@ -49,6 +49,11 @@ i915_fill_blit(struct i915_context *i915,
    I915_DBG(DBG_BLIT, "%s dst:buf(%p)/%d+%d %d,%d sz:%dx%d\n",
             __FUNCTION__, dst_buffer, dst_pitch, dst_offset, x, y, w, h);
 
+   if(!i915_winsys_validate_buffers(i915->batch, &dst_buffer, 1)) {
+      FLUSH_BATCH(NULL);
+      assert(i915_winsys_validate_buffers(i915->batch, &dst_buffer, 1));
+   }
+
    switch (cpp) {
    case 1:
    case 2:
@@ -76,6 +81,8 @@ i915_fill_blit(struct i915_context *i915,
    OUT_BATCH(((y + h) << 16) | (x + w));
    OUT_RELOC_FENCED(dst_buffer, I915_USAGE_2D_TARGET, dst_offset);
    OUT_BATCH(color);
+
+   i915_set_flush_dirty(i915, I915_FLUSH_CACHE);
 }
 
 void
@@ -94,6 +101,7 @@ i915_copy_blit(struct i915_context *i915,
    unsigned CMD, BR13;
    int dst_y2 = dst_y + h;
    int dst_x2 = dst_x + w;
+   struct i915_winsys_buffer *buffers[2] = {src_buffer, dst_buffer};
 
 
    I915_DBG(DBG_BLIT,
@@ -102,6 +110,11 @@ i915_copy_blit(struct i915_context *i915,
             src_buffer, src_pitch, src_offset, src_x, src_y,
             dst_buffer, dst_pitch, dst_offset, dst_x, dst_y, w, h);
 
+   if(!i915_winsys_validate_buffers(i915->batch, buffers, 2)) {
+      FLUSH_BATCH(NULL);
+      assert(i915_winsys_validate_buffers(i915->batch, buffers, 2));
+   }
+
    switch (cpp) {
    case 1:
    case 2:
@@ -142,4 +155,6 @@ i915_copy_blit(struct i915_context *i915,
    OUT_BATCH((src_y << 16) | src_x);
    OUT_BATCH(((int) src_pitch & 0xffff));
    OUT_RELOC_FENCED(src_buffer, I915_USAGE_2D_SOURCE, src_offset);
+
+   i915_set_flush_dirty(i915, I915_FLUSH_CACHE);
 }
diff --git a/src/gallium/drivers/i915/i915_context.c b/src/gallium/drivers/i915/i915_context.c
index 707b2e9f956..cbf919754e5 100644
--- a/src/gallium/drivers/i915/i915_context.c
+++ b/src/gallium/drivers/i915/i915_context.c
@@ -73,10 +73,13 @@ i915_draw_vbo(struct pipe_context *pipe, const struct pipe_draw_info *info)
    draw_set_mapped_index_buffer(draw, mapped_indices);
 
    if (cbuf_dirty) {
-      draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0,
-                                      i915_buffer(i915->constants[PIPE_SHADER_VERTEX])->data,
-                                      (i915->current.num_user_constants[PIPE_SHADER_VERTEX] * 
+      if (i915->constants[PIPE_SHADER_VERTEX])
+         draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0,
+                                         i915_buffer(i915->constants[PIPE_SHADER_VERTEX])->data,
+                                         (i915->current.num_user_constants[PIPE_SHADER_VERTEX] * 
                                          4 * sizeof(float)));
+      else
+         draw_set_mapped_constant_buffer(draw, PIPE_SHADER_VERTEX, 0, NULL, 0);
    }
 
    /*
@@ -165,6 +168,7 @@ i915_create_context(struct pipe_screen *screen, void *priv)
    i915->hardware_dirty = ~0;
    i915->immediate_dirty = ~0;
    i915->dynamic_dirty = ~0;
+   i915->flush_dirty = 0;
 
    /* Batch stream debugging is a bit hacked up at the moment:
     */
diff --git a/src/gallium/drivers/i915/i915_context.h b/src/gallium/drivers/i915/i915_context.h
index 2cf53424f06..1da637d068e 100644
--- a/src/gallium/drivers/i915/i915_context.h
+++ b/src/gallium/drivers/i915/i915_context.h
@@ -150,6 +150,15 @@ struct i915_state
    /** Describes the current hardware vertex layout */
    struct vertex_info vertex_info;
 
+   /* static state (dst/depth buffer state) */
+   struct i915_winsys_buffer *cbuf_bo;
+   unsigned cbuf_flags;
+   struct i915_winsys_buffer *depth_bo;
+   unsigned depth_flags;
+   unsigned dst_buf_vars;
+   uint32_t draw_offset;
+   uint32_t draw_size;
+
    unsigned id;			/* track lost context events */
 };
 
@@ -237,6 +246,10 @@ struct i915_context {
    unsigned hardware_dirty;
    unsigned immediate_dirty;
    unsigned dynamic_dirty;
+   unsigned flush_dirty;
+
+   struct i915_winsys_buffer *validation_buffers[2 + 1 + I915_TEX_UNITS];
+   int num_validation_buffers;
 
    struct util_slab_mempool transfer_pool;
 };
@@ -277,6 +290,18 @@ struct i915_context {
 #define I915_HW_CONSTANTS         (1<<I915_CACHE_CONSTANTS)
 #define I915_HW_IMMEDIATE         (1<<(I915_MAX_CACHE+0))
 #define I915_HW_INVARIANT         (1<<(I915_MAX_CACHE+1))
+#define I915_HW_FLUSH             (1<<(I915_MAX_CACHE+1))
+
+/* hw flush handling */
+#define I915_FLUSH_CACHE		1
+#define I915_PIPELINE_FLUSH		2
+
+static INLINE
+void i915_set_flush_dirty(struct i915_context *i915, unsigned flush)
+{
+   i915->hardware_dirty |= I915_HW_FLUSH;
+   i915->flush_dirty |= flush;
+}
 
 
 /***********************************************************************
diff --git a/src/gallium/drivers/i915/i915_flush.c b/src/gallium/drivers/i915/i915_flush.c
index 911c051d1f2..22a2c7b2cb4 100644
--- a/src/gallium/drivers/i915/i915_flush.c
+++ b/src/gallium/drivers/i915/i915_flush.c
@@ -96,4 +96,6 @@ void i915_flush(struct i915_context *i915, struct pipe_fence_handle **fence)
    i915->hardware_dirty = ~0;
    i915->immediate_dirty = ~0;
    i915->dynamic_dirty = ~0;
+   /* kernel emits flushes in between batchbuffers */
+   i915->flush_dirty = 0;
 }
diff --git a/src/gallium/drivers/i915/i915_state_emit.c b/src/gallium/drivers/i915/i915_state_emit.c
index 509d487b498..0323ad940f9 100644
--- a/src/gallium/drivers/i915/i915_state_emit.c
+++ b/src/gallium/drivers/i915/i915_state_emit.c
@@ -36,73 +36,105 @@
 #include "pipe/p_defines.h"
 
 #include "util/u_math.h"
+#include "util/u_memory.h"
 
-static unsigned translate_format( enum pipe_format format )
+struct i915_tracked_hw_state {
+   const char *name;
+   void (*validate)(struct i915_context *);
+   void (*emit)(struct i915_context *);
+   unsigned dirty, batch_space;
+};
+
+
+static void
+emit_flush(struct i915_context *i915)
 {
-   switch (format) {
-   case PIPE_FORMAT_B8G8R8A8_UNORM:
-      return COLOR_BUF_ARGB8888;
-   case PIPE_FORMAT_B5G6R5_UNORM:
-      return COLOR_BUF_RGB565;
-   default:
-      assert(0);
-      return 0;
-   }
+   /* Cache handling is very cheap atm. State handling can request two flushes:
+    * - I915_FLUSH_CACHE which is a flush everything request and
+    * - I915_PIPELINE_FLUSH which is specifically for the draw_offset flush.
+    * Because the cache handling is so dumb, no explicit "invalidate map cache".
+    * Also, the first is a strict superset of the latter, so the following logic
+    * works. */
+   if (i915->flush_dirty & I915_FLUSH_CACHE)
+      OUT_BATCH(MI_FLUSH | FLUSH_MAP_CACHE);
+   else if (i915->flush_dirty & I915_PIPELINE_FLUSH)
+      OUT_BATCH(MI_FLUSH | INHIBIT_FLUSH_RENDER_CACHE);
+}
+
+static void
+validate_immediate(struct i915_context *i915)
+{
+   if (i915->immediate_dirty & (1 << I915_IMMEDIATE_S0))
+      i915->validation_buffers[i915->num_validation_buffers++] = i915->vbo;
+}
+
+static void
+validate_static(struct i915_context *i915)
+{
+   if (i915->current.cbuf_bo)
+      i915->validation_buffers[i915->num_validation_buffers++]
+         = i915->current.cbuf_bo;
+
+   if (i915->current.depth_bo)
+      i915->validation_buffers[i915->num_validation_buffers++]
+         = i915->current.depth_bo;
 }
 
-static unsigned translate_depth_format( enum pipe_format zformat )
+static void
+validate_map(struct i915_context *i915)
 {
-   switch (zformat) {
-   case PIPE_FORMAT_Z24X8_UNORM:
-   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
-      return DEPTH_FRMT_24_FIXED_8_OTHER;
-   case PIPE_FORMAT_Z16_UNORM:
-      return DEPTH_FRMT_16_FIXED;
-   default:
-      assert(0);
-      return 0;
+   const uint enabled = i915->current.sampler_enable_flags;
+   uint unit;
+   struct i915_texture *tex;
+
+
+   for (unit = 0; unit < I915_TEX_UNITS; unit++) {
+      if (enabled & (1 << unit)) {
+	 tex = i915_texture(i915->fragment_sampler_views[unit]->texture);
+	 i915->validation_buffers[i915->num_validation_buffers++] = tex->buffer;
+      }
    }
 }
 
+const static struct i915_tracked_hw_state hw_atoms[] = {
+   { "flush", NULL, emit_flush, I915_HW_FLUSH, 1 },
+   { "immediate", validate_immediate, NULL, I915_HW_IMMEDIATE },
+   { "static", validate_static, NULL, I915_HW_STATIC },
+   { "map", validate_map, NULL, I915_HW_MAP }
+};
 
-/**
- * Examine framebuffer state to determine width, height.
- */
 static boolean
-framebuffer_size(const struct pipe_framebuffer_state *fb,
-                 uint *width, uint *height)
+i915_validate_state(struct i915_context *i915, unsigned *batch_space)
 {
-   if (fb->cbufs[0]) {
-      *width = fb->cbufs[0]->width;
-      *height = fb->cbufs[0]->height;
-      return TRUE;
-   }
-   else if (fb->zsbuf) {
-      *width = fb->zsbuf->width;
-      *height = fb->zsbuf->height;
+   int i;
+
+   i915->num_validation_buffers = 0;
+   *batch_space = 0;
+
+   for (i = 0; i < Elements(hw_atoms); i++)
+      if ((i915->hardware_dirty & hw_atoms[i].dirty) && hw_atoms[i].validate) {
+	 hw_atoms[i].validate(i915);
+	 *batch_space += hw_atoms[i].batch_space;
+      }
+
+   if (i915->num_validation_buffers == 0)
       return TRUE;
-   }
-   else {
-      *width = *height = 0;
+
+   if (!i915_winsys_validate_buffers(i915->batch, i915->validation_buffers,
+				     i915->num_validation_buffers))
       return FALSE;
-   }
+
+   return TRUE;
 }
 
-static inline uint32_t
-buf_3d_tiling_bits(enum i915_winsys_buffer_tile tiling)
+static void
+emit_state(struct i915_context *i915)
 {
-         uint32_t tiling_bits = 0;
-
-         switch (tiling) {
-         case I915_TILE_Y:
-            tiling_bits |= BUF_3D_TILE_WALK_Y;
-         case I915_TILE_X:
-            tiling_bits |= BUF_3D_TILED_SURFACE;
-         case I915_TILE_NONE:
-            break;
-         }
+   int i;
 
-         return tiling_bits;
+   for (i = 0; i < Elements(hw_atoms); i++)
+      if ((i915->hardware_dirty & hw_atoms[i].dirty) && hw_atoms[i].emit)
+	 hw_atoms[i].emit(i915);
 }
 
 /* Push the state into the sarea and/or texture memory.
@@ -110,6 +142,7 @@ buf_3d_tiling_bits(enum i915_winsys_buffer_tile tiling)
 void
 i915_emit_hardware_state(struct i915_context *i915 )
 {
+   unsigned batch_space;
    /* XXX: there must be an easier way */
    const unsigned dwords = ( 14 + 
                              7 + 
@@ -135,14 +168,21 @@ i915_emit_hardware_state(struct i915_context *i915 )
    if (I915_DBG_ON(DBG_ATOMS))
       i915_dump_hardware_dirty(i915, __FUNCTION__);
 
-   if(!BEGIN_BATCH(dwords, relocs)) {
+   if (!i915_validate_state(i915, &batch_space)) {
+      FLUSH_BATCH(NULL);
+      assert(i915_validate_state(i915, &batch_space));
+   }
+
+   if(!BEGIN_BATCH(batch_space + dwords, relocs)) {
       FLUSH_BATCH(NULL);
-      assert(BEGIN_BATCH(dwords, relocs));
+      assert(i915_validate_state(i915, &batch_space));
+      assert(BEGIN_BATCH(batch_space + dwords, relocs));
    }
 
    save_ptr = (uintptr_t)i915->batch->ptr;
    save_relocs = i915->batch->relocs;
 
+   emit_state(i915);
    /* 14 dwords, 0 relocs */
    if (i915->hardware_dirty & I915_HW_INVARIANT)
    {
@@ -223,7 +263,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
    {
       int i;
       for (i = 0; i < I915_MAX_DYNAMIC; i++) {
-         if (i915->dynamic_dirty & (1 << i));
+         if (i915->dynamic_dirty & (1 << i))
             OUT_BATCH(i915->current.dynamic[i]);
       }
    }
@@ -233,64 +273,27 @@ i915_emit_hardware_state(struct i915_context *i915 )
    /* 8 dwords, 2 relocs */
    if (i915->hardware_dirty & I915_HW_STATIC)
    {
-      struct pipe_surface *cbuf_surface = i915->framebuffer.cbufs[0];
-      struct pipe_surface *depth_surface = i915->framebuffer.zsbuf;
-
-      if (cbuf_surface) {
-         struct i915_texture *tex = i915_texture(cbuf_surface->texture);
-         assert(tex);
-
+      if (i915->current.cbuf_bo) {
          OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
-
-         OUT_BATCH(BUF_3D_ID_COLOR_BACK |
-                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
-                   buf_3d_tiling_bits(tex->tiling));
-
-         OUT_RELOC(tex->buffer,
+         OUT_BATCH(i915->current.cbuf_flags);
+         OUT_RELOC(i915->current.cbuf_bo,
                    I915_USAGE_RENDER,
                    0);
       }
 
       /* What happens if no zbuf??
        */
-      if (depth_surface) {
-         struct i915_texture *tex = i915_texture(depth_surface->texture);
-         unsigned offset = i915_texture_offset(tex, depth_surface->u.tex.level,
-                                               depth_surface->u.tex.first_layer);
-         assert(tex);
-         assert(offset == 0);
-
+      if (i915->current.depth_bo) {
          OUT_BATCH(_3DSTATE_BUF_INFO_CMD);
-
-         assert(tex);
-         OUT_BATCH(BUF_3D_ID_DEPTH |
-                   BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
-                   buf_3d_tiling_bits(tex->tiling));
-
-         OUT_RELOC(tex->buffer,
+         OUT_BATCH(i915->current.depth_flags);
+         OUT_RELOC(i915->current.depth_bo,
                    I915_USAGE_RENDER,
                    0);
       }
 
       {
-         unsigned cformat, zformat = 0;
-
-         if (cbuf_surface)
-            cformat = cbuf_surface->format;
-         else
-            cformat = PIPE_FORMAT_B8G8R8A8_UNORM; /* arbitrary */
-         cformat = translate_format(cformat);
-
-         if (depth_surface) 
-            zformat = translate_depth_format( i915->framebuffer.zsbuf->format );
-
          OUT_BATCH(_3DSTATE_DST_BUF_VARS_CMD);
-         OUT_BATCH(DSTORG_HORT_BIAS(0x8) | /* .5 */
-                   DSTORG_VERT_BIAS(0x8) | /* .5 */
-                   LOD_PRECLAMP_OGL |
-                   TEX_DEFAULT_COLOR_OGL |
-                   cformat |
-                   zformat );
+         OUT_BATCH(i915->current.dst_buf_vars);
       }
    }
 #endif
@@ -362,7 +365,7 @@ i915_emit_hardware_state(struct i915_context *i915 )
          uint i;
 
          OUT_BATCH( _3DSTATE_PIXEL_SHADER_CONSTANTS | (nr * 4) );
-         OUT_BATCH( (1 << (nr - 1)) | ((1 << (nr - 1)) - 1) );
+	 OUT_BATCH((1 << nr) - 1);
 
          for (i = 0; i < nr; i++) {
             const uint *c;
@@ -411,31 +414,13 @@ i915_emit_hardware_state(struct i915_context *i915 )
    /* 6 dwords, 0 relocs */
    if (i915->hardware_dirty & I915_HW_STATIC)
    {
-      uint w, h;
-      struct pipe_surface *cbuf_surface = i915->framebuffer.cbufs[0];
-      struct i915_texture *tex = i915_texture(cbuf_surface->texture);
-      unsigned x, y;
-      int layer;
-      uint32_t draw_offset;
-      boolean ret;
-
-      ret = framebuffer_size(&i915->framebuffer, &w, &h);
-      assert(ret);
-
-      layer = cbuf_surface->u.tex.first_layer;
-
-      x = tex->image_offset[cbuf_surface->u.tex.level][layer].nblocksx;
-      y = tex->image_offset[cbuf_surface->u.tex.level][layer].nblocksy;
-
-      draw_offset = x | (y << 16);
-
       /* XXX flush only required when the draw_offset changes! */
       OUT_BATCH(MI_FLUSH | INHIBIT_FLUSH_RENDER_CACHE);
       OUT_BATCH(_3DSTATE_DRAW_RECT_CMD);
       OUT_BATCH(DRAW_RECT_DIS_DEPTH_OFS);
-      OUT_BATCH(draw_offset);
-      OUT_BATCH((w - 1 + x) | ((h - 1 + y) << 16));
-      OUT_BATCH(draw_offset);
+      OUT_BATCH(i915->current.draw_offset);
+      OUT_BATCH(i915->current.draw_size);
+      OUT_BATCH(i915->current.draw_offset);
    }
 #endif
 
diff --git a/src/gallium/drivers/i915/i915_state_static.c b/src/gallium/drivers/i915/i915_state_static.c
index dc9a4c1e2fd..97044499990 100644
--- a/src/gallium/drivers/i915/i915_state_static.c
+++ b/src/gallium/drivers/i915/i915_state_static.c
@@ -27,17 +27,151 @@
 #include "i915_reg.h"
 #include "i915_context.h"
 #include "i915_state.h"
+#include "i915_resource.h"
 
 
 
 /***********************************************************************
  * Update framebuffer state
  */
+static unsigned translate_format(enum pipe_format format)
+{
+   switch (format) {
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      return COLOR_BUF_ARGB8888;
+   case PIPE_FORMAT_B5G6R5_UNORM:
+      return COLOR_BUF_RGB565;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+static unsigned translate_depth_format(enum pipe_format zformat)
+{
+   switch (zformat) {
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24_UNORM_S8_USCALED:
+      return DEPTH_FRMT_24_FIXED_8_OTHER;
+   case PIPE_FORMAT_Z16_UNORM:
+      return DEPTH_FRMT_16_FIXED;
+   default:
+      assert(0);
+      return 0;
+   }
+}
+
+static inline uint32_t
+buf_3d_tiling_bits(enum i915_winsys_buffer_tile tiling)
+{
+   uint32_t tiling_bits = 0; /* accumulated BUF_3D_* tiling flags; stays 0 for I915_TILE_NONE */
+
+   switch (tiling) {
+   case I915_TILE_Y:
+      tiling_bits |= BUF_3D_TILE_WALK_Y; /* fall through: Y tiling also gets BUF_3D_TILED_SURFACE */
+   case I915_TILE_X:
+      tiling_bits |= BUF_3D_TILED_SURFACE; /* fall through to the shared break */
+   case I915_TILE_NONE:
+      break;
+   }
+
+   return tiling_bits;
+}
+
+/**
+ * Examine framebuffer state to determine width, height.
+ */
+static boolean
+framebuffer_size(const struct pipe_framebuffer_state *fb,
+                 uint *width, uint *height)
+{
+   if (fb->cbufs[0]) {
+      *width = fb->cbufs[0]->width;
+      *height = fb->cbufs[0]->height;
+      return TRUE;
+   }
+   else if (fb->zsbuf) {
+      *width = fb->zsbuf->width;
+      *height = fb->zsbuf->height;
+      return TRUE;
+   }
+   else {
+      *width = *height = 0;
+      return FALSE;
+   }
+}
+
 static void update_framebuffer(struct i915_context *i915)
 {
-   /* HW emit currently references framebuffer state directly:
+   struct pipe_surface *cbuf_surface = i915->framebuffer.cbufs[0];
+   struct pipe_surface *depth_surface = i915->framebuffer.zsbuf;
+   unsigned cformat, zformat;
+   unsigned x, y, w, h;
+   int layer;
+   uint32_t draw_offset;
+   boolean ret;
+
+   if (cbuf_surface) {
+      struct i915_texture *tex = i915_texture(cbuf_surface->texture);
+      assert(tex);
+
+      i915->current.cbuf_bo = tex->buffer;
+      i915->current.cbuf_flags = BUF_3D_ID_COLOR_BACK |
+                                 BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
+                                 buf_3d_tiling_bits(tex->tiling);
+      cformat = cbuf_surface->format;
+
+      layer = cbuf_surface->u.tex.first_layer;
+
+      x = tex->image_offset[cbuf_surface->u.tex.level][layer].nblocksx;
+      y = tex->image_offset[cbuf_surface->u.tex.level][layer].nblocksy;
+   } else {
+      i915->current.cbuf_bo = NULL;
+      cformat = PIPE_FORMAT_B8G8R8A8_UNORM; /* arbitrary */
+      x = y = 0;
+   }
+   cformat = translate_format(cformat);
+
+   /* What happens if no zbuf??
     */
+   if (depth_surface) {
+      struct i915_texture *tex = i915_texture(depth_surface->texture);
+      unsigned offset = i915_texture_offset(tex, depth_surface->u.tex.level,
+                                            depth_surface->u.tex.first_layer);
+      assert(tex);
+      assert(offset == 0);
+
+      i915->current.depth_bo = tex->buffer;
+      i915->current.depth_flags = BUF_3D_ID_DEPTH |
+                                  BUF_3D_PITCH(tex->stride) |  /* pitch in bytes */
+                                  buf_3d_tiling_bits(tex->tiling);
+      zformat = translate_depth_format(depth_surface->format);
+   } else {
+      i915->current.depth_bo = NULL;
+      zformat = 0;
+   }
+
+   i915->current.dst_buf_vars = DSTORG_HORT_BIAS(0x8) | /* .5 */
+                                DSTORG_VERT_BIAS(0x8) | /* .5 */
+                                LOD_PRECLAMP_OGL |
+                                TEX_DEFAULT_COLOR_OGL |
+                                cformat |
+                                zformat;
+
+   /* drawing rect calculations */
+   draw_offset = x | (y << 16);
+   ret = framebuffer_size(&i915->framebuffer, &w, &h);
+   assert(ret);
+   if (i915->current.draw_offset != draw_offset) {
+      i915->current.draw_offset = draw_offset;
+      /* XXX: only emit flush on change and not always in emit */
+   }
+   i915->current.draw_size = (w - 1 + x) | ((h - 1 + y) << 16);
+
    i915->hardware_dirty |= I915_HW_STATIC;
+
+   /* flush the cache in case we sample from the old renderbuffers */
+   i915_set_flush_dirty(i915, I915_FLUSH_CACHE);
 }
 
 struct i915_tracked_state i915_hw_framebuffer = {
diff --git a/src/gallium/drivers/i915/i915_winsys.h b/src/gallium/drivers/i915/i915_winsys.h
index e915a886c9b..4ac2f5b9777 100644
--- a/src/gallium/drivers/i915/i915_winsys.h
+++ b/src/gallium/drivers/i915/i915_winsys.h
@@ -95,6 +95,18 @@ struct i915_winsys {
       (*batchbuffer_create)(struct i915_winsys *iws);
 
    /**
+    * Validate buffers for usage in this batchbuffer.
+    * Does space-checking and assorted other book-keeping.
+    *
+    * @batch
+    * @buffers array of buffers to validate
+    * @num_of_buffers size of the passed array
+    */
+   boolean (*validate_buffers)(struct i915_winsys_batchbuffer *batch,
+	 		       struct i915_winsys_buffer **buffers,
+			       int num_of_buffers);
+
+   /**
     * Emit a relocation to a buffer.
     * Target position in batchbuffer is the same as ptr.
     *
diff --git a/src/gallium/drivers/llvmpipe/lp_fence.c b/src/gallium/drivers/llvmpipe/lp_fence.c
index 3a55e76bc35..a21a3c74484 100644
--- a/src/gallium/drivers/llvmpipe/lp_fence.c
+++ b/src/gallium/drivers/llvmpipe/lp_fence.c
@@ -47,6 +47,9 @@ lp_fence_create(unsigned rank)
    static int fence_id;
    struct lp_fence *fence = CALLOC_STRUCT(lp_fence);
 
+   if (!fence)
+      return NULL;
+
    pipe_reference_init(&fence->reference, 1);
 
    pipe_mutex_init(fence->mutex);
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 21e8012d46a..2c32aa93cdf 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -278,6 +278,11 @@ llvmpipe_is_format_supported( struct pipe_screen *_screen,
       return util_format_s3tc_enabled;
    }
 
+   /* u_format doesn't support RGTC yet */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+      return FALSE;
+   }
+
    /*
     * Everything else should be supported by u_format.
     */
diff --git a/src/gallium/drivers/r300/r300_chipset.c b/src/gallium/drivers/r300/r300_chipset.c
index 1968d0feb35..990acea9f44 100644
--- a/src/gallium/drivers/r300/r300_chipset.c
+++ b/src/gallium/drivers/r300/r300_chipset.c
@@ -255,8 +255,6 @@ void r300_parse_chipset(struct r300_capabilities* caps)
             caps->family = CHIP_FAMILY_RS690;
             caps->has_tcl = FALSE;
             caps->is_r400 = TRUE;
-            caps->hiz_ram = R300_HIZ_LIMIT;
-            caps->zmask_ram = PIPE_ZMASK_SIZE;
             break;
 
         case 0x793F:
@@ -265,8 +263,6 @@ void r300_parse_chipset(struct r300_capabilities* caps)
             caps->family = CHIP_FAMILY_RS600;
             caps->has_tcl = FALSE;
             caps->is_r400 = TRUE;
-            caps->hiz_ram = R300_HIZ_LIMIT;
-            caps->zmask_ram = PIPE_ZMASK_SIZE;
             break;
 
         case 0x796C:
@@ -276,8 +272,6 @@ void r300_parse_chipset(struct r300_capabilities* caps)
             caps->family = CHIP_FAMILY_RS740;
             caps->has_tcl = FALSE;
             caps->is_r400 = TRUE;
-            caps->hiz_ram = R300_HIZ_LIMIT;
-            caps->zmask_ram = PIPE_ZMASK_SIZE;
             break;
 
         case 0x7100:
diff --git a/src/gallium/drivers/r300/r300_context.c b/src/gallium/drivers/r300/r300_context.c
index 9f85bd4ce5f..d422ffe03f8 100644
--- a/src/gallium/drivers/r300/r300_context.c
+++ b/src/gallium/drivers/r300/r300_context.c
@@ -203,7 +203,7 @@ static boolean r300_setup_atoms(struct r300_context* r300)
     /* SC. */
     R300_INIT_ATOM(scissor_state, 3);
     /* GB, FG, GA, SU, SC, RB3D. */
-    R300_INIT_ATOM(invariant_state, 16 + (is_rv350 ? 4 : 0));
+    R300_INIT_ATOM(invariant_state, 18 + (is_rv350 ? 4 : 0));
     /* VAP. */
     R300_INIT_ATOM(viewport_state, 9);
     R300_INIT_ATOM(pvs_flush, 2);
@@ -353,6 +353,7 @@ static void r300_init_states(struct pipe_context *pipe)
         OUT_CB_REG(R300_SU_DEPTH_SCALE, 0x4B7FFFFF);
         OUT_CB_REG(R300_SU_DEPTH_OFFSET, 0);
         OUT_CB_REG(R300_SC_EDGERULE, 0x2DA49525);
+        OUT_CB_REG(R300_SC_SCREENDOOR, 0xffffff);
 
         if (r300->screen->caps.is_rv350) {
             OUT_CB_REG(R500_RB3D_DISCARD_SRC_PIXEL_LTE_THRESHOLD, 0x01010101);
diff --git a/src/gallium/drivers/r300/r300_context.h b/src/gallium/drivers/r300/r300_context.h
index 30073759476..e9c7d7bf63f 100644
--- a/src/gallium/drivers/r300/r300_context.h
+++ b/src/gallium/drivers/r300/r300_context.h
@@ -220,7 +220,7 @@ struct r300_vertex_stream_state {
 };
 
 struct r300_invariant_state {
-    uint32_t cb[20];
+    uint32_t cb[22];
 };
 
 struct r300_vap_invariant_state {
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 354144cac79..b97c45ac198 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -214,11 +214,18 @@ uint32_t r300_translate_texformat(enum pipe_format format,
     /* RGTC formats. */
     if (desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
         switch (format) {
-            case PIPE_FORMAT_RGTC1_UNORM:
             case PIPE_FORMAT_RGTC1_SNORM:
+                result |= sign_bit[0]; /* fall through: SNORM shares the UNORM setup below */
+            case PIPE_FORMAT_RGTC1_UNORM:
+                result &= ~(0xfff << 9); /* mask off swizzle */
+                result |= R300_TX_FORMAT_Y << R300_TX_FORMAT_R_SHIFT;
                 return R500_TX_FORMAT_ATI1N | result;
-            case PIPE_FORMAT_RGTC2_UNORM:
             case PIPE_FORMAT_RGTC2_SNORM:
+                result |= sign_bit[0] | sign_bit[1]; /* fall through: SNORM shares the UNORM setup below */
+            case PIPE_FORMAT_RGTC2_UNORM:
+                result &= ~(0xfff << 9); /* mask off swizzle */
+                result |= R300_TX_FORMAT_Y << R300_TX_FORMAT_R_SHIFT |
+                          R300_TX_FORMAT_X << R300_TX_FORMAT_G_SHIFT;
                 return R400_TX_FORMAT_ATI2N | result;
             default:
                 return ~0; /* Unsupported/unknown. */
diff --git a/src/gallium/drivers/r600/eg_asm.c b/src/gallium/drivers/r600/eg_asm.c
index 4f86e3b4c38..8cb417f9731 100644
--- a/src/gallium/drivers/r600/eg_asm.c
+++ b/src/gallium/drivers/r600/eg_asm.c
@@ -98,31 +98,9 @@ int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf)
 	return 0;
 }
 
-void eg_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
+void eg_cf_vtx(struct r600_vertex_element *ve)
 {
-	struct r600_pipe_state *rstate;
-	unsigned i = 0;
-
-	if (count > 8) {
-		bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
-		bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
-				S_SQ_CF_WORD1_BARRIER(1) |
-				S_SQ_CF_WORD1_COUNT(8 - 1);
-		bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1);
-		bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
-				S_SQ_CF_WORD1_BARRIER(1) |
-				S_SQ_CF_WORD1_COUNT(count - 8 - 1);
-	} else {
-		bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
-		bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
-				S_SQ_CF_WORD1_BARRIER(1) |
-				S_SQ_CF_WORD1_COUNT(count - 1);
-	}
-	bytecode[i++] = S_SQ_CF_WORD0_ADDR(0);
-	bytecode[i++] = S_SQ_CF_WORD1_CF_INST(EG_V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
-			S_SQ_CF_WORD1_BARRIER(1);
-
-	rstate = &ve->rstate;
+	struct r600_pipe_state *rstate = &ve->rstate;
 	rstate->id = R600_PIPE_STATE_FETCH_SHADER;
 	rstate->nregs = 0;
 	r600_pipe_state_add_reg(rstate, R_0288A8_SQ_PGM_RESOURCES_FS,
diff --git a/src/gallium/drivers/r600/r600.h b/src/gallium/drivers/r600/r600.h
index 64c52bca795..1b76f0098dd 100644
--- a/src/gallium/drivers/r600/r600.h
+++ b/src/gallium/drivers/r600/r600.h
@@ -118,10 +118,10 @@ unsigned r600_get_clock_crystal_freq(struct radeon *radeon);
 /* r600_bo.c */
 struct r600_bo;
 struct r600_bo *r600_bo(struct radeon *radeon,
-                        unsigned size, unsigned alignment,
-                        unsigned binding, unsigned usage);
+			unsigned size, unsigned alignment,
+			unsigned binding, unsigned usage);
 struct r600_bo *r600_bo_handle(struct radeon *radeon,
-			       unsigned handle, unsigned *array_mode);
+				unsigned handle, unsigned *array_mode);
 void *r600_bo_map(struct radeon *radeon, struct r600_bo *bo, unsigned usage, void *ctx);
 void r600_bo_unmap(struct radeon *radeon, struct r600_bo *bo);
 void r600_bo_reference(struct radeon *radeon, struct r600_bo **dst,
diff --git a/src/gallium/drivers/r600/r600_asm.c b/src/gallium/drivers/r600/r600_asm.c
index 1393df88757..8006e9b9a58 100644
--- a/src/gallium/drivers/r600/r600_asm.c
+++ b/src/gallium/drivers/r600/r600_asm.c
@@ -86,6 +86,7 @@ static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_CLAMPED:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
+		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
 		case V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
 			return 1;
@@ -135,6 +136,7 @@ static inline unsigned int r600_bc_get_num_operands(struct r600_bc *bc, struct r
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_RECIPSQRT_IEEE:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT_FLOOR:
+		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_SIN:
 		case EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_COS:
 			return 1;
@@ -1441,7 +1443,8 @@ static int r600_bc_vtx_build(struct r600_bc *bc, struct r600_bc_vtx *vtx, unsign
 				S_SQ_VTX_WORD1_FORMAT_COMP_ALL(vtx->format_comp_all) |
 				S_SQ_VTX_WORD1_SRF_MODE_ALL(vtx->srf_mode_all) |
 				S_SQ_VTX_WORD1_GPR_DST_GPR(vtx->dst_gpr);
-	bc->bytecode[id++] = S_SQ_VTX_WORD2_MEGA_FETCH(1);
+	bc->bytecode[id++] = S_SQ_VTX_WORD2_OFFSET(vtx->offset) |
+				S_SQ_VTX_WORD2_MEGA_FETCH(1);
 	bc->bytecode[id++] = 0;
 	return 0;
 }
@@ -2778,12 +2781,13 @@ void r600_bc_dump(struct r600_bc *bc)
 			fprintf(stderr, "SEL_Z:%d ", vtx->dst_sel_z);
 			fprintf(stderr, "SEL_W:%d) ", vtx->dst_sel_w);
 			fprintf(stderr, "USE_CONST_FIELDS:%d ", vtx->use_const_fields);
-			fprintf(stderr, "DATA_FORMAT:%d ", vtx->data_format);
-			fprintf(stderr, "NUM_FORMAT_ALL:%d ", vtx->num_format_all);
-			fprintf(stderr, "FORMAT_COMP_ALL:%d ", vtx->format_comp_all);
-			fprintf(stderr, "SRF_MODE_ALL:%d\n", vtx->srf_mode_all);
+			fprintf(stderr, "FORMAT(DATA:%d ", vtx->data_format);
+			fprintf(stderr, "NUM:%d ", vtx->num_format_all);
+			fprintf(stderr, "COMP:%d ", vtx->format_comp_all);
+			fprintf(stderr, "MODE:%d)\n", vtx->srf_mode_all);
 			id++;
-			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
+			fprintf(stderr, "%04d %08X   ", id, bc->bytecode[id]);
+			fprintf(stderr, "OFFSET:%d\n", vtx->offset);
 			//TODO
 			id++;
 			fprintf(stderr, "%04d %08X   \n", id, bc->bytecode[id]);
@@ -2794,29 +2798,9 @@ void r600_bc_dump(struct r600_bc *bc)
 	fprintf(stderr, "--------------------------------------\n");
 }
 
-static void r600_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count)
+static void r600_cf_vtx(struct r600_vertex_element *ve)
 {
 	struct r600_pipe_state *rstate;
-	unsigned i = 0;
-
-	if (count > 8) {
-		bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
-		bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
-						S_SQ_CF_WORD1_BARRIER(1) |
-						S_SQ_CF_WORD1_COUNT(8 - 1);
-		bytecode[i++] = S_SQ_CF_WORD0_ADDR(40 >> 1);
-		bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
-						S_SQ_CF_WORD1_BARRIER(1) |
-						S_SQ_CF_WORD1_COUNT(count - 8 - 1);
-	} else {
-		bytecode[i++] = S_SQ_CF_WORD0_ADDR(8 >> 1);
-		bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_VTX) |
-						S_SQ_CF_WORD1_BARRIER(1) |
-						S_SQ_CF_WORD1_COUNT(count - 1);
-	}
-	bytecode[i++] = S_SQ_CF_WORD0_ADDR(0);
-	bytecode[i++] = S_SQ_CF_WORD1_CF_INST(V_SQ_CF_WORD1_SQ_CF_INST_RETURN) |
-			S_SQ_CF_WORD1_BARRIER(1);
 
 	rstate = &ve->rstate;
 	rstate->id = R600_PIPE_STATE_FETCH_SHADER;
@@ -2962,37 +2946,19 @@ out_unknown:
 
 int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, struct r600_vertex_element *ve)
 {
-	unsigned ndw, i;
-	u32 *bytecode;
-	unsigned fetch_resource_start = 0, format, num_format, format_comp;
+	static int dump_shaders = -1;
+
+	struct r600_bc bc;
+	struct r600_bc_vtx vtx;
 	struct pipe_vertex_element *elements = ve->elements;
 	const struct util_format_description *desc;
-
-	/* 2 dwords for cf aligned to 4 + 4 dwords per input */
-	ndw = 8 + ve->count * 4;
-	ve->fs_size = ndw * 4;
-
-	/* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
-	ve->fetch_shader = r600_bo(rctx->radeon, ndw*4, 256, PIPE_BIND_VERTEX_BUFFER, 0);
-	if (ve->fetch_shader == NULL) {
-		return -ENOMEM;
-	}
-
-	bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
-	if (bytecode == NULL) {
-		r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
-		return -ENOMEM;
-	}
-
-	if (rctx->family >= CHIP_CEDAR) {
-		eg_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
-	} else {
-		r600_cf_vtx(ve, &bytecode[0], (ndw - 8) / 4);
-		fetch_resource_start = 160;
-	}
+	unsigned fetch_resource_start = rctx->family >= CHIP_CEDAR ? 0 : 160;
+	unsigned format, num_format, format_comp;
+	u32 *bytecode;
+	int i, r;
 
 	/* vertex elements offset need special handling, if offset is bigger
-	 * than what we can put in fetch instruction then we need to alterate
+	 * than what we can put in fetch instruction then we need to alter
 	 * the vertex resource offset. In such case in order to simplify code
 	 * we will bound one resource per elements. It's a worst case scenario.
 	 */
@@ -3003,40 +2969,155 @@ int r600_vertex_elements_build_fetch_shader(struct r600_pipe_context *rctx, stru
 		}
 	}
 
+	memset(&bc, 0, sizeof(bc));
+	r = r600_bc_init(&bc, r600_get_family(rctx->radeon));
+	if (r)
+		return r;
+
+	for (i = 0; i < ve->count; i++) {
+		if (elements[i].instance_divisor > 1) {
+			struct r600_bc_alu alu;
+
+			memset(&alu, 0, sizeof(alu));
+			alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
+			alu.src[0].sel = 0;
+			alu.src[0].chan = 3;
+
+			alu.dst.sel = i + 1;
+			alu.dst.chan = 3;
+			alu.dst.write = 1;
+			alu.last = 1;
+
+			if ((r = r600_bc_add_alu(&bc, &alu))) {
+				r600_bc_clear(&bc);
+				return r;
+			}
+
+			memset(&alu, 0, sizeof(alu));
+			alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_MUL);
+			alu.src[0].sel = i + 1;
+			alu.src[0].chan = 3;
+
+			alu.src[1].sel = V_SQ_ALU_SRC_LITERAL;
+			alu.src[1].value = fui(1.0f / (float)elements[i].instance_divisor);
+
+			alu.dst.sel = i + 1;
+			alu.dst.chan = 3;
+			alu.dst.write = 1;
+			alu.last = 1;
+
+			if ((r = r600_bc_add_alu(&bc, &alu))) {
+				r600_bc_clear(&bc);
+				return r;
+			}
+
+			memset(&alu, 0, sizeof(alu));
+			alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC);
+			alu.src[0].sel = i + 1;
+			alu.src[0].chan = 3;
+
+			alu.dst.sel = i + 1;
+			alu.dst.chan = 3;
+			alu.dst.write = 1;
+			alu.last = 1;
+
+			if ((r = r600_bc_add_alu(&bc, &alu))) {
+				r600_bc_clear(&bc);
+				return r;
+			}
+
+			memset(&alu, 0, sizeof(alu));
+			alu.inst = BC_INST(&bc, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_FLT_TO_INT);
+			alu.src[0].sel = i + 1;
+			alu.src[0].chan = 3;
+
+			alu.dst.sel = i + 1;
+			alu.dst.chan = 3;
+			alu.dst.write = 1;
+			alu.last = 1;
+
+			if ((r = r600_bc_add_alu(&bc, &alu))) {
+				r600_bc_clear(&bc);
+				return r;
+			}
+		}
+	}
+
 	for (i = 0; i < ve->count; i++) {
 		unsigned vbuffer_index;
 		r600_vertex_data_type(ve->elements[i].src_format, &format, &num_format, &format_comp);
 		desc = util_format_description(ve->elements[i].src_format);
 		if (desc == NULL) {
+			r600_bc_clear(&bc);
 			R600_ERR("unknown format %d\n", ve->elements[i].src_format);
-			r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
 			return -EINVAL;
 		}
 
 		/* see above for vbuffer_need_offset explanation */
 		vbuffer_index = elements[i].vertex_buffer_index;
-		if (ve->vbuffer_need_offset) {
-			bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(i + fetch_resource_start);
-		} else {
-			bytecode[8 + i * 4 + 0] = S_SQ_VTX_WORD0_BUFFER_ID(vbuffer_index + fetch_resource_start);
-		}
-		bytecode[8 + i * 4 + 0] |= S_SQ_VTX_WORD0_SRC_GPR(0) |
-					S_SQ_VTX_WORD0_SRC_SEL_X(0) |
-					S_SQ_VTX_WORD0_MEGA_FETCH_COUNT(0x1F);
-		bytecode[8 + i * 4 + 1] = S_SQ_VTX_WORD1_DST_SEL_X(desc->swizzle[0]) |
-					S_SQ_VTX_WORD1_DST_SEL_Y(desc->swizzle[1]) |
-					S_SQ_VTX_WORD1_DST_SEL_Z(desc->swizzle[2]) |
-					S_SQ_VTX_WORD1_DST_SEL_W(desc->swizzle[3]) |
-					S_SQ_VTX_WORD1_USE_CONST_FIELDS(0) |
-					S_SQ_VTX_WORD1_DATA_FORMAT(format) |
-					S_SQ_VTX_WORD1_NUM_FORMAT_ALL(num_format) |
-					S_SQ_VTX_WORD1_FORMAT_COMP_ALL(format_comp) |
-					S_SQ_VTX_WORD1_SRF_MODE_ALL(1) |
-					S_SQ_VTX_WORD1_GPR_DST_GPR(i + 1);
-		bytecode[8 + i * 4 + 2] = S_SQ_VTX_WORD2_OFFSET(elements[i].src_offset) |
-					S_SQ_VTX_WORD2_MEGA_FETCH(1);
-		bytecode[8 + i * 4 + 3] = 0;
+		memset(&vtx, 0, sizeof(vtx));
+		vtx.buffer_id = (ve->vbuffer_need_offset ? i : vbuffer_index) + fetch_resource_start;
+		vtx.fetch_type = elements[i].instance_divisor ? 1 : 0;
+		vtx.src_gpr = elements[i].instance_divisor > 1 ? i + 1 : 0;
+		vtx.src_sel_x = elements[i].instance_divisor ? 3 : 0;
+		vtx.mega_fetch_count = 16;
+		vtx.dst_gpr = i + 1;
+		vtx.dst_sel_x = desc->swizzle[0];
+		vtx.dst_sel_y = desc->swizzle[1];
+		vtx.dst_sel_z = desc->swizzle[2];
+		vtx.dst_sel_w = desc->swizzle[3];
+		vtx.data_format = format;
+		vtx.num_format_all = num_format;
+		vtx.format_comp_all = format_comp;
+		vtx.srf_mode_all = 1;
+		vtx.offset = elements[i].src_offset;
+
+		if ((r = r600_bc_add_vtx(&bc, &vtx))) {
+			r600_bc_clear(&bc);
+			return r;
+		}
+	}
+
+	r600_bc_add_cfinst(&bc, BC_INST(&bc, V_SQ_CF_WORD1_SQ_CF_INST_RETURN));
+
+	/* build before allocating so a build failure has no buffer object to leak */
+	if ((r = r600_bc_build(&bc))) {
+		r600_bc_clear(&bc);
+		return r;
+	}
+	/* use PIPE_BIND_VERTEX_BUFFER so we use the cache buffer manager */
+	ve->fs_size = bc.ndw*4;
+	ve->fetch_shader = r600_bo(rctx->radeon, ve->fs_size, 256, PIPE_BIND_VERTEX_BUFFER, 0);
+	if (ve->fetch_shader == NULL) {
+		r600_bc_clear(&bc);
+		return -ENOMEM;
 	}
+
+	if (dump_shaders == -1)
+		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
+
+	if (dump_shaders) {
+		fprintf(stderr, "--------------------------------------------------------------\n");
+		r600_bc_dump(&bc);
+		fprintf(stderr, "______________________________________________________________\n");
+	}
+
+	bytecode = r600_bo_map(rctx->radeon, ve->fetch_shader, 0, NULL);
+	if (bytecode == NULL) {
+		r600_bc_clear(&bc);
+		r600_bo_reference(rctx->radeon, &ve->fetch_shader, NULL);
+		return -ENOMEM;
+	}
+
+	memcpy(bytecode, bc.bytecode, ve->fs_size);
+
 	r600_bo_unmap(rctx->radeon, ve->fetch_shader);
+	r600_bc_clear(&bc);
+
+	if (rctx->family >= CHIP_CEDAR)
+		eg_cf_vtx(ve);
+	else
+		r600_cf_vtx(ve);
+
 	return 0;
 }
diff --git a/src/gallium/drivers/r600/r600_asm.h b/src/gallium/drivers/r600/r600_asm.h
index 453c29790c1..dbd1e204b49 100644
--- a/src/gallium/drivers/r600/r600_asm.h
+++ b/src/gallium/drivers/r600/r600_asm.h
@@ -103,6 +103,7 @@ struct r600_bc_vtx {
 	unsigned			num_format_all;
 	unsigned			format_comp_all;
 	unsigned			srf_mode_all;
+	unsigned			offset;
 };
 
 struct r600_bc_output {
@@ -187,7 +188,7 @@ struct r600_bc {
 
 /* eg_asm.c */
 int eg_bc_cf_build(struct r600_bc *bc, struct r600_bc_cf *cf);
-void eg_cf_vtx(struct r600_vertex_element *ve, u32 *bytecode, unsigned count);
+void eg_cf_vtx(struct r600_vertex_element *ve);
 
 /* r600_asm.c */
 int r600_bc_init(struct r600_bc *bc, enum radeon_family family);
diff --git a/src/gallium/drivers/r600/r600_blit.c b/src/gallium/drivers/r600/r600_blit.c
index 9865ea17ae5..04408a5cc8e 100644
--- a/src/gallium/drivers/r600/r600_blit.c
+++ b/src/gallium/drivers/r600/r600_blit.c
@@ -225,7 +225,7 @@ struct texture_orig_info {
 	unsigned height0;
 };
 
-static void r600_s3tc_to_blittable(struct pipe_resource *tex,
+static void r600_compressed_to_blittable(struct pipe_resource *tex,
 				   unsigned level,
 				   struct texture_orig_info *orig)
 {
@@ -253,7 +253,7 @@ static void r600_s3tc_to_blittable(struct pipe_resource *tex,
 
 }
 
-static void r600_reset_blittable_to_s3tc(struct pipe_resource *tex,
+static void r600_reset_blittable_to_compressed(struct pipe_resource *tex,
 					 unsigned level,
 					 struct texture_orig_info *orig)
 {
@@ -282,13 +282,13 @@ static void r600_resource_copy_region(struct pipe_context *ctx,
 
 	restore_orig[0] = restore_orig[1] = FALSE;
 
-	if (util_format_is_s3tc(src->format)) {
-		r600_s3tc_to_blittable(src, src_level, &orig_info[0]);
+	if (util_format_is_compressed(src->format)) {
+		r600_compressed_to_blittable(src, src_level, &orig_info[0]);
 		restore_orig[0] = TRUE;
 	}
 
-	if (util_format_is_s3tc(dst->format)) {
-		r600_s3tc_to_blittable(dst, dst_level, &orig_info[1]);
+	if (util_format_is_compressed(dst->format)) {
+		r600_compressed_to_blittable(dst, dst_level, &orig_info[1]);
 		restore_orig[1] = TRUE;
 		/* translate the dst box as well */
 		dstx = util_format_get_nblocksx(orig_info[1].format, dstx);
@@ -299,10 +299,10 @@ static void r600_resource_copy_region(struct pipe_context *ctx,
 			    src, src_level, src_box);
 
 	if (restore_orig[0])
-		r600_reset_blittable_to_s3tc(src, src_level, &orig_info[0]);
+		r600_reset_blittable_to_compressed(src, src_level, &orig_info[0]);
 
 	if (restore_orig[1])
-		r600_reset_blittable_to_s3tc(dst, dst_level, &orig_info[1]);
+		r600_reset_blittable_to_compressed(dst, dst_level, &orig_info[1]);
 }
 
 void r600_init_blit_functions(struct r600_pipe_context *rctx)
diff --git a/src/gallium/drivers/r600/r600_buffer.c b/src/gallium/drivers/r600/r600_buffer.c
index 0c5d7133c7a..2363cd1ebc5 100644
--- a/src/gallium/drivers/r600/r600_buffer.c
+++ b/src/gallium/drivers/r600/r600_buffer.c
@@ -132,13 +132,13 @@ static void r600_transfer_destroy(struct pipe_context *ctx,
 }
 
 static void r600_buffer_transfer_inline_write(struct pipe_context *pipe,
-                                              struct pipe_resource *resource,
-                                              unsigned level,
-                                              unsigned usage,
-                                              const struct pipe_box *box,
-                                              const void *data,
-                                              unsigned stride,
-                                              unsigned layer_stride)
+						struct pipe_resource *resource,
+						unsigned level,
+						unsigned usage,
+						const struct pipe_box *box,
+						const void *data,
+						unsigned stride,
+						unsigned layer_stride)
 {
 	struct radeon *ws = (struct radeon*)pipe->winsys;
 	struct r600_resource_buffer *rbuffer = r600_buffer(resource);
@@ -224,7 +224,7 @@ struct pipe_resource *r600_user_buffer_create(struct pipe_screen *screen,
 	rbuffer->r.b.b.b.depth0 = 1;
 	rbuffer->r.b.b.b.array_size = 1;
 	rbuffer->r.b.b.b.flags = 0;
-        rbuffer->r.b.user_ptr = ptr;
+	rbuffer->r.b.user_ptr = ptr;
 	rbuffer->r.bo = NULL;
 	rbuffer->r.bo_size = 0;
 	return &rbuffer->r.b.b.b;
diff --git a/src/gallium/drivers/r600/r600_pipe.c b/src/gallium/drivers/r600/r600_pipe.c
index 34094001b75..3fd6668f718 100644
--- a/src/gallium/drivers/r600/r600_pipe.c
+++ b/src/gallium/drivers/r600/r600_pipe.c
@@ -77,8 +77,7 @@ static void r600_flush(struct pipe_context *ctx, unsigned flags,
 	u_upload_flush(rctx->vbuf_mgr->uploader);
 }
 
-static void r600_update_num_contexts(struct r600_screen *rscreen,
-                                     int diff)
+static void r600_update_num_contexts(struct r600_screen *rscreen, int diff)
 {
 	pipe_mutex_lock(rscreen->mutex_num_contexts);
 	if (diff > 0) {
@@ -286,13 +285,13 @@ static int r600_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
 	case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
 	case PIPE_CAP_DEPTH_CLAMP:
 	case PIPE_CAP_SHADER_STENCIL_EXPORT:
+	case PIPE_CAP_INSTANCED_DRAWING:
 		return 1;
 
 	/* Unsupported features (boolean caps). */
 	case PIPE_CAP_STREAM_OUTPUT:
 	case PIPE_CAP_PRIMITIVE_RESTART:
 	case PIPE_CAP_INDEP_BLEND_FUNC: /* FIXME allow this */
-	case PIPE_CAP_INSTANCED_DRAWING:
 		return 0;
 
 	case PIPE_CAP_ARRAY_TEXTURES:
diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c
index 240c8f1ffd0..0b4dc75e584 100644
--- a/src/gallium/drivers/r600/r600_shader.c
+++ b/src/gallium/drivers/r600/r600_shader.c
@@ -241,10 +241,10 @@ int r600_pipe_shader_create(struct pipe_context *ctx, struct r600_pipe_shader *s
 	struct r600_pipe_context *rctx = (struct r600_pipe_context *)ctx;
 	int r;
 
-        /* Would like some magic "get_bool_option_once" routine.
-         */
-        if (dump_shaders == -1)
-                dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
+	/* Would like some magic "get_bool_option_once" routine.
+	*/
+	if (dump_shaders == -1)
+		dump_shaders = debug_get_bool_option("R600_DUMP_SHADERS", FALSE);
 
 	if (dump_shaders) {
 		fprintf(stderr, "--------------------------------------------------------------\n");
@@ -420,6 +420,7 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 {
 	struct tgsi_full_declaration *d = &ctx->parse.FullToken.FullDeclaration;
 	unsigned i;
+	int r;
 
 	switch (d->Declaration.File) {
 	case TGSI_FILE_INPUT:
@@ -451,6 +452,26 @@ static int tgsi_declaration(struct r600_shader_ctx *ctx)
 	case TGSI_FILE_SAMPLER:
 	case TGSI_FILE_ADDRESS:
 		break;
+
+	case TGSI_FILE_SYSTEM_VALUE:
+		if (d->Semantic.Name == TGSI_SEMANTIC_INSTANCEID) {
+			struct r600_bc_alu alu;
+			memset(&alu, 0, sizeof(struct r600_bc_alu));
+			/* INT_TO_FLT in place: source and destination are both sel 0, chan 3 */
+			alu.inst = CTX_INST(V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_INT_TO_FLT);
+			alu.src[0].sel = 0;
+			alu.src[0].chan = 3; /* presumably the .w channel holding the instance id - TODO confirm */
+
+			alu.dst.sel = 0;
+			alu.dst.chan = 3;
+			alu.dst.write = 1;
+			alu.last = 1;
+
+			if ((r = r600_bc_add_alu(ctx->bc, &alu)))
+				return r;
+			break;
+		}
+		/* fall through: any other system value is rejected by the default case */
 	default:
 		R600_ERR("unsupported file %d declaration\n", d->Declaration.File);
 		return -EINVAL;
@@ -521,6 +542,7 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
 	r600_src->swizzle[3] = tgsi_src->Register.SwizzleW;
 	r600_src->neg = tgsi_src->Register.Negate;
 	r600_src->abs = tgsi_src->Register.Absolute;
+
 	if (tgsi_src->Register.File == TGSI_FILE_IMMEDIATE) {
 		int index;
 		if ((tgsi_src->Register.SwizzleX == tgsi_src->Register.SwizzleY) &&
@@ -535,6 +557,13 @@ static void tgsi_src(struct r600_shader_ctx *ctx,
 		index = tgsi_src->Register.Index;
 		r600_src->sel = V_SQ_ALU_SRC_LITERAL;
 		memcpy(r600_src->value, ctx->literals + index * 4, sizeof(r600_src->value));
+	} else if (tgsi_src->Register.File == TGSI_FILE_SYSTEM_VALUE) {
+		/* assume we want TGSI_SEMANTIC_INSTANCEID here */
+		r600_src->swizzle[0] = 3;
+		r600_src->swizzle[1] = 3;
+		r600_src->swizzle[2] = 3;
+		r600_src->swizzle[3] = 3;
+		r600_src->sel = 0;
 	} else {
 		if (tgsi_src->Register.Indirect)
 			r600_src->rel = V_SQ_REL_RELATIVE;
@@ -2858,7 +2887,7 @@ static struct r600_shader_tgsi_instruction r600_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CEIL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_I2F,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_NOT,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_trans_srcx_replicate},
+	{TGSI_OPCODE_TRUNC,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
 	{TGSI_OPCODE_SHL,	0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	/* gap */
 	{88,			0, V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
@@ -3016,7 +3045,7 @@ static struct r600_shader_tgsi_instruction eg_shader_tgsi_instruction[] = {
 	{TGSI_OPCODE_CEIL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_I2F,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	{TGSI_OPCODE_NOT,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
-	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_trans_srcx_replicate},
+	{TGSI_OPCODE_TRUNC,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_TRUNC, tgsi_op2},
 	{TGSI_OPCODE_SHL,	0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
 	/* gap */
 	{88,			0, EG_V_SQ_ALU_WORD1_OP2_SQ_OP2_INST_NOP, tgsi_unsupported},
diff --git a/src/gallium/drivers/r600/r600_state_common.c b/src/gallium/drivers/r600/r600_state_common.c
index 72707fbd8b8..3c072fe7ca9 100644
--- a/src/gallium/drivers/r600/r600_state_common.c
+++ b/src/gallium/drivers/r600/r600_state_common.c
@@ -299,13 +299,13 @@ void r600_spi_update(struct r600_pipe_context *rctx)
 			tmp |= S_028644_PT_SPRITE_TEX(1);
 		}
 
-                if (rctx->family < CHIP_CEDAR) {
-                    if (rshader->input[i].centroid)
-                            tmp |= S_028644_SEL_CENTROID(1);
+		if (rctx->family < CHIP_CEDAR) {
+			if (rshader->input[i].centroid)
+				tmp |= S_028644_SEL_CENTROID(1);
 
-                    if (rshader->input[i].interpolate == TGSI_INTERPOLATE_LINEAR)
-                            tmp |= S_028644_SEL_LINEAR(1);
-                }
+			if (rshader->input[i].interpolate == TGSI_INTERPOLATE_LINEAR)
+				tmp |= S_028644_SEL_LINEAR(1);
+		}
 
 		r600_pipe_state_add_reg(&rstate, R_028644_SPI_PS_INPUT_CNTL_0 + i * 4, tmp, 0xFFFFFFFF, NULL);
 	}
@@ -520,7 +520,7 @@ void r600_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
 	r600_context_pipe_state_set(&rctx->ctx, &vgt);
 
 	rdraw.vgt_num_indices = draw.info.count;
-	rdraw.vgt_num_instances = 1;
+	rdraw.vgt_num_instances = draw.info.instance_count;
 	rdraw.vgt_index_type = vgt_dma_index_type;
 	rdraw.vgt_draw_initiator = vgt_draw_initiator;
 	rdraw.indices = NULL;
diff --git a/src/gallium/drivers/r600/r600_texture.c b/src/gallium/drivers/r600/r600_texture.c
index 4c9d5609c06..048d0b61e3b 100644
--- a/src/gallium/drivers/r600/r600_texture.c
+++ b/src/gallium/drivers/r600/r600_texture.c
@@ -292,7 +292,7 @@ static boolean permit_hardware_blit(struct pipe_screen *screen,
 		bind = PIPE_BIND_RENDER_TARGET;
 
 	/* hackaround for S3TC */
-	if (util_format_is_s3tc(res->format))
+	if (util_format_is_compressed(res->format))
 		return TRUE;
 
 	if (!screen->is_format_supported(screen,
@@ -433,7 +433,7 @@ struct pipe_resource *r600_texture_create(struct pipe_screen *screen,
 	}
 
 	if (!(templ->flags & R600_RESOURCE_FLAG_TRANSFER) &&
-	    util_format_is_s3tc(templ->format))
+	    util_format_is_compressed(templ->format))
 		array_mode = V_038000_ARRAY_1D_TILED_THIN1;
 
 	return (struct pipe_resource *)r600_texture_create_object(screen, templ, array_mode,
@@ -887,12 +887,14 @@ uint32_t r600_translate_texformat(enum pipe_format format,
 			goto out_unknown;
 
 		switch (format) {
-		case PIPE_FORMAT_RGTC1_UNORM:
 		case PIPE_FORMAT_RGTC1_SNORM:
+			word4 |= sign_bit[0];
+		case PIPE_FORMAT_RGTC1_UNORM:
 			result = FMT_BC4;
 			goto out_word4;
-		case PIPE_FORMAT_RGTC2_UNORM:
 		case PIPE_FORMAT_RGTC2_SNORM:
+			word4 |= sign_bit[0] | sign_bit[1];
+		case PIPE_FORMAT_RGTC2_UNORM:
 			result = FMT_BC5;
 			goto out_word4;
 		default:
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index a06817c5735..603e1de7982 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -249,6 +249,11 @@ softpipe_is_format_supported( struct pipe_screen *screen,
       return util_format_s3tc_enabled;
    }
 
+   /* u_format doesn't implement RGTC yet */
+   if (format_desc->layout == UTIL_FORMAT_LAYOUT_RGTC) {
+	return FALSE;
+   }
+
    /*
     * Everything else should be supported by u_format.
     */
diff --git a/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c b/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
index afeab5eef42..7cc5af89639 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
+++ b/src/gallium/winsys/i915/drm/i915_drm_batchbuffer.c
@@ -5,6 +5,7 @@
 #include "i915_drm.h"
 #include "i915/i915_debug.h"
 #include <xf86drm.h>
+#include <stdio.h>
 
 #define BATCH_RESERVED 16
 
@@ -71,6 +72,26 @@ i915_drm_batchbuffer_create(struct i915_winsys *iws)
    return &batch->base;
 }
 
+/**
+ * Return TRUE when drm_intel_bufmgr_check_aperture_space() returns 0 for the
+ * batch bo plus the given buffers, FALSE otherwise.
+ */
+static boolean
+i915_drm_batchbuffer_validate_buffers(struct i915_winsys_batchbuffer *batch,
+				      struct i915_winsys_buffer **buffer,
+				      int num_of_buffers)
+{
+   struct i915_drm_batchbuffer *drm_batch = i915_drm_batchbuffer(batch);
+   drm_intel_bo *bos[num_of_buffers + 1];
+   int i;
+
+   /* bos[0] is the batch bo itself, so the count passed below includes it. */
+   bos[0] = drm_batch->bo;
+   for (i = 0; i < num_of_buffers; i++)
+      bos[i+1] = intel_bo(buffer[i]);
+   return drm_intel_bufmgr_check_aperture_space(bos, num_of_buffers + 1) == 0;
+}
+
 static int
 i915_drm_batchbuffer_reloc(struct i915_winsys_batchbuffer *ibatch,
                             struct i915_winsys_buffer *buffer,
@@ -169,6 +190,14 @@ i915_drm_batchbuffer_flush(struct i915_winsys_batchbuffer *ibatch,
       assert(ret == 0);
    }
 
+   if (i915_drm_winsys(ibatch->iws)->dump_raw_file) {
+      FILE *file = fopen(i915_drm_winsys(ibatch->iws)->dump_raw_file, "a");
+      if (file) {
+	 fwrite(batch->base.map, used, 1, file);
+	 fclose(file);
+      }
+   }
+
 #ifdef INTEL_RUN_SYNC
    drm_intel_bo_wait_rendering(batch->bo);
 #endif
@@ -202,6 +231,7 @@ i915_drm_batchbuffer_destroy(struct i915_winsys_batchbuffer *ibatch)
 void i915_drm_winsys_init_batchbuffer_functions(struct i915_drm_winsys *idws)
 {
    idws->base.batchbuffer_create = i915_drm_batchbuffer_create;
+   idws->base.validate_buffers = i915_drm_batchbuffer_validate_buffers;
    idws->base.batchbuffer_reloc = i915_drm_batchbuffer_reloc;
    idws->base.batchbuffer_flush = i915_drm_batchbuffer_flush;
    idws->base.batchbuffer_destroy = i915_drm_batchbuffer_destroy;
diff --git a/src/gallium/winsys/i915/drm/i915_drm_winsys.c b/src/gallium/winsys/i915/drm/i915_drm_winsys.c
index 2288b48b2bd..2c3b508d056 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_winsys.c
+++ b/src/gallium/winsys/i915/drm/i915_drm_winsys.c
@@ -72,6 +72,7 @@ i915_drm_winsys_create(int drmFD)
    drm_intel_bufmgr_gem_enable_fenced_relocs(idws->gem_manager);
 
    idws->dump_cmd = debug_get_bool_option("I915_DUMP_CMD", FALSE);
+   idws->dump_raw_file = debug_get_option("I915_DUMP_RAW_FILE", NULL);
    idws->send_cmd = !debug_get_bool_option("I915_NO_HW", FALSE);
 
    return &idws->base;
diff --git a/src/gallium/winsys/i915/drm/i915_drm_winsys.h b/src/gallium/winsys/i915/drm/i915_drm_winsys.h
index 0d74d0270c7..dae53c3e801 100644
--- a/src/gallium/winsys/i915/drm/i915_drm_winsys.h
+++ b/src/gallium/winsys/i915/drm/i915_drm_winsys.h
@@ -18,6 +18,7 @@ struct i915_drm_winsys
    struct i915_winsys base;
 
    boolean dump_cmd;
+   const char *dump_raw_file; /**< I915_DUMP_RAW_FILE option value, or NULL */
    boolean send_cmd;
 
    int fd; /**< Drm file discriptor */
diff --git a/src/gallium/winsys/i915/sw/i915_sw_batchbuffer.c b/src/gallium/winsys/i915/sw/i915_sw_batchbuffer.c
index 8085591c8eb..3d0c1fa6224 100644
--- a/src/gallium/winsys/i915/sw/i915_sw_batchbuffer.c
+++ b/src/gallium/winsys/i915/sw/i915_sw_batchbuffer.c
@@ -58,6 +58,14 @@ i915_sw_batchbuffer_create(struct i915_winsys *iws)
    return &batch->base;
 }
 
+static boolean
+i915_sw_batchbuffer_validate_buffers(struct i915_winsys_batchbuffer *batch,
+				     struct i915_winsys_buffer **buffer,
+				     int num_of_buffers)
+{
+   return TRUE;
+}
+
 static int
 i915_sw_batchbuffer_reloc(struct i915_winsys_batchbuffer *ibatch,
                           struct i915_winsys_buffer *buffer,
@@ -107,16 +115,16 @@ i915_sw_batchbuffer_flush(struct i915_winsys_batchbuffer *ibatch,
 
 #ifdef INTEL_ALWAYS_FLUSH
    /* MI_FLUSH | FLUSH_MAP_CACHE */
-   i915_winsys_batchbuffer_dword(ibatch, (0x4<<23)|(1<<0));
+   i915_winsys_batchbuffer_dword_unchecked(ibatch, (0x4<<23)|(1<<0));
    used += 4;
 #endif
 
    if ((used & 4) == 0) {
       /* MI_NOOP */
-      i915_winsys_batchbuffer_dword(ibatch, 0);
+      i915_winsys_batchbuffer_dword_unchecked(ibatch, 0);
    }
    /* MI_BATCH_BUFFER_END */
-   i915_winsys_batchbuffer_dword(ibatch, (0xA<<23));
+   i915_winsys_batchbuffer_dword_unchecked(ibatch, (0xA<<23));
 
    used = batch->base.ptr - batch->base.map;
    assert((used & 4) == 0);
@@ -146,6 +154,7 @@ i915_sw_batchbuffer_destroy(struct i915_winsys_batchbuffer *ibatch)
 void i915_sw_winsys_init_batchbuffer_functions(struct i915_sw_winsys *isws)
 {
    isws->base.batchbuffer_create = i915_sw_batchbuffer_create;
+   isws->base.validate_buffers = i915_sw_batchbuffer_validate_buffers;
    isws->base.batchbuffer_reloc = i915_sw_batchbuffer_reloc;
    isws->base.batchbuffer_flush = i915_sw_batchbuffer_flush;
    isws->base.batchbuffer_destroy = i915_sw_batchbuffer_destroy;
diff --git a/src/gallium/winsys/i915/sw/i915_sw_winsys.c b/src/gallium/winsys/i915/sw/i915_sw_winsys.c
index 058ddc44aaf..fc48da6fb92 100644
--- a/src/gallium/winsys/i915/sw/i915_sw_winsys.c
+++ b/src/gallium/winsys/i915/sw/i915_sw_winsys.c
@@ -50,7 +50,7 @@ i915_sw_winsys_create()
    isws->base.pci_id = deviceID;
    isws->max_batch_size = 16 * 4096;
 
-   isws->dump_cmd = debug_get_bool_option("INTEL_DUMP_CMD", FALSE);
+   isws->dump_cmd = debug_get_bool_option("I915_DUMP_CMD", FALSE);
 
    return &isws->base;
 }
diff --git a/src/gallium/winsys/sw/xlib/SConscript b/src/gallium/winsys/sw/xlib/SConscript
index f6c47411831..df01a9ec2bf 100644
--- a/src/gallium/winsys/sw/xlib/SConscript
+++ b/src/gallium/winsys/sw/xlib/SConscript
@@ -4,7 +4,7 @@
 
 Import('*')
 
-if env['platform'] in ('cygwin_nt-5.1', 'cygwin_nt-6.1', 'linux'):
+if env['platform'] in ('cygwin', 'linux'):
 
     env = env.Clone()
 
diff --git a/src/glsl/Makefile b/src/glsl/Makefile
index 876f0dfc2a5..df031d2d548 100644
--- a/src/glsl/Makefile
+++ b/src/glsl/Makefile
@@ -208,6 +208,6 @@ builtin_compiler: $(GLSL2_OBJECTS) $(OBJECTS) builtin_stubs.o
 
 builtin_function.cpp: builtins/profiles/* builtins/ir/* builtins/tools/generate_builtins.py builtins/tools/texture_builtins.py builtin_compiler
 	@echo Regenerating builtin_function.cpp...
-	$(PYTHON2) $(PYTHON_FLAGS) builtins/tools/generate_builtins.py ./builtin_compiler > builtin_function.cpp
+	$(PYTHON2) $(PYTHON_FLAGS) builtins/tools/generate_builtins.py ./builtin_compiler > builtin_function.cpp || (rm -f builtin_function.cpp; exit 1)
 
 -include depend
diff --git a/src/glsl/ast_to_hir.cpp b/src/glsl/ast_to_hir.cpp
index bef099cca3b..fd1f0b49f42 100644
--- a/src/glsl/ast_to_hir.cpp
+++ b/src/glsl/ast_to_hir.cpp
@@ -3445,11 +3445,9 @@ ast_struct_specifier::hir(exec_list *instructions,
    if (!state->symbols->add_type(name, t)) {
       _mesa_glsl_error(& loc, state, "struct `%s' previously defined", name);
    } else {
-
-      const glsl_type **s = (const glsl_type **)
-	 realloc(state->user_structures,
-		 sizeof(state->user_structures[0]) *
-		 (state->num_user_structures + 1));
+      const glsl_type **s = reralloc(state, state->user_structures,
+				     const glsl_type *,
+				     state->num_user_structures + 1);
       if (s != NULL) {
 	 s[state->num_user_structures] = t;
 	 state->user_structures = s;
diff --git a/src/glsl/builtin_types.h b/src/glsl/builtin_types.h
index 8ccbf6e312f..58b9a81273a 100644
--- a/src/glsl/builtin_types.h
+++ b/src/glsl/builtin_types.h
@@ -27,6 +27,10 @@ const glsl_type glsl_type::_error_type =
 const glsl_type glsl_type::_void_type =
    glsl_type(GL_INVALID_ENUM, GLSL_TYPE_VOID, 0, 0, "void");
 
+const glsl_type glsl_type::_sampler3D_type =
+   glsl_type(GL_SAMPLER_3D, GLSL_SAMPLER_DIM_3D, 0, 0, GLSL_TYPE_FLOAT,
+	     "sampler3D");
+
 const glsl_type *const glsl_type::error_type = & glsl_type::_error_type;
 const glsl_type *const glsl_type::void_type = & glsl_type::_void_type;
 
@@ -181,8 +185,6 @@ const glsl_type glsl_type::builtin_110_types[] = {
 	     "sampler1DShadow"),
    glsl_type(GL_SAMPLER_2D_SHADOW, GLSL_SAMPLER_DIM_2D, 1, 0, GLSL_TYPE_FLOAT,
 	     "sampler2DShadow"),
-   glsl_type(GL_SAMPLER_3D,   GLSL_SAMPLER_DIM_3D, 0, 0, GLSL_TYPE_FLOAT,
-	     "sampler3D"),
 };
 /*@}*/
 
diff --git a/src/glsl/builtins/profiles/130.frag b/src/glsl/builtins/profiles/130.frag
index 43653a906f8..0e3c7ac4199 100644
--- a/src/glsl/builtins/profiles/130.frag
+++ b/src/glsl/builtins/profiles/130.frag
@@ -491,8 +491,8 @@ ivec2 textureSize( sampler1DArray sampler, int lod);
 ivec2 textureSize(isampler1DArray sampler, int lod);
 ivec2 textureSize(usampler1DArray sampler, int lod);
 ivec3 textureSize( sampler2DArray sampler, int lod);
-ivec2 textureSize(isampler2DArray sampler, int lod);
-ivec2 textureSize(usampler2DArray sampler, int lod);
+ivec3 textureSize(isampler2DArray sampler, int lod);
+ivec3 textureSize(usampler2DArray sampler, int lod);
 
 ivec2 textureSize(sampler1DArrayShadow sampler, int lod);
 ivec3 textureSize(sampler2DArrayShadow sampler, int lod);
diff --git a/src/glsl/builtins/profiles/130.vert b/src/glsl/builtins/profiles/130.vert
index 742dec6e6d5..f85b27f8f8c 100644
--- a/src/glsl/builtins/profiles/130.vert
+++ b/src/glsl/builtins/profiles/130.vert
@@ -493,8 +493,8 @@ ivec2 textureSize( sampler1DArray sampler, int lod);
 ivec2 textureSize(isampler1DArray sampler, int lod);
 ivec2 textureSize(usampler1DArray sampler, int lod);
 ivec3 textureSize( sampler2DArray sampler, int lod);
-ivec2 textureSize(isampler2DArray sampler, int lod);
-ivec2 textureSize(usampler2DArray sampler, int lod);
+ivec3 textureSize(isampler2DArray sampler, int lod);
+ivec3 textureSize(usampler2DArray sampler, int lod);
 
 ivec2 textureSize(sampler1DArrayShadow sampler, int lod);
 ivec3 textureSize(sampler2DArrayShadow sampler, int lod);
diff --git a/src/glsl/builtins/profiles/OES_texture_3D.frag b/src/glsl/builtins/profiles/OES_texture_3D.frag
new file mode 100644
index 00000000000..b6ebd6a311f
--- /dev/null
+++ b/src/glsl/builtins/profiles/OES_texture_3D.frag
@@ -0,0 +1,7 @@
+#version 100
+#extension GL_OES_texture_3D : enable
+
+vec4 texture3D (sampler3D sampler, vec3 coord);
+vec4 texture3DProj (sampler3D sampler, vec4 coord);
+vec4 texture3D (sampler3D sampler, vec3 coord, float bias);
+vec4 texture3DProj (sampler3D sampler, vec4 coord, float bias);
diff --git a/src/glsl/builtins/profiles/OES_texture_3D.vert b/src/glsl/builtins/profiles/OES_texture_3D.vert
new file mode 100644
index 00000000000..81d12f51e9f
--- /dev/null
+++ b/src/glsl/builtins/profiles/OES_texture_3D.vert
@@ -0,0 +1,7 @@
+#version 100
+#extension GL_OES_texture_3D : enable
+
+vec4 texture3D (sampler3D sampler, vec3 coord);
+vec4 texture3DProj (sampler3D sampler, vec4 coord);
+vec4 texture3DLod (sampler3D sampler, vec3 coord, float lod);
+vec4 texture3DProjLod (sampler3D sampler, vec4 coord, float lod);
diff --git a/src/glsl/glsl_parser_extras.cpp b/src/glsl/glsl_parser_extras.cpp
index d7a37aef46d..e8c60936fb6 100644
--- a/src/glsl/glsl_parser_extras.cpp
+++ b/src/glsl/glsl_parser_extras.cpp
@@ -256,6 +256,11 @@ _mesa_glsl_process_extension(const char *name, YYLTYPE *name_locp,
       state->AMD_conservative_depth_enable = (ext_mode != extension_disable);
       state->AMD_conservative_depth_warn = (ext_mode == extension_warn);
       unsupported = !state->extensions->AMD_conservative_depth;
+   } else if (strcmp(name, "GL_OES_texture_3D") == 0 && state->es_shader) {
+      state->OES_texture_3D_enable = (ext_mode != extension_disable);
+      state->OES_texture_3D_warn = (ext_mode == extension_warn);
+
+      unsupported = !state->extensions->EXT_texture3D;
    } else {
       unsupported = true;
    }
diff --git a/src/glsl/glsl_parser_extras.h b/src/glsl/glsl_parser_extras.h
index 10cb673c694..b5c016fb399 100644
--- a/src/glsl/glsl_parser_extras.h
+++ b/src/glsl/glsl_parser_extras.h
@@ -172,6 +172,8 @@ struct _mesa_glsl_parse_state {
    unsigned ARB_shader_stencil_export_warn:1;
    unsigned AMD_conservative_depth_enable:1;
    unsigned AMD_conservative_depth_warn:1;
+   unsigned OES_texture_3D_enable:1;
+   unsigned OES_texture_3D_warn:1;
    /*@}*/
 
    /** Extensions supported by the OpenGL implementation. */
diff --git a/src/glsl/glsl_types.cpp b/src/glsl/glsl_types.cpp
index 76b4f3e4cb0..78d10bd9380 100644
--- a/src/glsl/glsl_types.cpp
+++ b/src/glsl/glsl_types.cpp
@@ -131,6 +131,7 @@ glsl_type::generate_110_types(glsl_symbol_table *symtab)
    add_types_to_symbol_table(symtab, builtin_110_types,
 			     Elements(builtin_110_types),
 			     false);
+   add_types_to_symbol_table(symtab, &_sampler3D_type, 1, false);
    add_types_to_symbol_table(symtab, builtin_110_deprecated_structure_types,
 			     Elements(builtin_110_deprecated_structure_types),
 			     false);
@@ -179,6 +180,13 @@ glsl_type::generate_EXT_texture_array_types(glsl_symbol_table *symtab,
 
 
 void
+glsl_type::generate_OES_texture_3D_types(glsl_symbol_table *symtab, bool warn)
+{
+   add_types_to_symbol_table(symtab, &_sampler3D_type, 1, warn);
+}
+
+
+void
 _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state)
 {
    switch (state->language_version) {
@@ -204,6 +212,10 @@ _mesa_glsl_initialize_types(struct _mesa_glsl_parse_state *state)
       glsl_type::generate_ARB_texture_rectangle_types(state->symbols,
 					   state->ARB_texture_rectangle_warn);
    }
+   if (state->OES_texture_3D_enable && state->language_version == 100) {
+      glsl_type::generate_OES_texture_3D_types(state->symbols,
+					       state->OES_texture_3D_warn);
+   }
 
    if (state->EXT_texture_array_enable && state->language_version < 130) {
       // These are already included in 130; don't create twice.
diff --git a/src/glsl/glsl_types.h b/src/glsl/glsl_types.h
index 61bf5e0cfd2..3c2672c01a0 100644
--- a/src/glsl/glsl_types.h
+++ b/src/glsl/glsl_types.h
@@ -427,6 +427,7 @@ private:
    /*@{*/
    static const glsl_type _error_type;
    static const glsl_type _void_type;
+   static const glsl_type _sampler3D_type;
    static const glsl_type builtin_core_types[];
    static const glsl_type builtin_structure_types[];
    static const glsl_type builtin_110_deprecated_structure_types[];
@@ -453,6 +454,7 @@ private:
    static void generate_130_types(glsl_symbol_table *);
    static void generate_ARB_texture_rectangle_types(glsl_symbol_table *, bool);
    static void generate_EXT_texture_array_types(glsl_symbol_table *, bool);
+   static void generate_OES_texture_3D_types(glsl_symbol_table *, bool);
    /*@}*/
 
    /**
diff --git a/src/glx/dri2_glx.c b/src/glx/dri2_glx.c
index a275ba5b9fe..2c28bc27150 100644
--- a/src/glx/dri2_glx.c
+++ b/src/glx/dri2_glx.c
@@ -535,8 +535,13 @@ dri2SwapBuffers(__GLXDRIdrawable *pdraw, int64_t target_msc, int64_t divisor,
     CARD64 ret = 0;
 
 #ifdef __DRI2_FLUSH
-    if (psc->f)
-    	(*psc->f->flush)(priv->driDrawable);
+    if (psc->f) {
+       struct glx_context *gc = __glXGetCurrentContext();
+
+       if (gc) {
+	  (*psc->f->flush)(priv->driDrawable);
+       }
+    }
 #endif
 
     /* Old servers don't send invalidate events */
diff --git a/src/glx/glxclient.h b/src/glx/glxclient.h
index fdcef8075a8..2b6966f2e08 100644
--- a/src/glx/glxclient.h
+++ b/src/glx/glxclient.h
@@ -419,9 +419,9 @@ struct glx_context
    /*@} */
 
    /**
-    * Thread ID we're currently current in. Zero if none.
+    * Number of threads we're currently current in.
     */
-   unsigned long thread_id;
+   unsigned long thread_refcount;
 
    char gl_extension_bits[__GL_EXT_BYTES];
 };
diff --git a/src/glx/glxcmds.c b/src/glx/glxcmds.c
index 80eaf72b7d5..22bebab26bc 100644
--- a/src/glx/glxcmds.c
+++ b/src/glx/glxcmds.c
@@ -727,11 +727,16 @@ glXSwapBuffers(Display * dpy, GLXDrawable drawable)
    xGLXSwapBuffersReq *req;
 #endif
 
+   gc = __glXGetCurrentContext();
+
 #if defined(GLX_DIRECT_RENDERING) && !defined(GLX_USE_APPLEGL)
    __GLXDRIdrawable *pdraw = GetGLXDRIDrawable(dpy, drawable);
 
    if (pdraw != NULL) {
-      glFlush();
+      if (gc && drawable == gc->currentDrawable) {
+	 glFlush();
+      }
+
       (*pdraw->psc->driScreen->swapBuffers)(pdraw, 0, 0, 0);
       return;
    }
@@ -746,7 +751,6 @@ glXSwapBuffers(Display * dpy, GLXDrawable drawable)
     ** The calling thread may or may not have a current context.  If it
     ** does, send the context tag so the server can do a flush.
     */
-   gc = __glXGetCurrentContext();
    if ((gc != NULL) && (dpy == gc->currentDpy) &&
        ((drawable == gc->currentDrawable)
         || (drawable == gc->currentReadable))) {
diff --git a/src/glx/glxcurrent.c b/src/glx/glxcurrent.c
index 36317383544..9a6499037b1 100644
--- a/src/glx/glxcurrent.c
+++ b/src/glx/glxcurrent.c
@@ -216,6 +216,16 @@ MakeContextCurrent(Display * dpy, GLXDrawable draw,
    struct glx_context *oldGC = __glXGetCurrentContext();
    int ret = Success;
 
+   /* XXX: If this is left out, then libGL ends up not having this
+    * symbol, and drivers using it fail to load.  Compare the
+    * implementation of this symbol to _glapi_noop_enable_warnings(),
+    * though, which gets into the library despite no callers, the same
+    * prototypes, and the same compile flags to the files containing
+    * them.  Moving the definition to glapi_nop.c gets it into the
+    * library, though.
+    */
+   (void)_glthread_GetID();
+
    /* Make sure that the new context has a nonzero ID.  In the request,
     * a zero context ID is used only to mean that we bind to no current
     * context.
@@ -236,41 +246,42 @@ MakeContextCurrent(Display * dpy, GLXDrawable draw,
 
    _glapi_check_multithread();
 
-   if (gc != NULL && gc->thread_id != 0 && gc->thread_id != _glthread_GetID()) {
-      __glXGenerateError(dpy, gc, gc->xid,
-                         BadAccess, X_GLXMakeContextCurrent);
-      return False;
-   }
-
+   __glXLock();
    if (oldGC == gc &&
-       gc->currentDrawable == draw && gc->currentReadable == read)
+       gc->currentDrawable == draw && gc->currentReadable == read) {
+      __glXUnlock();
       return True;
+   }
 
    if (oldGC != &dummyContext) {
-      oldGC->vtable->unbind(oldGC, gc);
-      oldGC->currentDpy = 0;
-      oldGC->currentDrawable = None;
-      oldGC->currentReadable = None;
-      oldGC->thread_id = 0;
+      if (--oldGC->thread_refcount == 0) {
+	 oldGC->vtable->unbind(oldGC, gc);
+	 oldGC->currentDpy = 0;
+	 oldGC->currentDrawable = None;
+	 oldGC->currentReadable = None;
+
+	 if (oldGC->xid == None && oldGC != gc) {
+	    /* We are switching away from a context that was
+	     * previously destroyed, so we need to free the memory
+	     * for the old handle. */
+	    oldGC->vtable->destroy(oldGC);
+	 }
+      }
    }
 
    if (gc) {
-      gc->currentDpy = dpy;
-      gc->currentDrawable = draw;
-      gc->currentReadable = read;
-      gc->thread_id = _glthread_GetID();
+      if (gc->thread_refcount++ == 0) {
+	 gc->currentDpy = dpy;
+	 gc->currentDrawable = draw;
+	 gc->currentReadable = read;
+      }
       __glXSetCurrentContext(gc);
       ret = gc->vtable->bind(gc, oldGC, draw, read);
    } else {
       __glXSetCurrentContextNull();
    }
 
-   if (oldGC != &dummyContext && oldGC->xid == None && oldGC != gc) {
-      /* We are switching away from a context that was
-       * previously destroyed, so we need to free the memory
-       * for the old handle. */
-      oldGC->vtable->destroy(oldGC);
-   }
+   __glXUnlock();
 
    if (ret) {
       __glXGenerateError(dpy, gc, None, ret, X_GLXMakeContextCurrent);
diff --git a/src/glx/glxextensions.c b/src/glx/glxextensions.c
index 3a0e64c46d1..ffd466479b4 100644
--- a/src/glx/glxextensions.c
+++ b/src/glx/glxextensions.c
@@ -90,6 +90,7 @@ static const struct extension_info known_glx_extensions[] = {
    { GLX(MESA_agp_offset),             VER(0,0), N, N, N, Y }, /* Deprecated */
    { GLX(MESA_copy_sub_buffer),        VER(0,0), Y, N, N, N },
 #endif
+   { GLX(MESA_multithread_makecurrent),VER(0,0), Y, N, Y, N },
    { GLX(MESA_pixmap_colormap),        VER(0,0), N, N, N, N }, /* Deprecated */
    { GLX(MESA_release_buffers),        VER(0,0), N, N, N, N }, /* Deprecated */
 #ifdef GLX_USE_APPLEGL
diff --git a/src/glx/glxextensions.h b/src/glx/glxextensions.h
index 78776618338..333b3f9adbd 100644
--- a/src/glx/glxextensions.h
+++ b/src/glx/glxextensions.h
@@ -43,6 +43,7 @@ enum
    MESA_agp_offset_bit,
    MESA_copy_sub_buffer_bit,
    MESA_depth_float_bit,
+   MESA_multithread_makecurrent_bit,
    MESA_pixmap_colormap_bit,
    MESA_release_buffers_bit,
    MESA_swap_control_bit,
diff --git a/src/mesa/SConscript b/src/mesa/SConscript
index ea04fb1a0ee..90fec124af9 100644
--- a/src/mesa/SConscript
+++ b/src/mesa/SConscript
@@ -106,6 +106,7 @@ main_sources = [
     'main/stencil.c',
     'main/syncobj.c',
     'main/texcompress.c',
+    'main/texcompress_rgtc.c',
     'main/texcompress_s3tc.c',
     'main/texcompress_fxt1.c',
     'main/texenv.c',
diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index a413c02b573..5496b4fdd3b 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -679,6 +679,8 @@
 #define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS    2
 #define BRW_SAMPLER_MESSAGE_SIMD4X2_SAMPLE_COMPARE    0
 #define BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_COMPARE     2
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE 0
+#define BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE  1
 #define BRW_SAMPLER_MESSAGE_SIMD4X2_RESINFO           2
 #define BRW_SAMPLER_MESSAGE_SIMD8_RESINFO             2
 #define BRW_SAMPLER_MESSAGE_SIMD16_RESINFO            2
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index 30e3bd54469..9bdcda780ef 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -213,6 +213,7 @@ fs_visitor::implied_mrf_writes(fs_inst *inst)
       return 2;
    case FS_OPCODE_TEX:
    case FS_OPCODE_TXB:
+   case FS_OPCODE_TXD:
    case FS_OPCODE_TXL:
       return 1;
    case FS_OPCODE_FB_WRITE:
@@ -1200,6 +1201,8 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
       }
       /* gen4's SIMD8 sampler always has the slots for u,v,r present. */
       mlen += 3;
+   } else if (ir->op == ir_txd) {
+      assert(!"TXD isn't supported on gen4 yet.");
    } else {
       /* Oh joy.  gen4 doesn't have SIMD8 non-shadow-compare bias/lod
        * instructions.  We'll need to do SIMD16 here.
@@ -1253,6 +1256,8 @@ fs_visitor::emit_texture_gen4(ir_texture *ir, fs_reg dst, fs_reg coordinate)
       inst = emit(fs_inst(FS_OPCODE_TXL, dst));
       break;
    case ir_txd:
+      inst = emit(fs_inst(FS_OPCODE_TXD, dst));
+      break;
    case ir_txf:
       assert(!"GLSL 1.30 features unsupported");
       break;
@@ -2308,6 +2313,16 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
 	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_BIAS_GEN5;
 	 }
 	 break;
+      case FS_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE_GEN5;
+	 } else {
+	    msg_type = BRW_SAMPLER_MESSAGE_SAMPLE_LOD_GEN5;
+	 }
+	 break;
+      case FS_OPCODE_TXD:
+	 assert(!"TXD isn't supported on gen5+ yet.");
+	 break;
       }
    } else {
       switch (inst->opcode) {
@@ -2325,13 +2340,26 @@ fs_visitor::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
       case FS_OPCODE_TXB:
 	 if (inst->shadow_compare) {
 	    assert(inst->mlen == 6);
-	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
 	 } else {
 	    assert(inst->mlen == 9);
 	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
 	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
 	 }
 	 break;
+      case FS_OPCODE_TXL:
+	 if (inst->shadow_compare) {
+	    assert(inst->mlen == 6);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
+	 } else {
+	    assert(inst->mlen == 9);
+	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
+	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
+	 }
+	 break;
+      case FS_OPCODE_TXD:
+	 assert(!"TXD isn't supported on gen4 yet.");
+	 break;
       }
    }
    assert(msg_type != -1);
@@ -3607,6 +3635,7 @@ fs_visitor::generate_code()
 	 break;
       case FS_OPCODE_TEX:
       case FS_OPCODE_TXB:
+      case FS_OPCODE_TXD:
       case FS_OPCODE_TXL:
 	 generate_tex(inst, dst, src[0]);
 	 break;
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 8352760acf7..dc030ae5b50 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -71,6 +71,7 @@ enum fs_opcodes {
    FS_OPCODE_LINTERP,
    FS_OPCODE_TEX,
    FS_OPCODE_TXB,
+   FS_OPCODE_TXD,
    FS_OPCODE_TXL,
    FS_OPCODE_DISCARD_NOT,
    FS_OPCODE_DISCARD_AND,
@@ -309,6 +310,7 @@ public:
    {
       return (opcode == FS_OPCODE_TEX ||
 	      opcode == FS_OPCODE_TXB ||
+	      opcode == FS_OPCODE_TXD ||
 	      opcode == FS_OPCODE_TXL);
    }
 
diff --git a/src/mesa/drivers/dri/i965/gen6_clip_state.c b/src/mesa/drivers/dri/i965/gen6_clip_state.c
index 38c98f30efb..d6c1f1c893d 100644
--- a/src/mesa/drivers/dri/i965/gen6_clip_state.c
+++ b/src/mesa/drivers/dri/i965/gen6_clip_state.c
@@ -68,7 +68,7 @@ upload_clip_state(struct brw_context *brw)
 	     depth_clamp |
 	     provoking);
    OUT_BATCH(U_FIXED(0.125, 3) << GEN6_CLIP_MIN_POINT_WIDTH_SHIFT |
-             U_FIXED(225.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
+             U_FIXED(255.875, 3) << GEN6_CLIP_MAX_POINT_WIDTH_SHIFT |
              GEN6_CLIP_FORCE_ZERO_RTAINDEX);
    ADVANCE_BATCH();
 }
diff --git a/src/mesa/drivers/dri/intel/intel_screen.c b/src/mesa/drivers/dri/intel/intel_screen.c
index 356d5f72d89..746da462ee2 100644
--- a/src/mesa/drivers/dri/intel/intel_screen.c
+++ b/src/mesa/drivers/dri/intel/intel_screen.c
@@ -104,7 +104,12 @@ static const __DRItexBufferExtension intelTexBufferExtension = {
 static void
 intelDRI2Flush(__DRIdrawable *drawable)
 {
-   struct intel_context *intel = drawable->driContextPriv->driverPrivate;
+   GET_CURRENT_CONTEXT(ctx);
+   struct intel_context *intel = intel_context(ctx);
+
+   /* ctx may be NULL if no context is current on this thread; don't deref. */
+   if (intel == NULL)
+      return;
 
    if (intel->gen < 4)
       INTEL_FIREVERTICES(intel);
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 7504b8a85db..b8bb2555acd 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -241,8 +241,7 @@ static const struct extension extension_table[] = {
    { "GL_OES_stencil4",                            o(dummy_false),                     DISABLE                },
    { "GL_OES_stencil8",                            o(EXT_framebuffer_object),                       ES1 | ES2 },
    { "GL_OES_stencil_wrap",                        o(EXT_stencil_wrap),                             ES1       },
-   /* GL_OES_texture_3D is disabled due to missing GLSL support. */
-   { "GL_OES_texture_3D",                          o(EXT_texture3D),                   DISABLE                },
+   { "GL_OES_texture_3D",                          o(EXT_texture3D),                                      ES2 },
    { "GL_OES_texture_cube_map",                    o(ARB_texture_cube_map),                         ES1       },
    { "GL_OES_texture_env_crossbar",                o(ARB_texture_env_crossbar),                     ES1       },
    { "GL_OES_texture_mirrored_repeat",             o(ARB_texture_mirrored_repeat),                  ES1       },
@@ -428,6 +427,7 @@ _mesa_enable_sw_extensions(struct gl_context *ctx)
    ctx->Extensions.ARB_texture_mirrored_repeat = GL_TRUE;
    ctx->Extensions.ARB_texture_non_power_of_two = GL_TRUE;
    ctx->Extensions.ARB_texture_rg = GL_TRUE;
+   ctx->Extensions.ARB_texture_compression_rgtc = GL_TRUE;
    ctx->Extensions.ARB_vertex_array_object = GL_TRUE;
 #if FEATURE_ARB_vertex_program
    ctx->Extensions.ARB_vertex_program = GL_TRUE;
diff --git a/src/mesa/main/formats.c b/src/mesa/main/formats.c
index 1e395363475..947db84a69e 100644
--- a/src/mesa/main/formats.c
+++ b/src/mesa/main/formats.c
@@ -890,7 +890,43 @@ static struct gl_format_info format_info[MESA_FORMAT_COUNT] =
       16, 16, 16, 16,
       0, 0, 0, 0, 0,
       1, 1, 8
-   }
+   },
+   {
+     MESA_FORMAT_RED_RGTC1,
+     "MESA_FORMAT_RED_RGTC1",
+     GL_RED,
+     GL_UNSIGNED_NORMALIZED,
+     4, 0, 0, 0,
+     0, 0, 0, 0, 0,
+     4, 4, 8                     /* 8 bytes per 4x4 block */
+   },
+   {
+     MESA_FORMAT_SIGNED_RED_RGTC1,
+     "MESA_FORMAT_SIGNED_RED_RGTC1",
+     GL_RED,
+     GL_SIGNED_NORMALIZED,
+     4, 0, 0, 0,
+     0, 0, 0, 0, 0,
+     4, 4, 8                     /* 8 bytes per 4x4 block */
+   },
+   {
+     MESA_FORMAT_RG_RGTC2,
+     "MESA_FORMAT_RG_RGTC2",
+     GL_RG,
+     GL_UNSIGNED_NORMALIZED,
+     4, 4, 0, 0,
+     0, 0, 0, 0, 0,
+     4, 4, 16                     /* 16 bytes per 4x4 block */
+   },
+   {
+     MESA_FORMAT_SIGNED_RG_RGTC2,
+     "MESA_FORMAT_SIGNED_RG_RGTC2",
+     GL_RG,
+     GL_SIGNED_NORMALIZED,
+     4, 4, 0, 0,
+     0, 0, 0, 0, 0,
+     4, 4, 16                     /* 16 bytes per 4x4 block */
+   },
 };
 
 
@@ -1530,6 +1566,10 @@ _mesa_format_to_type_and_comps(gl_format format,
    case MESA_FORMAT_SRGBA_DXT5:
 #endif
 #endif
+   case MESA_FORMAT_RED_RGTC1:
+   case MESA_FORMAT_SIGNED_RED_RGTC1:
+   case MESA_FORMAT_RG_RGTC2:
+   case MESA_FORMAT_SIGNED_RG_RGTC2:
       /* XXX generate error instead? */
       *datatype = GL_UNSIGNED_BYTE;
       *comps = 0;
diff --git a/src/mesa/main/formats.h b/src/mesa/main/formats.h
index 9a5cef37788..e21967e2b0c 100644
--- a/src/mesa/main/formats.h
+++ b/src/mesa/main/formats.h
@@ -179,6 +179,12 @@ typedef enum
    MESA_FORMAT_RGBA_16,           /* ... */
    /*@}*/
 
+   /*@{*/
+   MESA_FORMAT_RED_RGTC1,
+   MESA_FORMAT_SIGNED_RED_RGTC1,
+   MESA_FORMAT_RG_RGTC2,
+   MESA_FORMAT_SIGNED_RG_RGTC2,
+   /*@}*/
    MESA_FORMAT_COUNT
 } gl_format;
 
diff --git a/src/mesa/main/texcompress.c b/src/mesa/main/texcompress.c
index 7a0b522a2d8..82d02ed0ecf 100644
--- a/src/mesa/main/texcompress.c
+++ b/src/mesa/main/texcompress.c
@@ -64,6 +64,7 @@ _mesa_get_compressed_formats(struct gl_context *ctx, GLint *formats, GLboolean a
          n += 2;
       }
    }
+   /* don't return RGTC - ARB_texture_compression_rgtc query 19 */
    if (ctx->Extensions.EXT_texture_compression_s3tc) {
       if (formats) {
          formats[n++] = GL_COMPRESSED_RGB_S3TC_DXT1_EXT;
@@ -163,6 +164,15 @@ _mesa_glenum_to_compressed_format(GLenum format)
    case GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT:
       return MESA_FORMAT_SRGBA_DXT5;
 
+   case GL_COMPRESSED_RED_RGTC1:
+      return MESA_FORMAT_RED_RGTC1;
+   case GL_COMPRESSED_SIGNED_RED_RGTC1:
+      return MESA_FORMAT_SIGNED_RED_RGTC1;
+   case GL_COMPRESSED_RG_RGTC2:
+      return MESA_FORMAT_RG_RGTC2;
+   case GL_COMPRESSED_SIGNED_RG_RGTC2:
+      return MESA_FORMAT_SIGNED_RG_RGTC2;
+
    default:
       return MESA_FORMAT_NONE;
    }
@@ -209,6 +219,16 @@ _mesa_compressed_format_to_glenum(struct gl_context *ctx, GLuint mesaFormat)
       return GL_COMPRESSED_SRGB_ALPHA_S3TC_DXT5_EXT;
 #endif
 #endif
+
+   case MESA_FORMAT_RED_RGTC1:
+      return GL_COMPRESSED_RED_RGTC1;
+   case MESA_FORMAT_SIGNED_RED_RGTC1:
+      return GL_COMPRESSED_SIGNED_RED_RGTC1;
+   case MESA_FORMAT_RG_RGTC2:
+      return GL_COMPRESSED_RG_RGTC2;
+   case MESA_FORMAT_SIGNED_RG_RGTC2:
+      return GL_COMPRESSED_SIGNED_RG_RGTC2;
+
    default:
       _mesa_problem(ctx, "Unexpected mesa texture format in"
                     " _mesa_compressed_format_to_glenum()");
diff --git a/src/mesa/main/texcompress_rgtc.c b/src/mesa/main/texcompress_rgtc.c
new file mode 100644
index 00000000000..1a01755f14d
--- /dev/null
+++ b/src/mesa/main/texcompress_rgtc.c
@@ -0,0 +1,1122 @@
+/*
+ * Copyright (C) 2011 Red Hat Inc.
+ * 
+ * block compression parts are:
+ * Copyright (C) 2004  Roland Scheidegger   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ *
+ * Author:
+ *    Dave Airlie
+ */
+
+/**
+ * \file texcompress_rgtc.c
+ * GL_EXT_texture_compression_rgtc support.
+ */
+
+
+#include "glheader.h"
+#include "imports.h"
+#include "colormac.h"
+#include "image.h"
+#include "macros.h"
+#include "mfeatures.h"
+#include "mipmap.h"
+#include "texcompress.h"
+#include "texcompress_rgtc.h"
+#include "texstore.h"
+
+#define RGTC_DEBUG 0
+
+static void encode_rgtc_chan_u(GLubyte *blkaddr, GLubyte srccolors[4][4],
+			     GLint numxpixels, GLint numypixels);
+static void encode_rgtc_chan_s(GLbyte *blkaddr, GLbyte srccolors[4][4],
+			     GLint numxpixels, GLint numypixels);
+
+/* Gather one channel of a (possibly partial) 4x4 tile, scaled to 8 bits. */
+static void extractsrc_u( GLubyte srcpixels[4][4], const GLchan *srcaddr,
+                          GLint srcRowStride, GLint numxpixels, GLint numypixels, GLint comps)
+{
+   GLubyte col, row;
+   for (row = 0; row < numypixels; row++) {
+      /* srcRowStride counts pixels, each pixel holding 'comps' channels */
+      const GLchan *texel = srcaddr + row * srcRowStride * comps;
+      for (col = 0; col < numxpixels; col++, texel += comps) {
+         srcpixels[row][col] = *texel / (CHAN_MAX / 255);
+      }
+   }
+}
+
+/* Gather one float channel of a (possibly partial) 4x4 tile as signed bytes. */
+static void extractsrc_s( GLbyte srcpixels[4][4], const GLfloat *srcaddr,
+                          GLint srcRowStride, GLint numxpixels, GLint numypixels, GLint comps)
+{
+   GLubyte col, row;
+   for (row = 0; row < numypixels; row++) {
+      /* srcRowStride counts pixels, each pixel holding 'comps' channels */
+      const GLfloat *texel = srcaddr + row * srcRowStride * comps;
+      for (col = 0; col < numxpixels; col++, texel += comps) {
+         srcpixels[row][col] = FLOAT_TO_BYTE_TEX(*texel);
+      }
+   }
+}
+
+
+/* Store an image as MESA_FORMAT_RED_RGTC1: one 8-byte block per 4x4 texels. */
+GLboolean
+_mesa_texstore_red_rgtc1(TEXSTORE_PARAMS)
+{
+   GLubyte *dst;
+   const GLint texWidth = dstRowStride * 4 / 8; /* a bit of a hack */
+   const GLchan *tempImage = NULL;
+   int i, j;
+   int numxpixels, numypixels;
+   const GLchan *srcaddr;
+   GLubyte srcpixels[4][4];
+   GLubyte *blkaddr;
+   GLint dstRowDiff;
+   ASSERT(dstFormat == MESA_FORMAT_RED_RGTC1);
+   ASSERT(dstXoffset % 4 == 0);
+   ASSERT(dstYoffset % 4 == 0);
+   ASSERT(dstZoffset % 4 == 0);
+   (void) dstZoffset;
+   (void) dstImageOffsets;
+
+   tempImage = _mesa_make_temp_chan_image(ctx, dims,
+                                          baseInternalFormat,
+                                          _mesa_get_format_base_format(dstFormat),
+                                          srcWidth, srcHeight, srcDepth,
+                                          srcFormat, srcType, srcAddr,
+                                          srcPacking);
+   if (!tempImage)
+      return GL_FALSE; /* out of memory */
+
+   dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
+                                        dstFormat,
+                                        texWidth, (GLubyte *) dstAddr);
+
+   blkaddr = dst;
+   /* a block row written here is 8 bytes per 4 texels = 2 bytes per padded texel */
+   dstRowDiff = dstRowStride >= (srcWidth * 2) ? dstRowStride - (((srcWidth + 3) & ~3) * 2) : 0;
+   for (j = 0; j < srcHeight; j+=4) {
+      if (srcHeight > j + 3) numypixels = 4;
+      else numypixels = srcHeight - j;
+      srcaddr = tempImage + j * srcWidth;
+      for (i = 0; i < srcWidth; i += 4) {
+         if (srcWidth > i + 3) numxpixels = 4;
+         else numxpixels = srcWidth - i;
+         extractsrc_u(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 1);
+         encode_rgtc_chan_u(blkaddr, srcpixels, numxpixels, numypixels);
+         srcaddr += numxpixels;
+         blkaddr += 8;
+      }
+      blkaddr += dstRowDiff;
+   }
+   free((void *) tempImage); /* non-NULL: checked right after allocation */
+
+   return GL_TRUE;
+}
+
+/* Store an image as MESA_FORMAT_SIGNED_RED_RGTC1: one 8-byte block per 4x4 texels. */
+GLboolean
+_mesa_texstore_signed_red_rgtc1(TEXSTORE_PARAMS)
+{
+   GLbyte *dst;
+   const GLint texWidth = dstRowStride * 4 / 8; /* a bit of a hack */
+   const GLfloat *tempImage = NULL;
+   int i, j;
+   int numxpixels, numypixels;
+   const GLfloat *srcaddr;
+   GLbyte srcpixels[4][4];
+   GLbyte *blkaddr;
+   GLint dstRowDiff;
+   ASSERT(dstFormat == MESA_FORMAT_SIGNED_RED_RGTC1);
+   ASSERT(dstXoffset % 4 == 0);
+   ASSERT(dstYoffset % 4 == 0);
+   ASSERT(dstZoffset % 4 == 0);
+   (void) dstZoffset;
+   (void) dstImageOffsets;
+
+   tempImage = _mesa_make_temp_float_image(ctx, dims,
+                                           baseInternalFormat,
+                                           _mesa_get_format_base_format(dstFormat),
+                                           srcWidth, srcHeight, srcDepth,
+                                           srcFormat, srcType, srcAddr,
+                                           srcPacking, 0x0);
+   if (!tempImage)
+      return GL_FALSE; /* out of memory */
+
+   dst = (GLbyte *)_mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
+                                                  dstFormat,
+                                                  texWidth, (GLubyte *) dstAddr);
+
+   blkaddr = dst;
+   /* a block row written here is 8 bytes per 4 texels = 2 bytes per padded texel */
+   dstRowDiff = dstRowStride >= (srcWidth * 2) ? dstRowStride - (((srcWidth + 3) & ~3) * 2) : 0;
+   for (j = 0; j < srcHeight; j+=4) {
+      if (srcHeight > j + 3) numypixels = 4;
+      else numypixels = srcHeight - j;
+      srcaddr = tempImage + j * srcWidth;
+      for (i = 0; i < srcWidth; i += 4) {
+         if (srcWidth > i + 3) numxpixels = 4;
+         else numxpixels = srcWidth - i;
+         extractsrc_s(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 1);
+         encode_rgtc_chan_s(blkaddr, srcpixels, numxpixels, numypixels);
+         srcaddr += numxpixels;
+         blkaddr += 8;
+      }
+      blkaddr += dstRowDiff;
+   }
+   free((void *) tempImage); /* non-NULL: checked right after allocation */
+   return GL_TRUE;
+}
+
+/* Store an image as MESA_FORMAT_RG_RGTC2: a red then a green 8-byte block per 4x4 texels. */
+GLboolean
+_mesa_texstore_rg_rgtc2(TEXSTORE_PARAMS)
+{
+   GLubyte *dst;
+   const GLint texWidth = dstRowStride * 4 / 16; /* a bit of a hack */
+   const GLchan *tempImage = NULL;
+   int i, j;
+   int numxpixels, numypixels;
+   const GLchan *srcaddr;
+   GLubyte srcpixels[4][4];
+   GLubyte *blkaddr;
+   GLint dstRowDiff;
+
+   ASSERT(dstFormat == MESA_FORMAT_RG_RGTC2);
+   ASSERT(dstXoffset % 4 == 0);
+   ASSERT(dstYoffset % 4 == 0);
+   ASSERT(dstZoffset % 4 == 0);
+   (void) dstZoffset;
+   (void) dstImageOffsets;
+
+   tempImage = _mesa_make_temp_chan_image(ctx, dims,
+                                          baseInternalFormat,
+                                          _mesa_get_format_base_format(dstFormat),
+                                          srcWidth, srcHeight, srcDepth,
+                                          srcFormat, srcType, srcAddr,
+                                          srcPacking);
+   if (!tempImage)
+      return GL_FALSE; /* out of memory */
+
+   dst = _mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
+                                        dstFormat,
+                                        texWidth, (GLubyte *) dstAddr);
+
+   blkaddr = dst;
+   /* a block row written here is 16 bytes per 4 texels = 4 bytes per padded texel */
+   dstRowDiff = dstRowStride >= (srcWidth * 4) ? dstRowStride - (((srcWidth + 3) & ~3) * 4) : 0;
+   for (j = 0; j < srcHeight; j+=4) {
+      if (srcHeight > j + 3) numypixels = 4;
+      else numypixels = srcHeight - j;
+      srcaddr = tempImage + j * srcWidth * 2;
+      for (i = 0; i < srcWidth; i += 4) {
+         if (srcWidth > i + 3) numxpixels = 4;
+         else numxpixels = srcWidth - i;
+         extractsrc_u(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 2);
+         encode_rgtc_chan_u(blkaddr, srcpixels, numxpixels, numypixels);
+
+         blkaddr += 8;
+         extractsrc_u(srcpixels, srcaddr + 1, srcWidth, numxpixels, numypixels, 2);
+         encode_rgtc_chan_u(blkaddr, srcpixels, numxpixels, numypixels);
+
+         blkaddr += 8;
+         srcaddr += numxpixels * 2;
+      }
+      blkaddr += dstRowDiff;
+   }
+   free((void *) tempImage); /* non-NULL: checked right after allocation */
+
+   return GL_TRUE;
+}
+
+/* Store an image as MESA_FORMAT_SIGNED_RG_RGTC2: a red then a green 8-byte block per 4x4 texels. */
+GLboolean
+_mesa_texstore_signed_rg_rgtc2(TEXSTORE_PARAMS)
+{
+   GLbyte *dst;
+   const GLint texWidth = dstRowStride * 4 / 16; /* a bit of a hack */
+   const GLfloat *tempImage = NULL;
+   int i, j;
+   int numxpixels, numypixels;
+   const GLfloat *srcaddr;
+   GLbyte srcpixels[4][4];
+   GLbyte *blkaddr;
+   GLint dstRowDiff;
+
+   ASSERT(dstFormat == MESA_FORMAT_SIGNED_RG_RGTC2);
+   ASSERT(dstXoffset % 4 == 0);
+   ASSERT(dstYoffset % 4 == 0);
+   ASSERT(dstZoffset % 4 == 0);
+   (void) dstZoffset;
+   (void) dstImageOffsets;
+
+   tempImage = _mesa_make_temp_float_image(ctx, dims,
+                                           baseInternalFormat,
+                                           _mesa_get_format_base_format(dstFormat),
+                                           srcWidth, srcHeight, srcDepth,
+                                           srcFormat, srcType, srcAddr,
+                                           srcPacking, 0x0);
+   if (!tempImage)
+      return GL_FALSE; /* out of memory */
+
+   dst = (GLbyte *)_mesa_compressed_image_address(dstXoffset, dstYoffset, 0,
+                                                  dstFormat,
+                                                  texWidth, (GLubyte *) dstAddr);
+
+   blkaddr = dst;
+   /* a block row written here is 16 bytes per 4 texels = 4 bytes per padded texel */
+   dstRowDiff = dstRowStride >= (srcWidth * 4) ? dstRowStride - (((srcWidth + 3) & ~3) * 4) : 0;
+   for (j = 0; j < srcHeight; j += 4) {
+      if (srcHeight > j + 3) numypixels = 4;
+      else numypixels = srcHeight - j;
+      srcaddr = tempImage + j * srcWidth * 2;
+      for (i = 0; i < srcWidth; i += 4) {
+         if (srcWidth > i + 3) numxpixels = 4;
+         else numxpixels = srcWidth - i;
+
+         extractsrc_s(srcpixels, srcaddr, srcWidth, numxpixels, numypixels, 2);
+         encode_rgtc_chan_s(blkaddr, srcpixels, numxpixels, numypixels);
+         blkaddr += 8;
+
+         extractsrc_s(srcpixels, srcaddr + 1, srcWidth, numxpixels, numypixels, 2);
+         encode_rgtc_chan_s(blkaddr, srcpixels, numxpixels, numypixels);
+         blkaddr += 8;
+
+         srcaddr += numxpixels * 2;
+      }
+      blkaddr += dstRowDiff;
+   }
+   free((void *) tempImage); /* non-NULL: checked right after allocation */
+
+   return GL_TRUE;
+}
+
+/* Decode texel (i, j) of one unsigned channel; comps is the number of 8-byte blocks per tile. */
+static void _fetch_texel_rgtc_u(GLint srcRowStride, const GLubyte *pixdata,
+                                GLint i, GLint j, GLchan *value, int comps)
+{
+   GLchan decode;
+   const GLubyte *blksrc = (pixdata + ((srcRowStride + 3) / 4 * (j / 4) + (i / 4)) * 8 * comps);
+   const GLubyte alpha0 = blksrc[0];
+   const GLubyte alpha1 = blksrc[1];
+   const GLubyte bit_pos = ((j&3) * 4 + (i&3)) * 3;
+   const GLubyte acodelow = blksrc[2 + bit_pos / 8];
+   /* texels 14 and 15 live entirely in byte 7; never read past the 8-byte block */
+   const GLubyte acodehigh = (3 + bit_pos / 8) < 8 ? blksrc[3 + bit_pos / 8] : 0;
+   const GLubyte code = (acodelow >> (bit_pos & 0x7) |
+      (acodehigh  << (8 - (bit_pos & 0x7)))) & 0x7;
+   if (code == 0)
+      decode = UBYTE_TO_CHAN( alpha0 );
+   else if (code == 1)
+      decode = UBYTE_TO_CHAN( alpha1 );
+   else if (alpha0 > alpha1)
+      decode = UBYTE_TO_CHAN( ((alpha0 * (8 - code) + (alpha1 * (code - 1))) / 7) );
+   else if (code < 6)
+      decode = UBYTE_TO_CHAN( ((alpha0 * (6 - code) + (alpha1 * (code - 1))) / 5) );
+   else if (code == 6)
+      decode = 0;
+   else
+      decode = CHAN_MAX;
+   *value = decode;
+}
+
+
+/* Decode texel (i, j) of one signed channel; comps is the number of 8-byte blocks per tile. */
+static void _fetch_texel_rgtc_s(GLint srcRowStride, const GLbyte *pixdata,
+                                GLint i, GLint j, GLbyte *value, int comps)
+{
+   GLbyte decode;
+   const GLbyte *blksrc = (pixdata + ((srcRowStride + 3) / 4 * (j / 4) + (i / 4)) * 8 * comps);
+   const GLbyte alpha0 = blksrc[0];
+   const GLbyte alpha1 = blksrc[1];
+   const GLubyte bit_pos = ((j&3) * 4 + (i&3)) * 3;
+   const GLubyte acodelow = blksrc[2 + bit_pos / 8];
+   /* code bytes are unsigned so >> cannot sign-extend; byte 8 is outside the block */
+   const GLubyte acodehigh = (3 + bit_pos / 8) < 8 ? blksrc[3 + bit_pos / 8] : 0;
+   const GLubyte code = (acodelow >> (bit_pos & 0x7) |
+      (acodehigh  << (8 - (bit_pos & 0x7)))) & 0x7;
+   if (code == 0)
+      decode = alpha0;
+   else if (code == 1)
+      decode = alpha1;
+   else if (alpha0 > alpha1)
+      decode = ((alpha0 * (8 - code) + (alpha1 * (code - 1))) / 7);
+   else if (code < 6)
+      decode = ((alpha0 * (6 - code) + (alpha1 * (code - 1))) / 5);
+   else if (code == 6)
+      decode = -128;
+   else
+      decode = 127;
+   *value = decode;
+}
+
+void
+_mesa_fetch_texel_2d_f_red_rgtc1(const struct gl_texture_image *texImage,
+                                 GLint i, GLint j, GLint k, GLfloat *texel)
+{
+   const GLubyte *blocks = (GLubyte *)(texImage->Data);
+   GLchan r;
+   _fetch_texel_rgtc_u(texImage->RowStride, blocks, i, j, &r, 1);
+   texel[RCOMP] = CHAN_TO_FLOAT(r);
+   texel[GCOMP] = 0.0; /* green, blue and alpha are not stored by this format */
+   texel[BCOMP] = 0.0;
+   texel[ACOMP] = 1.0;
+}
+
+void
+_mesa_fetch_texel_2d_f_signed_red_rgtc1(const struct gl_texture_image *texImage,
+                                        GLint i, GLint j, GLint k, GLfloat *texel)
+{
+   const GLbyte *blocks = (GLbyte *)(texImage->Data);
+   GLbyte r;
+   _fetch_texel_rgtc_s(texImage->RowStride, blocks, i, j, &r, 1);
+   texel[RCOMP] = BYTE_TO_FLOAT_TEX(r);
+   texel[GCOMP] = 0.0; /* green, blue and alpha are not stored by this format */
+   texel[BCOMP] = 0.0;
+   texel[ACOMP] = 1.0;
+}
+
+void
+_mesa_fetch_texel_2d_f_rg_rgtc2(const struct gl_texture_image *texImage,
+                                GLint i, GLint j, GLint k, GLfloat *texel)
+{
+   /* the green channel block is read 8 bytes after the red one */
+   const GLubyte *blocks = (GLubyte *)(texImage->Data);
+   GLchan r, g;
+   _fetch_texel_rgtc_u(texImage->RowStride, blocks, i, j, &r, 2);
+   _fetch_texel_rgtc_u(texImage->RowStride, blocks + 8, i, j, &g, 2);
+   texel[RCOMP] = CHAN_TO_FLOAT(r);
+   texel[GCOMP] = CHAN_TO_FLOAT(g);
+   texel[BCOMP] = 0.0;
+   texel[ACOMP] = 1.0;
+}
+
+void
+_mesa_fetch_texel_2d_f_signed_rg_rgtc2(const struct gl_texture_image *texImage,
+                                       GLint i, GLint j, GLint k, GLfloat *texel)
+{
+   /* the green channel block is read 8 bytes after the red one */
+   const GLbyte *blocks = (GLbyte *)(texImage->Data);
+   GLbyte r, g;
+   _fetch_texel_rgtc_s(texImage->RowStride, blocks, i, j, &r, 2);
+   _fetch_texel_rgtc_s(texImage->RowStride, blocks + 8, i, j, &g, 2);
+   texel[RCOMP] = BYTE_TO_FLOAT_TEX(r);
+   texel[GCOMP] = BYTE_TO_FLOAT_TEX(g);
+   texel[BCOMP] = 0.0;
+   texel[ACOMP] = 1.0;
+}
+
+/* Emit one 8-byte block: two endpoint bytes, then sixteen 3-bit codes packed LSB first. */
+static void write_rgtc_encoded_channel(GLubyte *blkaddr,
+                                       GLubyte alphabase1, GLubyte alphabase2,
+                                       GLubyte alphaenc[16])
+{
+   blkaddr[0] = alphabase1;
+   blkaddr[1] = alphabase2;
+   blkaddr[2] = alphaenc[0] | (alphaenc[1] << 3) | ((alphaenc[2] & 3) << 6);
+   blkaddr[3] = (alphaenc[2] >> 2) | (alphaenc[3] << 1) | (alphaenc[4] << 4) | ((alphaenc[5] & 1) << 7);
+   blkaddr[4] = (alphaenc[5] >> 1) | (alphaenc[6] << 2) | (alphaenc[7] << 5);
+   blkaddr[5] = alphaenc[8] | (alphaenc[9] << 3) | ((alphaenc[10] & 3) << 6);
+   blkaddr[6] = (alphaenc[10] >> 2) | (alphaenc[11] << 1) | (alphaenc[12] << 4) | ((alphaenc[13] & 1) << 7);
+   blkaddr[7] = (alphaenc[13] >> 1) | (alphaenc[14] << 2) | (alphaenc[15] << 5);
+}
+
+/* Encode one unsigned channel of a (possibly partial) 4x4 tile into an 8-byte RGTC block. */
+static void encode_rgtc_chan_u(GLubyte *blkaddr, GLubyte srccolors[4][4],
+                               GLint numxpixels, GLint numypixels)
+{
+   GLubyte alphabase[2], alphause[2];
+   GLshort alphatest[2] = { 0 };
+   GLuint alphablockerror1, alphablockerror2, alphablockerror3;
+   GLubyte i, j, aindex, acutValues[7];
+   /* zeroed so texels outside a partial tile pack as code 0, not stack garbage */
+   GLubyte alphaenc1[16] = { 0 }, alphaenc2[16] = { 0 }, alphaenc3[16] = { 0 };
+   GLboolean alphaabsmin = GL_FALSE;
+   GLboolean alphaabsmax = GL_FALSE;
+   GLshort alphadist;
+
+   /* find lowest and highest alpha value in block, alphabase[0] lowest, alphabase[1] highest */
+   alphabase[0] = 0xff; alphabase[1] = 0x0;
+   for (j = 0; j < numypixels; j++) {
+      for (i = 0; i < numxpixels; i++) {
+         if (srccolors[j][i] == 0)
+            alphaabsmin = GL_TRUE;
+         else if (srccolors[j][i] == 255)
+            alphaabsmax = GL_TRUE;
+         else {
+            if (srccolors[j][i] > alphabase[1])
+               alphabase[1] = srccolors[j][i];
+            if (srccolors[j][i] < alphabase[0])
+               alphabase[0] = srccolors[j][i];
+         }
+      }
+   }
+
+   if ((alphabase[0] > alphabase[1]) && !(alphaabsmin && alphaabsmax)) { /* one color, either max or min */
+      /* shortcut here since it is a very common case (and also avoids later problems) */
+      /* || (alphabase[0] == alphabase[1] && !alphaabsmin && !alphaabsmax) */
+      /* could also test for alpha0 == alpha1 (and not min/max), but probably not common, so don't bother */
+
+      *blkaddr++ = srccolors[0][0];
+      *blkaddr++ = 0; /* second endpoint is never selected (all codes are 0) but must be defined */
+      *blkaddr++ = 0;
+      *blkaddr++ = 0;
+      *blkaddr++ = 0;
+      *blkaddr++ = 0;
+      *blkaddr++ = 0;
+      *blkaddr++ = 0;
+#if RGTC_DEBUG
+      fprintf(stderr, "enc0 used\n");
+#endif
+      return;
+   }
+
+   /* find best encoding for alpha0 > alpha1 */
+   /* it's possible this encoding is better even if both alphaabsmin and alphaabsmax are true */
+   alphablockerror1 = 0x0;
+   alphablockerror2 = 0xffffffff;
+   alphablockerror3 = 0xffffffff;
+   if (alphaabsmin) alphause[0] = 0;
+   else alphause[0] = alphabase[0];
+   if (alphaabsmax) alphause[1] = 255;
+   else alphause[1] = alphabase[1];
+   /* calculate the 7 cut values, just the middle between 2 of the computed alpha values */
+   for (aindex = 0; aindex < 7; aindex++) {
+      /* don't forget here is always rounded down */
+      acutValues[aindex] = (alphause[0] * (2*aindex + 1) + alphause[1] * (14 - (2*aindex + 1))) / 14;
+   }
+
+   for (j = 0; j < numypixels; j++) {
+      for (i = 0; i < numxpixels; i++) {
+         /* maybe it's overkill to have the most complicated calculation just for the error
+            calculation which we only need to figure out if encoding1 or encoding2 is better... */
+         if (srccolors[j][i] > acutValues[0]) {
+            alphaenc1[4*j + i] = 0;
+            alphadist = srccolors[j][i] - alphause[1];
+         }
+         else if (srccolors[j][i] > acutValues[1]) {
+            alphaenc1[4*j + i] = 2;
+            alphadist = srccolors[j][i] - (alphause[1] * 6 + alphause[0] * 1) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[2]) {
+            alphaenc1[4*j + i] = 3;
+            alphadist = srccolors[j][i] - (alphause[1] * 5 + alphause[0] * 2) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[3]) {
+            alphaenc1[4*j + i] = 4;
+            alphadist = srccolors[j][i] - (alphause[1] * 4 + alphause[0] * 3) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[4]) {
+            alphaenc1[4*j + i] = 5;
+            alphadist = srccolors[j][i] - (alphause[1] * 3 + alphause[0] * 4) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[5]) {
+            alphaenc1[4*j + i] = 6;
+            alphadist = srccolors[j][i] - (alphause[1] * 2 + alphause[0] * 5) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[6]) {
+            alphaenc1[4*j + i] = 7;
+            alphadist = srccolors[j][i] - (alphause[1] * 1 + alphause[0] * 6) / 7;
+         }
+         else {
+            alphaenc1[4*j + i] = 1;
+            alphadist = srccolors[j][i] - alphause[0];
+         }
+         alphablockerror1 += alphadist * alphadist;
+      }
+   }
+
+#if RGTC_DEBUG
+   for (i = 0; i < 16; i++) {
+      fprintf(stderr, "%d ", alphaenc1[i]);
+   }
+   fprintf(stderr, "cutVals ");
+   for (i = 0; i < 7; i++) {
+      fprintf(stderr, "%d ", acutValues[i]);
+   }
+   fprintf(stderr, "srcVals ");
+   for (j = 0; j < numypixels; j++) {
+      for (i = 0; i < numxpixels; i++) {
+         fprintf(stderr, "%d ", srccolors[j][i]);
+      }
+   }
+   fprintf(stderr, "\n");
+#endif
+
+   /* it's not very likely this encoding is better if both alphaabsmin and alphaabsmax
+      are false but try it anyway */
+   if (alphablockerror1 >= 32) {
+
+      /* don't bother if encoding is already very good, this condition should also imply
+      we have valid alphabase colors which we absolutely need (alphabase[0] <= alphabase[1]) */
+      alphablockerror2 = 0;
+      for (aindex = 0; aindex < 5; aindex++) {
+         /* don't forget here is always rounded down */
+         acutValues[aindex] = (alphabase[0] * (10 - (2*aindex + 1)) + alphabase[1] * (2*aindex + 1)) / 10;
+      }
+      for (j = 0; j < numypixels; j++) {
+         for (i = 0; i < numxpixels; i++) {
+             /* maybe it's overkill to have the most complicated calculation just for the error
+               calculation which we only need to figure out if encoding1 or encoding2 is better... */
+            if (srccolors[j][i] == 0) {
+               alphaenc2[4*j + i] = 6;
+               alphadist = 0;
+            }
+            else if (srccolors[j][i] == 255) {
+               alphaenc2[4*j + i] = 7;
+               alphadist = 0;
+            }
+            else if (srccolors[j][i] <= acutValues[0]) {
+               alphaenc2[4*j + i] = 0;
+               alphadist = srccolors[j][i] - alphabase[0];
+            }
+            else if (srccolors[j][i] <= acutValues[1]) {
+               alphaenc2[4*j + i] = 2;
+               alphadist = srccolors[j][i] - (alphabase[0] * 4 + alphabase[1] * 1) / 5;
+            }
+            else if (srccolors[j][i] <= acutValues[2]) {
+               alphaenc2[4*j + i] = 3;
+               alphadist = srccolors[j][i] - (alphabase[0] * 3 + alphabase[1] * 2) / 5;
+            }
+            else if (srccolors[j][i] <= acutValues[3]) {
+               alphaenc2[4*j + i] = 4;
+               alphadist = srccolors[j][i] - (alphabase[0] * 2 + alphabase[1] * 3) / 5;
+            }
+            else if (srccolors[j][i] <= acutValues[4]) {
+               alphaenc2[4*j + i] = 5;
+               alphadist = srccolors[j][i] - (alphabase[0] * 1 + alphabase[1] * 4) / 5;
+            }
+            else {
+               alphaenc2[4*j + i] = 1;
+               alphadist = srccolors[j][i] - alphabase[1];
+            }
+            alphablockerror2 += alphadist * alphadist;
+         }
+      }
+
+      /* skip this if the error is already very small
+         this encoding is MUCH better on average than #2 though, but expensive! */
+      if ((alphablockerror2 > 96) && (alphablockerror1 > 96)) {
+         GLshort blockerrlin1 = 0;
+         GLshort blockerrlin2 = 0;
+         GLubyte nralphainrangelow = 0;
+         GLubyte nralphainrangehigh = 0;
+         alphatest[0] = 0xff;
+         alphatest[1] = 0x0;
+         /* if we have large range it's likely there are values close to 0/255, try to map them to 0/255 */
+         for (j = 0; j < numypixels; j++) {
+            for (i = 0; i < numxpixels; i++) {
+               if ((srccolors[j][i] > alphatest[1]) && (srccolors[j][i] < (255 -(alphabase[1] - alphabase[0]) / 28)))
+                  alphatest[1] = srccolors[j][i];
+               if ((srccolors[j][i] < alphatest[0]) && (srccolors[j][i] > (alphabase[1] - alphabase[0]) / 28))
+                  alphatest[0] = srccolors[j][i];
+            }
+         }
+         /* shouldn't happen too often, don't really care about those degenerated cases */
+         if (alphatest[1] <= alphatest[0]) {
+            alphatest[0] = 1;
+            alphatest[1] = 254;
+         }
+         for (aindex = 0; aindex < 5; aindex++) {
+         /* don't forget here is always rounded down */
+            acutValues[aindex] = (alphatest[0] * (10 - (2*aindex + 1)) + alphatest[1] * (2*aindex + 1)) / 10;
+         }
+
+         /* find the "average" difference between the alpha values and the next encoded value.
+            This is then used to calculate new base values.
+            Should there be some weighting, i.e. those values closer to alphatest[x] have more weight,
+            since they will see more improvement, and also because the values in the middle are somewhat
+            likely to get no improvement at all (because the base values might move in different directions)?
+            OTOH it would mean the values in the middle are even less likely to get an improvement
+         */
+         for (j = 0; j < numypixels; j++) {
+            for (i = 0; i < numxpixels; i++) {
+               if (srccolors[j][i] <= alphatest[0] / 2) {
+               }
+               else if (srccolors[j][i] > ((255 + alphatest[1]) / 2)) {
+               }
+               else if (srccolors[j][i] <= acutValues[0]) {
+                  blockerrlin1 += (srccolors[j][i] - alphatest[0]);
+                  nralphainrangelow += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[1]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 4 + alphatest[1] * 1) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 4 + alphatest[1] * 1) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[2]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 3 + alphatest[1] * 2) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 3 + alphatest[1] * 2) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[3]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 2 + alphatest[1] * 3) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 2 + alphatest[1] * 3) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[4]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 1 + alphatest[1] * 4) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 1 + alphatest[1] * 4) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else {
+                  blockerrlin2 += (srccolors[j][i] - alphatest[1]);
+                  nralphainrangehigh += 1;
+               }
+            }
+         }
+         /* shouldn't happen often, needed to avoid div by zero */
+         if (nralphainrangelow == 0) nralphainrangelow = 1;
+         if (nralphainrangehigh == 0) nralphainrangehigh = 1;
+         alphatest[0] = alphatest[0] + (blockerrlin1 / nralphainrangelow);
+#if RGTC_DEBUG
+         fprintf(stderr, "block err lin low %d, nr %d\n", blockerrlin1, nralphainrangelow);
+         fprintf(stderr, "block err lin high %d, nr %d\n", blockerrlin2, nralphainrangehigh);
+#endif
+         /* again shouldn't really happen often... */
+         if (alphatest[0] < 0) {
+            alphatest[0] = 0;
+         }
+         alphatest[1] = alphatest[1] + (blockerrlin2 / nralphainrangehigh);
+         if (alphatest[1] > 255) {
+            alphatest[1] = 255;
+         }
+
+         alphablockerror3 = 0;
+         for (aindex = 0; aindex < 5; aindex++) {
+         /* don't forget here is always rounded down */
+            acutValues[aindex] = (alphatest[0] * (10 - (2*aindex + 1)) + alphatest[1] * (2*aindex + 1)) / 10;
+         }
+         for (j = 0; j < numypixels; j++) {
+            for (i = 0; i < numxpixels; i++) {
+                /* maybe it's overkill to have the most complicated calculation just for the error
+                  calculation which we only need to figure out if encoding1 or encoding2 is better... */
+               if (srccolors[j][i] <= alphatest[0] / 2) {
+                  alphaenc3[4*j + i] = 6;
+                  alphadist = srccolors[j][i];
+               }
+               else if (srccolors[j][i] > ((255 + alphatest[1]) / 2)) {
+                  alphaenc3[4*j + i] = 7;
+                  alphadist = 255 - srccolors[j][i];
+               }
+               else if (srccolors[j][i] <= acutValues[0]) {
+                  alphaenc3[4*j + i] = 0;
+                  alphadist = srccolors[j][i] - alphatest[0];
+               }
+               else if (srccolors[j][i] <= acutValues[1]) {
+                  alphaenc3[4*j + i] = 2;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 4 + alphatest[1] * 1) / 5;
+               }
+               else if (srccolors[j][i] <= acutValues[2]) {
+                  alphaenc3[4*j + i] = 3;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 3 + alphatest[1] * 2) / 5;
+               }
+               else if (srccolors[j][i] <= acutValues[3]) {
+                  alphaenc3[4*j + i] = 4;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 2 + alphatest[1] * 3) / 5;
+               }
+               else if (srccolors[j][i] <= acutValues[4]) {
+                  alphaenc3[4*j + i] = 5;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 1 + alphatest[1] * 4) / 5;
+               }
+               else {
+                  alphaenc3[4*j + i] = 1;
+                  alphadist = srccolors[j][i] - alphatest[1];
+               }
+               alphablockerror3 += alphadist * alphadist;
+            }
+         }
+      }
+   }
+  /* write the alpha values and encoding back. */
+   if ((alphablockerror1 <= alphablockerror2) && (alphablockerror1 <= alphablockerror3)) {
+#if RGTC_DEBUG
+      if (alphablockerror1 > 96) fprintf(stderr, "enc1 used, error %d\n", alphablockerror1);
+#endif
+      write_rgtc_encoded_channel( blkaddr, alphause[1], alphause[0], alphaenc1 );
+   }
+   else if (alphablockerror2 <= alphablockerror3) {
+#if RGTC_DEBUG
+      if (alphablockerror2 > 96) fprintf(stderr, "enc2 used, error %d\n", alphablockerror2);
+#endif
+      write_rgtc_encoded_channel( blkaddr, alphabase[0], alphabase[1], alphaenc2 );
+   }
+   else {
+#if RGTC_DEBUG
+      fprintf(stderr, "enc3 used, error %d\n", alphablockerror3);
+#endif
+      write_rgtc_encoded_channel( blkaddr, (GLubyte)alphatest[0], (GLubyte)alphatest[1], alphaenc3 );
+   }
+}
+
+
+/* Store the two endpoint bytes, then the sixteen selectors (three bits each,
+ * lowest bits first) packed into six bytes: eight bytes in total at blkaddr. */
+static void write_rgtc_encoded_channel_s(GLbyte *blkaddr, GLbyte alphabase1,
+					 GLbyte alphabase2, GLbyte alphaenc[16])
+{
+   blkaddr[0] = alphabase1;
+   blkaddr[1] = alphabase2;
+   blkaddr[2] = alphaenc[0] | (alphaenc[1] << 3) | ((alphaenc[2] & 3) << 6);
+   blkaddr[3] = (alphaenc[2] >> 2) | (alphaenc[3] << 1) | (alphaenc[4] << 4) | ((alphaenc[5] & 1) << 7);
+   blkaddr[4] = (alphaenc[5] >> 1) | (alphaenc[6] << 2) | (alphaenc[7] << 5);
+   blkaddr[5] = alphaenc[8] | (alphaenc[9] << 3) | ((alphaenc[10] & 3) << 6);
+   blkaddr[6] = (alphaenc[10] >> 2) | (alphaenc[11] << 1) | (alphaenc[12] << 4) | ((alphaenc[13] & 1) << 7);
+   blkaddr[7] = (alphaenc[13] >> 1) | (alphaenc[14] << 2) | (alphaenc[15] << 5);
+}
+
+/*
+ * Encode one 4x4 block of a signed 8-bit channel into an 8-byte RGTC block.
+ *
+ * blkaddr receives the block, srccolors holds the texels as [row][column] and
+ * numxpixels/numypixels say how many columns/rows of srccolors are valid.
+ * Up to three candidate encodings are tried and the one with the smallest
+ * squared error is written.  All limits use the signed range -128..127.
+ */
+static void encode_rgtc_chan_s(GLbyte *blkaddr, GLbyte srccolors[4][4],
+			       GLint numxpixels, GLint numypixels)
+{
+   GLbyte alphabase[2], alphause[2];
+   GLshort alphatest[2] = { 0 };
+   GLuint alphablockerror1, alphablockerror2, alphablockerror3;
+   GLbyte i, j, aindex, acutValues[7];
+   GLbyte alphaenc1[16], alphaenc2[16], alphaenc3[16];
+   GLboolean alphaabsmin = GL_FALSE;
+   GLboolean alphaabsmax = GL_FALSE;
+   GLshort alphadist;
+
+   /* find lowest and highest alpha value in block, alphabase[0] lowest, alphabase[1] highest */
+   alphabase[0] = 127; alphabase[1] = -128;
+   for (j = 0; j < numypixels; j++) {
+      for (i = 0; i < numxpixels; i++) {
+         if (srccolors[j][i] == -128)
+            alphaabsmin = GL_TRUE;
+         else if (srccolors[j][i] == 127)
+            alphaabsmax = GL_TRUE;
+         else {
+            if (srccolors[j][i] > alphabase[1])
+               alphabase[1] = srccolors[j][i];
+            if (srccolors[j][i] < alphabase[0])
+               alphabase[0] = srccolors[j][i];
+         }
+      }
+   }
+
+   if ((alphabase[0] > alphabase[1]) && !(alphaabsmin && alphaabsmax)) { /* one color, either max or min */
+      /* shortcut here since it is a very common case (and also avoids later problems) */
+      /* could also test for alpha0 == alpha1 (and not min/max), but probably not common, so don't bother */
+      /* store the color in both endpoints so that no byte of the block is left unwritten */
+      blkaddr[0] = srccolors[0][0];
+      blkaddr[1] = srccolors[0][0];
+      for (i = 2; i < 8; i++)
+         blkaddr[i] = 0;
+#if RGTC_DEBUG
+      fprintf(stderr, "enc0 used\n");
+#endif
+      return;
+   }
+
+   /* find best encoding for alpha0 > alpha1 */
+   /* it's possible this encoding is better even if both alphaabsmin and alphaabsmax are true */
+   alphablockerror1 = 0x0;
+   alphablockerror2 = 0xffffffff;
+   alphablockerror3 = 0xffffffff;
+   if (alphaabsmin) alphause[0] = -128;
+   else alphause[0] = alphabase[0];
+   if (alphaabsmax) alphause[1] = 127;
+   else alphause[1] = alphabase[1];
+   /* calculate the 7 cut values, just the middle between 2 of the computed alpha values */
+   for (aindex = 0; aindex < 7; aindex++) {
+      /* integer division, so the result is truncated */
+      acutValues[aindex] = (alphause[0] * (2*aindex + 1) + alphause[1] * (14 - (2*aindex + 1))) / 14;
+   }
+
+   for (j = 0; j < numypixels; j++) {
+      for (i = 0; i < numxpixels; i++) {
+         /* maybe it's overkill to have the most complicated calculation just for the error
+            calculation which we only need to figure out if encoding1 or encoding2 is better... */
+         if (srccolors[j][i] > acutValues[0]) {
+            alphaenc1[4*j + i] = 0;
+            alphadist = srccolors[j][i] - alphause[1];
+         }
+         else if (srccolors[j][i] > acutValues[1]) {
+            alphaenc1[4*j + i] = 2;
+            alphadist = srccolors[j][i] - (alphause[1] * 6 + alphause[0] * 1) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[2]) {
+            alphaenc1[4*j + i] = 3;
+            alphadist = srccolors[j][i] - (alphause[1] * 5 + alphause[0] * 2) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[3]) {
+            alphaenc1[4*j + i] = 4;
+            alphadist = srccolors[j][i] - (alphause[1] * 4 + alphause[0] * 3) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[4]) {
+            alphaenc1[4*j + i] = 5;
+            alphadist = srccolors[j][i] - (alphause[1] * 3 + alphause[0] * 4) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[5]) {
+            alphaenc1[4*j + i] = 6;
+            alphadist = srccolors[j][i] - (alphause[1] * 2 + alphause[0] * 5) / 7;
+         }
+         else if (srccolors[j][i] > acutValues[6]) {
+            alphaenc1[4*j + i] = 7;
+            alphadist = srccolors[j][i] - (alphause[1] * 1 + alphause[0] * 6) / 7;
+         }
+         else {
+            alphaenc1[4*j + i] = 1;
+            alphadist = srccolors[j][i] - alphause[0];
+         }
+         alphablockerror1 += alphadist * alphadist;
+      }
+   }
+#if RGTC_DEBUG
+   for (i = 0; i < 16; i++) {
+      fprintf(stderr, "%d ", alphaenc1[i]);
+   }
+   fprintf(stderr, "cutVals ");
+   for (i = 0; i < 7; i++) {
+      fprintf(stderr, "%d ", acutValues[i]);
+   }
+   fprintf(stderr, "srcVals ");
+   for (j = 0; j < numypixels; j++)
+      for (i = 0; i < numxpixels; i++) {
+         fprintf(stderr, "%d ", srccolors[j][i]);
+      }
+   fprintf(stderr, "\n");
+#endif
+
+   /* it's not very likely this encoding is better if both alphaabsmin and alphaabsmax
+      are false but try it anyway */
+   if (alphablockerror1 >= 32) {
+      /* don't bother if encoding is already very good, this condition should also imply
+      we have valid alphabase colors which we absolutely need (alphabase[0] <= alphabase[1]) */
+      alphablockerror2 = 0;
+      for (aindex = 0; aindex < 5; aindex++) {
+         /* integer division, so the result is truncated */
+         acutValues[aindex] = (alphabase[0] * (10 - (2*aindex + 1)) + alphabase[1] * (2*aindex + 1)) / 10;
+      }
+      for (j = 0; j < numypixels; j++) {
+         for (i = 0; i < numxpixels; i++) {
+             /* maybe it's overkill to have the most complicated calculation just for the error
+               calculation which we only need to figure out if encoding1 or encoding2 is better... */
+            if (srccolors[j][i] == -128) {
+               alphaenc2[4*j + i] = 6;
+               alphadist = 0;
+            }
+            else if (srccolors[j][i] == 127) {
+               alphaenc2[4*j + i] = 7;
+               alphadist = 0;
+            }
+            else if (srccolors[j][i] <= acutValues[0]) {
+               alphaenc2[4*j + i] = 0;
+               alphadist = srccolors[j][i] - alphabase[0];
+            }
+            else if (srccolors[j][i] <= acutValues[1]) {
+               alphaenc2[4*j + i] = 2;
+               alphadist = srccolors[j][i] - (alphabase[0] * 4 + alphabase[1] * 1) / 5;
+            }
+            else if (srccolors[j][i] <= acutValues[2]) {
+               alphaenc2[4*j + i] = 3;
+               alphadist = srccolors[j][i] - (alphabase[0] * 3 + alphabase[1] * 2) / 5;
+            }
+            else if (srccolors[j][i] <= acutValues[3]) {
+               alphaenc2[4*j + i] = 4;
+               alphadist = srccolors[j][i] - (alphabase[0] * 2 + alphabase[1] * 3) / 5;
+            }
+            else if (srccolors[j][i] <= acutValues[4]) {
+               alphaenc2[4*j + i] = 5;
+               alphadist = srccolors[j][i] - (alphabase[0] * 1 + alphabase[1] * 4) / 5;
+            }
+            else {
+               alphaenc2[4*j + i] = 1;
+               alphadist = srccolors[j][i] - alphabase[1];
+            }
+            alphablockerror2 += alphadist * alphadist;
+         }
+      }
+
+
+      /* skip this if the error is already very small
+         this encoding is MUCH better on average than #2 though, but expensive! */
+      if ((alphablockerror2 > 96) && (alphablockerror1 > 96)) {
+         GLshort blockerrlin1 = 0;
+         GLshort blockerrlin2 = 0;
+         GLubyte nralphainrangelow = 0;
+         GLubyte nralphainrangehigh = 0;
+         alphatest[0] = 127;
+         alphatest[1] = -128;
+         /* if we have large range it's likely there are values close to -128/127, try to map them to -128/127 */
+         for (j = 0; j < numypixels; j++) {
+            for (i = 0; i < numxpixels; i++) {
+               if ((srccolors[j][i] > alphatest[1]) && (srccolors[j][i] < (127 - (alphabase[1] - alphabase[0]) / 28)))
+                  alphatest[1] = srccolors[j][i];
+               if ((srccolors[j][i] < alphatest[0]) && (srccolors[j][i] > (-128 + (alphabase[1] - alphabase[0]) / 28)))
+                  alphatest[0] = srccolors[j][i];
+            }
+         }
+         /* shouldn't happen too often, don't really care about those degenerated cases */
+         if (alphatest[1] <= alphatest[0]) {
+            alphatest[0] = -127;
+            alphatest[1] = 126;
+         }
+         for (aindex = 0; aindex < 5; aindex++) {
+            /* integer division, so the result is truncated */
+            acutValues[aindex] = (alphatest[0] * (10 - (2*aindex + 1)) + alphatest[1] * (2*aindex + 1)) / 10;
+         }
+
+         /* find the "average" difference between the alpha values and the next encoded value.
+            This is then used to calculate new base values.
+            Should there be some weighting, i.e. those values closer to alphatest[x] have more weight,
+            since they will see more improvement, and also because the values in the middle are somewhat
+            likely to get no improvement at all (because the base values might move in different directions)?
+            OTOH it would mean the values in the middle are even less likely to get an improvement
+         */
+         for (j = 0; j < numypixels; j++) {
+            for (i = 0; i < numxpixels; i++) {
+               if (srccolors[j][i] <= (-128 + alphatest[0]) / 2) {
+               }
+               else if (srccolors[j][i] > ((127 + alphatest[1]) / 2)) {
+               }
+               else if (srccolors[j][i] <= acutValues[0]) {
+                  blockerrlin1 += (srccolors[j][i] - alphatest[0]);
+                  nralphainrangelow += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[1]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 4 + alphatest[1] * 1) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 4 + alphatest[1] * 1) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[2]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 3 + alphatest[1] * 2) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 3 + alphatest[1] * 2) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[3]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 2 + alphatest[1] * 3) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 2 + alphatest[1] * 3) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else if (srccolors[j][i] <= acutValues[4]) {
+                  blockerrlin1 += (srccolors[j][i] - (alphatest[0] * 1 + alphatest[1] * 4) / 5);
+                  blockerrlin2 += (srccolors[j][i] - (alphatest[0] * 1 + alphatest[1] * 4) / 5);
+                  nralphainrangelow += 1;
+                  nralphainrangehigh += 1;
+               }
+               else {
+                  blockerrlin2 += (srccolors[j][i] - alphatest[1]);
+                  nralphainrangehigh += 1;
+               }
+            }
+         }
+         /* shouldn't happen often, needed to avoid div by zero */
+         if (nralphainrangelow == 0) nralphainrangelow = 1;
+         if (nralphainrangehigh == 0) nralphainrangehigh = 1;
+         alphatest[0] = alphatest[0] + (blockerrlin1 / nralphainrangelow);
+#if RGTC_DEBUG
+         fprintf(stderr, "block err lin low %d, nr %d\n", blockerrlin1, nralphainrangelow);
+         fprintf(stderr, "block err lin high %d, nr %d\n", blockerrlin2, nralphainrangehigh);
+#endif
+         /* again shouldn't really happen often... */
+         if (alphatest[0] < -128) {
+            alphatest[0] = -128;
+         }
+         alphatest[1] = alphatest[1] + (blockerrlin2 / nralphainrangehigh);
+         if (alphatest[1] > 127) {
+            alphatest[1] = 127;
+         }
+
+         alphablockerror3 = 0;
+         for (aindex = 0; aindex < 5; aindex++) {
+            /* integer division, so the result is truncated */
+            acutValues[aindex] = (alphatest[0] * (10 - (2*aindex + 1)) + alphatest[1] * (2*aindex + 1)) / 10;
+         }
+         for (j = 0; j < numypixels; j++) {
+            for (i = 0; i < numxpixels; i++) {
+                /* maybe it's overkill to have the most complicated calculation just for the error
+                  calculation which we only need to figure out if encoding1 or encoding2 is better... */
+               if (srccolors[j][i] <= (-128 + alphatest[0]) / 2) {
+                  alphaenc3[4*j + i] = 6;
+                  alphadist = srccolors[j][i] + 128;
+               }
+               else if (srccolors[j][i] > ((127 + alphatest[1]) / 2)) {
+                  alphaenc3[4*j + i] = 7;
+                  alphadist = 127 - srccolors[j][i];
+               }
+               else if (srccolors[j][i] <= acutValues[0]) {
+                  alphaenc3[4*j + i] = 0;
+                  alphadist = srccolors[j][i] - alphatest[0];
+               }
+               else if (srccolors[j][i] <= acutValues[1]) {
+                  alphaenc3[4*j + i] = 2;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 4 + alphatest[1] * 1) / 5;
+               }
+               else if (srccolors[j][i] <= acutValues[2]) {
+                  alphaenc3[4*j + i] = 3;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 3 + alphatest[1] * 2) / 5;
+               }
+               else if (srccolors[j][i] <= acutValues[3]) {
+                  alphaenc3[4*j + i] = 4;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 2 + alphatest[1] * 3) / 5;
+               }
+               else if (srccolors[j][i] <= acutValues[4]) {
+                  alphaenc3[4*j + i] = 5;
+                  alphadist = srccolors[j][i] - (alphatest[0] * 1 + alphatest[1] * 4) / 5;
+               }
+               else {
+                  alphaenc3[4*j + i] = 1;
+                  alphadist = srccolors[j][i] - alphatest[1];
+               }
+               alphablockerror3 += alphadist * alphadist;
+            }
+         }
+      }
+   }
+  /* write the alpha values and encoding back. */
+   if ((alphablockerror1 <= alphablockerror2) && (alphablockerror1 <= alphablockerror3)) {
+#if RGTC_DEBUG
+      if (alphablockerror1 > 96) fprintf(stderr, "enc1 used, error %d\n", alphablockerror1);
+#endif
+      write_rgtc_encoded_channel_s( blkaddr, alphause[1], alphause[0], alphaenc1 );
+   }
+   else if (alphablockerror2 <= alphablockerror3) {
+#if RGTC_DEBUG
+      if (alphablockerror2 > 96) fprintf(stderr, "enc2 used, error %d\n", alphablockerror2);
+#endif
+      write_rgtc_encoded_channel_s( blkaddr, alphabase[0], alphabase[1], alphaenc2 );
+   }
+   else {
+#if RGTC_DEBUG
+      fprintf(stderr, "enc3 used, error %d\n", alphablockerror3);
+#endif
+      write_rgtc_encoded_channel_s( blkaddr, (GLbyte)alphatest[0], (GLbyte)alphatest[1], alphaenc3 );
+   }
+}
diff --git a/src/mesa/main/texcompress_rgtc.h b/src/mesa/main/texcompress_rgtc.h
new file mode 100644
index 00000000000..424edc4581c
--- /dev/null
+++ b/src/mesa/main/texcompress_rgtc.h
@@ -0,0 +1,60 @@
+/*
+ * Copyright (C) 2011 Red Hat Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#ifndef TEXCOMPRESS_RGTC_H
+#define TEXCOMPRESS_RGTC_H
+
+#include "glheader.h"
+#include "mfeatures.h"
+#include "texstore.h"
+
+struct gl_texture_image;
+
+extern GLboolean
+_mesa_texstore_red_rgtc1(TEXSTORE_PARAMS);
+
+extern GLboolean
+_mesa_texstore_signed_red_rgtc1(TEXSTORE_PARAMS);
+
+extern GLboolean
+_mesa_texstore_rg_rgtc2(TEXSTORE_PARAMS);
+
+extern GLboolean
+_mesa_texstore_signed_rg_rgtc2(TEXSTORE_PARAMS);
+
+extern void
+_mesa_fetch_texel_2d_f_red_rgtc1(const struct gl_texture_image *texImage,
+				 GLint i, GLint j, GLint k, GLfloat *texel);
+
+extern void
+_mesa_fetch_texel_2d_f_signed_red_rgtc1(const struct gl_texture_image *texImage,
+					GLint i, GLint j, GLint k, GLfloat *texel);
+
+extern void
+_mesa_fetch_texel_2d_f_rg_rgtc2(const struct gl_texture_image *texImage,
+				 GLint i, GLint j, GLint k, GLfloat *texel);
+
+extern void
+_mesa_fetch_texel_2d_f_signed_rg_rgtc2(const struct gl_texture_image *texImage,
+				       GLint i, GLint j, GLint k, GLfloat *texel);
+#endif
diff --git a/src/mesa/main/texfetch.c b/src/mesa/main/texfetch.c
index 8aa1e4970d5..550597e1cdf 100644
--- a/src/mesa/main/texfetch.c
+++ b/src/mesa/main/texfetch.c
@@ -38,6 +38,7 @@
 #include "texcompress.h"
 #include "texcompress_fxt1.h"
 #include "texcompress_s3tc.h"
+#include "texcompress_rgtc.h"
 #include "texfetch.h"
 #include "teximage.h"
 
@@ -756,7 +757,35 @@ texfetch_funcs[MESA_FORMAT_COUNT] =
       fetch_texel_2d_rgba_16,
       fetch_texel_3d_rgba_16,
       store_texel_rgba_16
-   }
+   },
+   {
+      MESA_FORMAT_RED_RGTC1,
+      NULL,
+      _mesa_fetch_texel_2d_f_red_rgtc1,
+      NULL,
+      NULL
+   },
+   {
+      MESA_FORMAT_SIGNED_RED_RGTC1,
+      NULL,
+      _mesa_fetch_texel_2d_f_signed_red_rgtc1,
+      NULL,
+      NULL
+   },
+   {
+      MESA_FORMAT_RG_RGTC2,
+      NULL,
+      _mesa_fetch_texel_2d_f_rg_rgtc2,
+      NULL,
+      NULL
+   },
+   {
+      MESA_FORMAT_SIGNED_RG_RGTC2,
+      NULL,
+      _mesa_fetch_texel_2d_f_signed_rg_rgtc2,
+      NULL,
+      NULL
+   },
 };
 
 
diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c
index 2542cea856b..72025cf828e 100644
--- a/src/mesa/main/texformat.c
+++ b/src/mesa/main/texformat.c
@@ -602,6 +602,25 @@ _mesa_choose_tex_format( struct gl_context *ctx, GLint internalFormat,
       }
    }
 
+   if (ctx->Extensions.ARB_texture_compression_rgtc) {
+      switch (internalFormat) {
+         case GL_COMPRESSED_RED_RGTC1:
+	    RETURN_IF_SUPPORTED(MESA_FORMAT_RED_RGTC1);
+	    break;
+         case GL_COMPRESSED_SIGNED_RED_RGTC1:
+	    RETURN_IF_SUPPORTED(MESA_FORMAT_SIGNED_RED_RGTC1);
+	    break;
+         case GL_COMPRESSED_RG_RGTC2:
+	    RETURN_IF_SUPPORTED(MESA_FORMAT_RG_RGTC2);
+	    break;
+         case GL_COMPRESSED_SIGNED_RG_RGTC2:
+	    RETURN_IF_SUPPORTED(MESA_FORMAT_SIGNED_RG_RGTC2);
+	    break;
+         default:
+            ; /* fallthrough */
+      }
+   }
+
    _mesa_problem(ctx, "unexpected format in _mesa_choose_tex_format()");
    return MESA_FORMAT_NONE;
 }
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 7dd4a1fa650..8a3e5f77979 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -65,6 +65,7 @@
 #include "pack.h"
 #include "texcompress.h"
 #include "texcompress_fxt1.h"
+#include "texcompress_rgtc.h"
 #include "texcompress_s3tc.h"
 #include "teximage.h"
 #include "texstore.h"
@@ -310,15 +311,15 @@ compute_component_mapping(GLenum inFormat, GLenum outFormat,
  * \param srcPacking  source image pixel packing
  * \return resulting image with format = textureBaseFormat and type = GLfloat.
  */
-static GLfloat *
-make_temp_float_image(struct gl_context *ctx, GLuint dims,
-                      GLenum logicalBaseFormat,
-                      GLenum textureBaseFormat,
-                      GLint srcWidth, GLint srcHeight, GLint srcDepth,
-                      GLenum srcFormat, GLenum srcType,
-                      const GLvoid *srcAddr,
-                      const struct gl_pixelstore_attrib *srcPacking,
-                      GLbitfield transferOps)
+GLfloat *
+_mesa_make_temp_float_image(struct gl_context *ctx, GLuint dims,
+			    GLenum logicalBaseFormat,
+			    GLenum textureBaseFormat,
+			    GLint srcWidth, GLint srcHeight, GLint srcDepth,
+			    GLenum srcFormat, GLenum srcType,
+			    const GLvoid *srcAddr,
+			    const struct gl_pixelstore_attrib *srcPacking,
+			    GLbitfield transferOps)
 {
    GLfloat *tempImage;
    const GLint components = _mesa_components_in_format(logicalBaseFormat);
@@ -2065,7 +2066,7 @@ _mesa_texstore_argb2101010(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -2317,7 +2318,7 @@ _mesa_texstore_unorm1616(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -2394,7 +2395,7 @@ _mesa_texstore_unorm16(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -2452,7 +2453,7 @@ _mesa_texstore_rgba_16(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -2519,7 +2520,7 @@ _mesa_texstore_signed_rgba_16(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -2901,7 +2902,7 @@ _mesa_texstore_signed_r8(TEXSTORE_PARAMS)
    /* XXX look at adding optimized paths */
    {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -2946,7 +2947,7 @@ _mesa_texstore_signed_rg88(TEXSTORE_PARAMS)
    /* XXX look at adding optimized paths */
    {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -2991,7 +2992,7 @@ _mesa_texstore_signed_rgbx8888(TEXSTORE_PARAMS)
 
    {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -3104,7 +3105,7 @@ _mesa_texstore_signed_rgba8888(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -3413,7 +3414,7 @@ _mesa_texstore_rgba_float32(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -3483,7 +3484,7 @@ _mesa_texstore_rgba_float16(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -3549,7 +3550,7 @@ _mesa_texstore_rgba_int8(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -3614,7 +3615,7 @@ _mesa_texstore_rgba_int16(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -3679,7 +3680,7 @@ _mesa_texstore_rgba_int32(TEXSTORE_PARAMS)
    }
    else {
       /* general path */
-      const GLfloat *tempImage = make_temp_float_image(ctx, dims,
+      const GLfloat *tempImage = _mesa_make_temp_float_image(ctx, dims,
                                                  baseInternalFormat,
                                                  baseFormat,
                                                  srcWidth, srcHeight, srcDepth,
@@ -4128,7 +4129,12 @@ texstore_funcs[MESA_FORMAT_COUNT] =
    { MESA_FORMAT_SIGNED_RG_16, _mesa_texstore_signed_rgba_16 },
    { MESA_FORMAT_SIGNED_RGB_16, _mesa_texstore_signed_rgba_16 },
    { MESA_FORMAT_SIGNED_RGBA_16, _mesa_texstore_signed_rgba_16 },
-   { MESA_FORMAT_RGBA_16, _mesa_texstore_rgba_16 }
+   { MESA_FORMAT_RGBA_16, _mesa_texstore_rgba_16 },
+
+   { MESA_FORMAT_RED_RGTC1, _mesa_texstore_red_rgtc1 },
+   { MESA_FORMAT_SIGNED_RED_RGTC1, _mesa_texstore_signed_red_rgtc1 },
+   { MESA_FORMAT_RG_RGTC2, _mesa_texstore_rg_rgtc2 },
+   { MESA_FORMAT_SIGNED_RG_RGTC2, _mesa_texstore_signed_rg_rgtc2 }
 };
 
 
diff --git a/src/mesa/main/texstore.h b/src/mesa/main/texstore.h
index 177ede423f5..2f3c4e821fc 100644
--- a/src/mesa/main/texstore.h
+++ b/src/mesa/main/texstore.h
@@ -81,6 +81,15 @@ _mesa_make_temp_chan_image(struct gl_context *ctx, GLuint dims,
                            const GLvoid *srcAddr,
                            const struct gl_pixelstore_attrib *srcPacking);
 
+GLfloat *
+_mesa_make_temp_float_image(struct gl_context *ctx, GLuint dims,
+			    GLenum logicalBaseFormat,
+			    GLenum textureBaseFormat,
+			    GLint srcWidth, GLint srcHeight, GLint srcDepth,
+			    GLenum srcFormat, GLenum srcType,
+			    const GLvoid *srcAddr,
+			    const struct gl_pixelstore_attrib *srcPacking,
+			    GLbitfield transferOps);
 
 extern void
 _mesa_store_teximage1d(struct gl_context *ctx, GLenum target, GLint level,
diff --git a/src/mesa/sources.mak b/src/mesa/sources.mak
index 9a78a23aa7e..bdf4126cf58 100644
--- a/src/mesa/sources.mak
+++ b/src/mesa/sources.mak
@@ -78,6 +78,7 @@ MAIN_SOURCES = \
 	main/stencil.c \
 	main/syncobj.c \
 	main/texcompress.c \
+	main/texcompress_rgtc.c \
 	main/texcompress_s3tc.c \
 	main/texcompress_fxt1.c \
 	main/texenv.c \
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index 6530a06ade4..c99eafbadf3 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -579,6 +579,7 @@ st_validate_varrays(struct gl_context *ctx,
    if (is_interleaved_arrays(vp, vpv, arrays)) {
       setup_interleaved_attribs(ctx, vp, vpv, arrays, vbuffer, velements,
                                 max_index);
+
       num_vbuffers = 1;
       num_velements = vpv->num_inputs;
       if (num_velements == 0)
@@ -645,6 +646,7 @@ st_draw_vbo(struct gl_context *ctx,
       for (i = 0; i < nr_prims; i++) {
          min_index = MIN2(min_index, prims[i].start);
          max_index = MAX2(max_index, prims[i].start + prims[i].count - 1);
+         max_index = MAX2(max_index, prims[i].num_instances);
       }
    }
 
diff --git a/src/mesa/state_tracker/st_extensions.c b/src/mesa/state_tracker/st_extensions.c
index 2f45f470334..d2098987d1d 100644
--- a/src/mesa/state_tracker/st_extensions.c
+++ b/src/mesa/state_tracker/st_extensions.c
@@ -416,6 +416,22 @@ void st_init_extensions(struct st_context *st)
       ctx->Extensions.S3_s3tc = GL_TRUE;
    }
 
+   /* rgtc support: needs all four RGTC1/RGTC2 formats as sampler views */
+   if (screen->is_format_supported(screen, PIPE_FORMAT_RGTC1_UNORM,
+                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_BIND_SAMPLER_VIEW, 0) &&
+       screen->is_format_supported(screen, PIPE_FORMAT_RGTC1_SNORM,
+                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_BIND_SAMPLER_VIEW, 0) &&
+       screen->is_format_supported(screen, PIPE_FORMAT_RGTC2_UNORM,
+                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_BIND_SAMPLER_VIEW, 0) &&
+       screen->is_format_supported(screen, PIPE_FORMAT_RGTC2_SNORM,
+                                   PIPE_TEXTURE_2D, 0,
+                                   PIPE_BIND_SAMPLER_VIEW, 0)) {
+      ctx->Extensions.ARB_texture_compression_rgtc = GL_TRUE;
+   }
+
    /* ycbcr support */
    if (screen->is_format_supported(screen, PIPE_FORMAT_UYVY, 
                                    PIPE_TEXTURE_2D, 0,
diff --git a/src/mesa/state_tracker/st_format.c b/src/mesa/state_tracker/st_format.c
index 577ee6189bd..c58ec9267dc 100644
--- a/src/mesa/state_tracker/st_format.c
+++ b/src/mesa/state_tracker/st_format.c
@@ -241,6 +241,14 @@ st_mesa_format_to_pipe_format(gl_format mesaFormat)
    case MESA_FORMAT_RGBA_UINT32:
       return PIPE_FORMAT_R32G32B32A32_USCALED;
 
+   case MESA_FORMAT_RED_RGTC1:
+      return PIPE_FORMAT_RGTC1_UNORM;
+   case MESA_FORMAT_SIGNED_RED_RGTC1:
+      return PIPE_FORMAT_RGTC1_SNORM;
+   case MESA_FORMAT_RG_RGTC2:
+      return PIPE_FORMAT_RGTC2_UNORM;
+   case MESA_FORMAT_SIGNED_RG_RGTC2:
+      return PIPE_FORMAT_RGTC2_SNORM;
    default:
       assert(0);
       return PIPE_FORMAT_NONE;
@@ -380,6 +388,15 @@ st_pipe_format_to_mesa_format(enum pipe_format format)
    case PIPE_FORMAT_R32G32B32A32_USCALED:
       return MESA_FORMAT_RGBA_UINT32;
 
+   case PIPE_FORMAT_RGTC1_UNORM:
+      return MESA_FORMAT_RED_RGTC1;
+   case PIPE_FORMAT_RGTC1_SNORM:
+      return MESA_FORMAT_SIGNED_RED_RGTC1;
+   case PIPE_FORMAT_RGTC2_UNORM:
+      return MESA_FORMAT_RG_RGTC2;
+   case PIPE_FORMAT_RGTC2_SNORM:
+      return MESA_FORMAT_SIGNED_RG_RGTC2;
+
    default:
       assert(0);
       return MESA_FORMAT_NONE;
diff --git a/src/mesa/state_tracker/st_mesa_to_tgsi.c b/src/mesa/state_tracker/st_mesa_to_tgsi.c
index 5c68fd78c30..c07739f9d53 100644
--- a/src/mesa/state_tracker/st_mesa_to_tgsi.c
+++ b/src/mesa/state_tracker/st_mesa_to_tgsi.c
@@ -224,9 +224,9 @@ src_register( struct st_translate *t,
 
    case PROGRAM_TEMPORARY:
       assert(index >= 0);
+      assert(index < Elements(t->temps));
       if (ureg_dst_is_undef(t->temps[index]))
          t->temps[index] = ureg_DECL_temporary( t->ureg );
-      assert(index < Elements(t->temps));
       return ureg_src(t->temps[index]);
 
    case PROGRAM_NAMED_PARAM: