89 files changed, 5742 insertions, 4700 deletions
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 9161747fdbf..d185c6b8497 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -41,7 +41,7 @@
 static const char *
 cell_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 6a63a0e6ced..ae4c61efb3b 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -389,22 +389,14 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
    const uint texWidth = pt->width[level];
    const uint texHeight = pt->height[level];
    const uint stride = ct->stride[level];
-   unsigned flags = 0x0;
    unsigned size;
 
    assert(transfer->texture);
 
-   if (transfer->usage != PIPE_TRANSFER_READ) {
-      flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-   }
-
-   if (transfer->usage != PIPE_TRANSFER_WRITE) {
-      flags |= PIPE_BUFFER_USAGE_CPU_READ;
-   }
-
    if (!ct->mapped) {
       /* map now */
-      ct->mapped = pipe_buffer_map(screen, ct->buffer, flags);
+      ct->mapped = pipe_buffer_map(screen, ct->buffer,
+                                   pipe_transfer_buffer_flags(transfer));
    }
 
    /*
@@ -417,8 +409,7 @@ cell_transfer_map(struct pipe_screen *screen, struct pipe_transfer *transfer)
    if (!ctrans->map)
       return NULL; /* out of memory */
 
-   if (transfer->usage == PIPE_TRANSFER_READ ||
-       transfer->usage == PIPE_TRANSFER_READ_WRITE) {
+   if (transfer->usage & PIPE_TRANSFER_READ) {
       /* need to untwiddle the texture to make a linear version */
       const uint bpp = pf_get_size(ct->base.format);
       if (bpp == 4) {
@@ -459,8 +450,7 @@ cell_transfer_unmap(struct pipe_screen *screen,
                                    PIPE_BUFFER_USAGE_CPU_READ);
    }
 
-   if (transfer->usage == PIPE_TRANSFER_WRITE ||
-       transfer->usage == PIPE_TRANSFER_READ_WRITE) {
+   if (transfer->usage & PIPE_TRANSFER_WRITE) {
       /* The user wrote new texture data into the mapped buffer.
        * We need to convert the new linear data into the twiddled/tiled format.
        */
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index b43f7352456..e745f3342d1 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -175,12 +175,19 @@ i915_is_buffer_referenced(struct pipe_context *pipe,
 static void i915_destroy(struct pipe_context *pipe)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
 
    draw_destroy(i915->draw);
    
    if(i915->batch)
       i915->iws->batchbuffer_destroy(i915->batch);
 
+   /* unbind framebuffer */
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, NULL);
+
    FREE(i915);
 }
 
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index b3a7774fd6a..8a3e466c845 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -198,7 +198,7 @@ i915_vbuf_render_map_vertices(struct vbuf_render *render)
    struct intel_winsys *iws = i915->iws;
 
    if (i915->vbo_flushed)
-      debug_printf("%s bad vbo flush occured stalling on hw\n");
+      debug_printf("%s bad vbo flush occured stalling on hw\n", __FUNCTION__);
 
    i915_render->vbo_ptr = iws->buffer_map(iws, i915_render->vbo, TRUE);
 
@@ -389,14 +389,43 @@ i915_vbuf_render_draw_arrays(struct vbuf_render *render,
                              uint nr)
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
+   struct i915_context *i915 = i915_render->i915;
 
    if (i915_render->fallback) {
       draw_arrays_fallback(render, start, nr);
       return;
    }
 
-   /* JB: TODO submit direct cmds */
-   draw_arrays_fallback(render, start, nr);
+   if (i915->dirty)
+      i915_update_derived(i915);
+
+   if (i915->hardware_dirty)
+      i915_emit_hardware_state(i915);
+
+   if (!BEGIN_BATCH(2, 0)) {
+      FLUSH_BATCH(NULL);
+
+      /* Make sure state is re-emitted after a flush:
+       */
+      i915_update_derived(i915);
+      i915_emit_hardware_state(i915);
+      i915->vbo_flushed = 1;
+
+      if (!BEGIN_BATCH(2, 0)) {
+         assert(0);
+         goto out;
+      }
+   }
+
+   OUT_BATCH(_3DPRIMITIVE |
+             PRIM_INDIRECT |
+             PRIM_INDIRECT_SEQUENTIAL |
+             i915_render->hwprim |
+             nr);
+   OUT_BATCH(start); /* Beginning vertex index */
+
+out:
+   return;
 }
 
 /**
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c
index a1dd43c1bcc..c66558c320e 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915simple/i915_screen.c
@@ -46,7 +46,7 @@
 static const char *
 i915_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 static const char *
diff --git a/src/gallium/drivers/i915simple/i915_state.c b/src/gallium/drivers/i915simple/i915_state.c
index 0087dfa410f..7d48e6e84d5 100644
--- a/src/gallium/drivers/i915simple/i915_state.c
+++ b/src/gallium/drivers/i915simple/i915_state.c
@@ -588,9 +588,17 @@ static void i915_set_framebuffer_state(struct pipe_context *pipe,
 				       const struct pipe_framebuffer_state *fb)
 {
    struct i915_context *i915 = i915_context(pipe);
+   int i;
+
    draw_flush(i915->draw);
 
-   i915->framebuffer = *fb; /* struct copy */
+   i915->framebuffer.width = fb->width;
+   i915->framebuffer.height = fb->height;
+   i915->framebuffer.nr_cbufs = fb->nr_cbufs;
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&i915->framebuffer.cbufs[i], fb->cbufs[i]);
+   }
+   pipe_surface_reference(&i915->framebuffer.zsbuf, fb->zsbuf);
 
    i915->dirty |= I915_NEW_FRAMEBUFFER;
 }
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c
index 6a6c6542717..286c9ace8e5 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915simple/i915_texture.c
@@ -165,7 +165,7 @@ i915_scanout_layout(struct i915_texture *tex)
    struct pipe_texture *pt = &tex->base;
 
    if (pt->last_level > 0 || pt->block.size != 4)
-      return 0;
+      return FALSE;
 
    i915_miptree_set_level_info(tex, 0, 1,
                                tex->base.width[0],
@@ -191,6 +191,38 @@ i915_scanout_layout(struct i915_texture *tex)
    return TRUE;
 }
 
+/**
+ * Special case to deal with shared textures.
+ */
+static boolean
+i915_display_target_layout(struct i915_texture *tex)
+{
+   struct pipe_texture *pt = &tex->base;
+
+   if (pt->last_level > 0 || pt->block.size != 4)
+      return FALSE;
+
+   /* fall back to the normal texture layout for small textures */
+   if (tex->base.width[0] < 240)
+      return FALSE;
+
+   i915_miptree_set_level_info(tex, 0, 1,
+                               tex->base.width[0],
+                               tex->base.height[0],
+                               1);
+   i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
+
+   tex->stride = power_of_two(tex->base.nblocksx[0] * pt->block.size);
+   tex->total_nblocksy = round_up(tex->base.nblocksy[0], 8);
+   tex->hw_tiled = INTEL_TILE_X;
+
+   debug_printf("%s size: %d,%d,%d offset %d,%d (0x%x)\n", __FUNCTION__,
+      tex->base.width[0], tex->base.height[0], pt->block.size,
+      tex->stride, tex->total_nblocksy, tex->stride * tex->total_nblocksy);
+
+   return TRUE;
+}
+
 static void
 i915_miptree_layout_2d(struct i915_texture *tex)
 {
@@ -201,6 +233,16 @@ i915_miptree_layout_2d(struct i915_texture *tex)
    unsigned nblocksx = pt->nblocksx[0];
    unsigned nblocksy = pt->nblocksy[0];
 
+   /* used for scanouts that need special layouts */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_PRIMARY)
+      if (i915_scanout_layout(tex))
+         return;
+
+   /* for shared buffers we use something very like scanout */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+      if (i915_display_target_layout(tex))
+         return;
+
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
    tex->total_nblocksy = 0;
 
@@ -351,6 +393,11 @@ i945_miptree_layout_2d(struct i915_texture *tex)
       if (i915_scanout_layout(tex))
          return;
 
+   /* for shared buffers we use something very like scanout */
+   if (tex->base.tex_usage & PIPE_TEXTURE_USAGE_DISPLAY_TARGET)
+      if (i915_display_target_layout(tex))
+         return;
+
    tex->stride = round_up(pt->nblocksx[0] * pt->block.size, 4);
 
    /* May need to adjust pitch to accomodate the placement of
@@ -812,7 +859,7 @@ i915_transfer_map(struct pipe_screen *screen,
    char *map;
    boolean write = FALSE;
 
-   if (transfer->usage != PIPE_TRANSFER_READ)
+   if (transfer->usage & PIPE_TRANSFER_WRITE)
       write = TRUE;
 
    map = iws->buffer_map(iws, tex->buffer, write);
diff --git a/src/gallium/drivers/i965simple/brw_screen.c b/src/gallium/drivers/i965simple/brw_screen.c
index fb68fd624b3..4a84c4db23f 100644
--- a/src/gallium/drivers/i965simple/brw_screen.c
+++ b/src/gallium/drivers/i965simple/brw_screen.c
@@ -39,7 +39,7 @@
 static const char *
 brw_get_vendor( struct pipe_screen *screen )
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.c b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
index 0b115fc9b07..d27ef0de041 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.c
@@ -48,11 +48,13 @@
 #include "util/u_memory.h"
 #include "util/u_debug.h"
 #include "util/u_string.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
 #include "lp_bld_logic.h"
+#include "lp_bld_debug.h"
 #include "lp_bld_arit.h"
 
 
@@ -71,30 +73,28 @@ lp_build_min_simple(struct lp_build_context *bld,
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.min.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.min.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pminu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmins.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pminsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -119,30 +119,28 @@ lp_build_max_simple(struct lp_build_context *bld,
 
    /* TODO: optimize the constant case */
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
       if(type.floating) {
-         if(type.width == 32)
+         if(type.width == 32 && util_cpu_caps.has_sse)
             intrinsic = "llvm.x86.sse.max.ps";
-         if(type.width == 64)
+         if(type.width == 64 && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.max.pd";
       }
       else {
-         if(type.width == 8 && !type.sign)
+         if(type.width == 8 && !type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxu.b";
-         if(type.width == 8 && type.sign)
+         if(type.width == 8 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsb";
-         if(type.width == 16 && !type.sign)
+         if(type.width == 16 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxuw";
-         if(type.width == 16 && type.sign)
+         if(type.width == 16 && type.sign && util_cpu_caps.has_sse2)
             intrinsic = "llvm.x86.sse2.pmaxs.w";
-         if(type.width == 32 && !type.sign)
+         if(type.width == 32 && !type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxud";
-         if(type.width == 32 && type.sign)
+         if(type.width == 32 && type.sign && util_cpu_caps.has_sse4_1)
             intrinsic = "llvm.x86.sse41.pmaxsd";
       }
    }
-#endif
 
    if(intrinsic)
       return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -204,15 +202,14 @@ lp_build_add(struct lp_build_context *bld,
       if(a == bld->one || b == bld->one)
         return bld->one;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.b" : "llvm.x86.sse2.paddus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.padds.w" : "llvm.x86.sse2.paddus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -257,15 +254,14 @@ lp_build_sub(struct lp_build_context *bld,
       if(b == bld->one)
         return bld->zero;
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width * type.length == 128 &&
+      if(util_cpu_caps.has_sse2 &&
+         type.width * type.length == 128 &&
          !type.floating && !type.fixed) {
          if(type.width == 8)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.b" : "llvm.x86.sse2.psubus.b";
          if(type.width == 16)
             intrinsic = type.sign ? "llvm.x86.sse2.psubs.w" : "llvm.x86.sse2.psubus.w";
       }
-#endif
    
       if(intrinsic)
          return lp_build_intrinsic_binary(bld->builder, intrinsic, lp_build_vec_type(bld->type), a, b);
@@ -419,8 +415,7 @@ lp_build_mul(struct lp_build_context *bld,
       return bld->undef;
 
    if(!type.floating && !type.fixed && type.norm) {
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-      if(type.width == 8 && type.length == 16) {
+      if(util_cpu_caps.has_sse2 && type.width == 8 && type.length == 16) {
          LLVMTypeRef i16x8 = LLVMVectorType(LLVMInt16Type(), 8);
          LLVMTypeRef i8x16 = LLVMVectorType(LLVMInt8Type(), 16);
          static LLVMValueRef ml = NULL;
@@ -456,7 +451,6 @@ lp_build_mul(struct lp_build_context *bld,
          
          return ab;
       }
-#endif
 
       /* FIXME */
       assert(0);
@@ -493,10 +487,8 @@ lp_build_div(struct lp_build_context *bld,
    if(LLVMIsConstant(a) && LLVMIsConstant(b))
       return LLVMConstFDiv(a, b);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_mul(bld, a, lp_build_rcp(bld, b));
-#endif
 
    return LLVMBuildFDiv(bld->builder, a, b, "");
 }
@@ -606,8 +598,7 @@ lp_build_abs(struct lp_build_context *bld,
       return a;
    }
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width*type.length == 128) {
+   if(type.width*type.length == 128 && util_cpu_caps.has_ssse3) {
       switch(type.width) {
       case 8:
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.b.128", vec_type, a);
@@ -617,7 +608,6 @@ lp_build_abs(struct lp_build_context *bld,
          return lp_build_intrinsic_unary(bld->builder, "llvm.x86.ssse3.pabs.d.128", vec_type, a);
       }
    }
-#endif
 
    return lp_build_max(bld, a, LLVMBuildNeg(bld->builder, a, ""));
 }
@@ -684,6 +674,8 @@ lp_build_round_sse41(struct lp_build_context *bld,
 
    assert(type.floating);
    assert(type.width*type.length == 128);
+   assert(lp_check_value(type, a));
+   assert(util_cpu_caps.has_sse4_1);
 
    switch(type.width) {
    case 32:
@@ -703,20 +695,45 @@ lp_build_round_sse41(struct lp_build_context *bld,
 
 
 LLVMValueRef
-lp_build_round(struct lp_build_context *bld,
+lp_build_trunc(struct lp_build_context *bld,
                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+      LLVMValueRef res;
+      res = LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
-#endif
 
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+LLVMValueRef
+lp_build_round(struct lp_build_context *bld,
+               LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_iround(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
 }
 
 
@@ -728,13 +745,15 @@ lp_build_floor(struct lp_build_context *bld,
 
    assert(type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
-#endif
-
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_ifloor(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
 }
 
 
@@ -745,59 +764,143 @@ lp_build_ceil(struct lp_build_context *bld,
    const struct lp_type type = bld->type;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1)
+      return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef res;
+      res = lp_build_iceil(bld, a);
+      res = LLVMBuildSIToFP(bld->builder, res, vec_type, "");
+      return res;
+   }
+}
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
-#endif
 
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+/**
+ * Convert to integer, through whichever rounding method is fastest,
+ * typically truncating to zero.
+ */
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
+                LLVMValueRef a)
+{
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
 }
 
 
 LLVMValueRef
-lp_build_trunc(struct lp_build_context *bld,
-               LLVMValueRef a)
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   return lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_TRUNCATE);
-#endif
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_NEAREST);
+   }
+   else {
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef half;
+
+      /* get sign bit */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+
+      /* half = 0.5 carrying the sign bit of a */
+      half = lp_build_const_scalar(type, 0.5);
+      half = LLVMBuildBitCast(bld->builder, half, int_vec_type, "");
+      half = LLVMBuildOr(bld->builder, sign, half, "");
+      half = LLVMBuildBitCast(bld->builder, half, vec_type, "");
 
-   /* FIXME */
-   assert(0);
-   return bld->undef;
+      res = LLVMBuildAdd(bld->builder, a, half, "");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
 }
 
 
-/**
- * Convert to integer, through whichever rounding method that's fastest,
- * typically truncating to zero.
- */
 LLVMValueRef
-lp_build_int(struct lp_build_context *bld,
-             LLVMValueRef a)
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a)
 {
    const struct lp_type type = bld->type;
    LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
 
    assert(type.floating);
+   assert(lp_check_value(type, a));
 
-   return LLVMBuildFPToSI(bld->builder, a, int_vec_type, "");
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_FLOOR);
+   }
+   else {
+      /* Bias negative inputs by almost -1 so the truncating FPToSI below rounds down */
+      LLVMTypeRef vec_type = lp_build_vec_type(type);
+      unsigned mantissa = lp_mantissa(type);
+      LLVMValueRef mask = lp_build_int_const_scalar(type, (unsigned long long)1 << (type.width - 1));
+      LLVMValueRef sign;
+      LLVMValueRef offset;
+
+      /* sign = a < 0 ? ~0 : 0 */
+      sign = LLVMBuildBitCast(bld->builder, a, int_vec_type, "");
+      sign = LLVMBuildAnd(bld->builder, sign, mask, "");
+      sign = LLVMBuildAShr(bld->builder, sign, lp_build_int_const_scalar(type, type.width - 1), "");
+
+      /* offset = -0.99999(9)f */
+      offset = lp_build_const_scalar(type, -(double)(((unsigned long long)1 << mantissa) - 1)/((unsigned long long)1 << mantissa));
+      offset = LLVMConstBitCast(offset, int_vec_type);
+
+      /* offset = a < 0 ? -0.99999(9)f : 0.0f */
+      offset = LLVMBuildAnd(bld->builder, offset, sign, "");
+      offset = LLVMBuildBitCast(bld->builder, offset, vec_type, "");
+
+      res = LLVMBuildAdd(bld->builder, a, offset, "");
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
 }
 
 
 LLVMValueRef
-lp_build_ifloor(struct lp_build_context *bld,
-                LLVMValueRef a)
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a)
 {
-   a = lp_build_floor(bld, a);
-   a = lp_build_int(bld, a);
-   return a;
+   const struct lp_type type = bld->type;
+   LLVMTypeRef int_vec_type = lp_build_int_vec_type(type);
+   LLVMValueRef res;
+
+   assert(type.floating);
+   assert(lp_check_value(type, a));
+
+   if(util_cpu_caps.has_sse4_1) {
+      res = lp_build_round_sse41(bld, a, LP_BUILD_ROUND_SSE41_CEIL);
+   }
+   else {
+      assert(0);
+      res = bld->undef;
+   }
+
+   res = LLVMBuildFPToSI(bld->builder, res, int_vec_type, "");
+
+   return res;
 }
 
 
@@ -837,11 +940,9 @@ lp_build_rcp(struct lp_build_context *bld,
    if(LLVMIsConstant(a))
       return LLVMConstFDiv(bld->one, a);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
+      /* FIXME: improve precision */
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rcp.ps", lp_build_vec_type(type), a);
-#endif
 
    return LLVMBuildFDiv(bld->builder, bld->one, a, "");
 }
@@ -858,11 +959,8 @@ lp_build_rsqrt(struct lp_build_context *bld,
 
    assert(type.floating);
 
-   /* XXX: is this really necessary? */
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(type.width == 32 && type.length == 4)
+   if(util_cpu_caps.has_sse && type.width == 32 && type.length == 4)
       return lp_build_intrinsic_unary(bld->builder, "llvm.x86.sse.rsqrt.ps", lp_build_vec_type(type), a);
-#endif
 
    return lp_build_rcp(bld, lp_build_sqrt(bld, a));
 }
@@ -918,7 +1016,8 @@ lp_build_pow(struct lp_build_context *bld,
 {
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x) && LLVMIsConstant(y))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    return lp_build_exp2(bld, lp_build_mul(bld, lp_build_log2(bld, x), y));
 }
@@ -972,7 +1071,8 @@ lp_build_polynomial(struct lp_build_context *bld,
 
    /* TODO: optimize the constant case */
    if(LLVMIsConstant(x))
-      debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                   __FUNCTION__);
 
    for (i = num_coeffs; i--; ) {
       LLVMValueRef coeff = lp_build_const_scalar(type, coeffs[i]);
@@ -1026,7 +1126,8 @@ lp_build_exp2_approx(struct lp_build_context *bld,
    if(p_exp2_int_part || p_frac_part || p_exp2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
@@ -1125,7 +1226,8 @@ lp_build_log2_approx(struct lp_build_context *bld,
    if(p_exp || p_floor_log2 || p_log2) {
       /* TODO: optimize the constant case */
       if(LLVMIsConstant(x))
-         debug_printf("%s: inefficient/imprecise constant arithmetic\n");
+         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
+                      __FUNCTION__);
 
       assert(type.floating && type.width == 32);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_arit.h b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
index d68a97c4b87..095a8e1cabe 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_arit.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_arit.h
@@ -126,11 +126,18 @@ lp_build_trunc(struct lp_build_context *bld,
                LLVMValueRef a);
 
 LLVMValueRef
-lp_build_int(struct lp_build_context *bld,
-             LLVMValueRef a);
+lp_build_ifloor(struct lp_build_context *bld,
+                LLVMValueRef a);
+LLVMValueRef
+lp_build_iceil(struct lp_build_context *bld,
+               LLVMValueRef a);
 
 LLVMValueRef
-lp_build_ifloor(struct lp_build_context *bld,
+lp_build_iround(struct lp_build_context *bld,
+                LLVMValueRef a);
+
+LLVMValueRef
+lp_build_itrunc(struct lp_build_context *bld,
                 LLVMValueRef a);
 
 LLVMValueRef
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_conv.c b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
index 186cac70f62..20c8710214b 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_conv.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_conv.c
@@ -63,6 +63,7 @@
 
 #include "util/u_debug.h"
 #include "util/u_math.h"
+#include "util/u_cpu_detect.h"
 
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
@@ -334,8 +335,7 @@ lp_build_pack2(LLVMBuilderRef builder,
    assert(!src_type.floating);
    assert(!dst_type.floating);
 
-#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
-   if(src_type.width * src_type.length == 128) {
+   if(util_cpu_caps.has_sse2 && src_type.width * src_type.length == 128) {
       /* All X86 non-interleaved pack instructions all take signed inputs and
        * saturate them, so saturate beforehand. */
       if(!src_type.sign && !clamped) {
@@ -349,7 +349,7 @@ lp_build_pack2(LLVMBuilderRef builder,
 
       switch(src_type.width) {
       case 32:
-         if(dst_type.sign)
+         if(dst_type.sign || !util_cpu_caps.has_sse4_1)
             res = lp_build_intrinsic_binary(builder, "llvm.x86.sse2.packssdw.128", src_vec_type, lo, hi);
          else
             /* PACKUSDW is the only instrinsic with a consistent signature */
@@ -372,7 +372,6 @@ lp_build_pack2(LLVMBuilderRef builder,
       res = LLVMBuildBitCast(builder, res, dst_vec_type, "");
       return res;
    }
-#endif
 
    lo = LLVMBuildBitCast(builder, lo, dst_vec_type, "");
    hi = LLVMBuildBitCast(builder, hi, dst_vec_type, "");
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 21c665c4d4c..98ec1cb1b9d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -210,7 +210,4 @@ lp_build_depth_test(LLVMBuilderRef builder,
       dst = lp_build_select(&bld, z_bitmask, src, dst);
       LLVMBuildStore(builder, dst, dst_ptr);
    }
-
-   /* FIXME */
-   assert(!state->occlusion_count);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_logic.c b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
index 6b6f8207697..db22a8028a6 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_logic.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_logic.c
@@ -33,6 +33,8 @@
  */
 
 
+#include "util/u_cpu_detect.h"
+
 #include "lp_bld_type.h"
 #include "lp_bld_const.h"
 #include "lp_bld_intr.h"
@@ -65,7 +67,7 @@ lp_build_cmp(struct lp_build_context *bld,
 
 #if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
    if(type.width * type.length == 128) {
-      if(type.floating) {
+      if(type.floating && util_cpu_caps.has_sse) {
          LLVMValueRef args[3];
          unsigned cc;
          boolean swap;
@@ -114,7 +116,7 @@ lp_build_cmp(struct lp_build_context *bld,
          res = LLVMBuildBitCast(bld->builder, res, int_vec_type, "");
          return res;
       }
-      else {
+      else if(util_cpu_caps.has_sse2) {
          static const struct {
             unsigned swap:1;
             unsigned eq:1;
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
index 8ca1be6f1be..1a47ca32d2d 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_sample_soa.c
@@ -274,8 +274,8 @@ lp_build_sample_2d_linear_soa(struct lp_build_sample_context *bld,
    s_fpart = lp_build_sub(&bld->coord_bld, s, s_ipart);
    t_fpart = lp_build_sub(&bld->coord_bld, t, t_ipart);
 
-   x0 = lp_build_int(&bld->coord_bld, s_ipart);
-   y0 = lp_build_int(&bld->coord_bld, t_ipart);
+   x0 = lp_build_itrunc(&bld->coord_bld, s_ipart);
+   y0 = lp_build_itrunc(&bld->coord_bld, t_ipart);
 
    x0 = lp_build_sample_wrap(bld, x0, width,  bld->static_state->pot_width,  bld->static_state->wrap_s);
    y0 = lp_build_sample_wrap(bld, y0, height, bld->static_state->pot_height, bld->static_state->wrap_t);
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index a4b2bd8c2ad..202cb8ef439 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -107,11 +107,16 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
    if (llvmpipe->draw)
       draw_destroy( llvmpipe->draw );
 
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       lp_destroy_tile_cache(llvmpipe->cbuf_cache[i]);
+      pipe_surface_reference(&llvmpipe->framebuffer.cbufs[i], NULL);
+   }
+   pipe_surface_reference(&llvmpipe->framebuffer.zsbuf, NULL);
 
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
       lp_destroy_tex_tile_cache(llvmpipe->tex_cache[i]);
+      pipe_texture_reference(&llvmpipe->texture[i], NULL);
+   }
 
    for (i = 0; i < Elements(llvmpipe->constants); i++) {
       if (llvmpipe->constants[i].buffer) {
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index b4a22ff4a97..1126bf90b96 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,6 +36,7 @@
 #include <llvm-c/Transforms/Scalar.h>
 
 #include "util/u_memory.h"
+#include "util/u_cpu_detect.h"
 #include "lp_screen.h"
 #include "lp_bld_intr.h"
 #include "lp_jit.h"
@@ -147,6 +148,19 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
 {
    char *error = NULL;
 
+   util_cpu_detect();
+
+#if 0
+   /* For simulating less capable machines */
+   util_cpu_caps.has_sse3 = 0;
+   util_cpu_caps.has_sse4_1 = 0;
+#endif
+
+#ifdef LLVM_NATIVE_ARCH
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+#endif
+
    screen->module = LLVMModuleCreateWithName("llvmpipe");
 
    screen->provider = LLVMCreateModuleProviderForExistingModule(screen->module);
@@ -163,8 +177,15 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
    LLVMAddTargetData(screen->target, screen->pass);
    /* These are the passes currently listed in llvm-c/Transforms/Scalar.h,
     * but there are more on SVN. */
+   /* TODO: Add more passes */
    LLVMAddConstantPropagationPass(screen->pass);
-   LLVMAddInstructionCombiningPass(screen->pass);
+   if(util_cpu_caps.has_sse4_1) {
+      /* FIXME: There is a bug in this pass, whereby the combination of fptosi
+       * and sitofp (necessary for trunc/floor/ceil/round implementation)
+       * somehow becomes invalid code.
+       */
+      LLVMAddInstructionCombiningPass(screen->pass);
+   }
    LLVMAddPromoteMemoryToRegisterPass(screen->pass);
    LLVMAddGVNPass(screen->pass);
    LLVMAddCFGSimplificationPass(screen->pass);
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index ff7ef8658a9..05189274589 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -82,7 +82,7 @@ llvmpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 8;  /* max 128x128x128 */
+      return 9;  /* max 256x256x256 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_TGSI_CONT_SUPPORTED:
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index 9faed5a0b18..b00be0cc32a 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -401,7 +401,6 @@ generate_fragment(struct llvmpipe_context *lp,
    if(key->depth.enabled) {
       debug_printf("depth.func = %s\n", debug_dump_func(key->depth.func, TRUE));
       debug_printf("depth.writemask = %u\n", key->depth.writemask);
-      debug_printf("depth.occlusion_count = %u\n", key->depth.occlusion_count);
    }
    if(key->alpha.enabled) {
       debug_printf("alpha.func = %s\n", debug_dump_func(key->alpha.func, TRUE));
@@ -582,6 +581,11 @@ generate_fragment(struct llvmpipe_context *lp,
     * Translate the LLVM IR into machine code.
     */
 
+   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
+      LLVMDumpValue(variant->function);
+      abort();
+   }
+
    LLVMRunFunctionPassManager(screen->pass, variant->function);
 
 #ifdef DEBUG
@@ -589,11 +593,6 @@ generate_fragment(struct llvmpipe_context *lp,
    debug_printf("\n");
 #endif
 
-   if(LLVMVerifyFunction(variant->function, LLVMPrintMessageAction)) {
-      LLVMDumpValue(variant->function);
-      abort();
-   }
-
    variant->jit_function = (lp_jit_frag_func)LLVMGetPointerToGlobal(screen->engine, variant->function);
 
 #ifdef DEBUG
diff --git a/src/gallium/drivers/llvmpipe/lp_state_surface.c b/src/gallium/drivers/llvmpipe/lp_state_surface.c
index 177a26b7b1f..2c29144c039 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_surface.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_surface.c
@@ -56,7 +56,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
          lp_flush_tile_cache(lp->cbuf_cache[i]);
 
          /* assign new */
-         lp->framebuffer.cbufs[i] = fb->cbufs[i];
+         pipe_surface_reference(&lp->framebuffer.cbufs[i], fb->cbufs[i]);
 
          /* update cache */
          lp_tile_cache_set_surface(lp->cbuf_cache[i], fb->cbufs[i]);
@@ -81,7 +81,7 @@ llvmpipe_set_framebuffer_state(struct pipe_context *pipe,
       }
 
       /* assign new */
-      lp->framebuffer.zsbuf = fb->zsbuf;
+      pipe_surface_reference(&lp->framebuffer.zsbuf, fb->zsbuf);
 
       /* Tell draw module how deep the Z/depth buffer is */
       if (lp->framebuffer.zsbuf) {
diff --git a/src/gallium/drivers/llvmpipe/lp_test_format.c b/src/gallium/drivers/llvmpipe/lp_test_format.c
index d8455e56490..7d83f899e6a 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_format.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_format.c
@@ -264,6 +264,11 @@ int main(int argc, char **argv)
    unsigned i;
    int ret;
 
+#ifdef LLVM_NATIVE_ARCH
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+#endif
+
    for (i = 0; i < sizeof(test_cases)/sizeof(test_cases[0]); ++i)
       if(!test_format(&test_cases[i]))
         ret = 1;
diff --git a/src/gallium/drivers/llvmpipe/lp_test_main.c b/src/gallium/drivers/llvmpipe/lp_test_main.c
index 4592dc0b2d0..f07fa256f16 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_main.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_main.c
@@ -365,6 +365,11 @@ int main(int argc, char **argv)
          n = atoi(argv[i]);
    }
 
+#ifdef LLVM_NATIVE_ARCH
+   LLVMLinkInJIT();
+   LLVMInitializeNativeTarget();
+#endif
+
    if(fp) {
       /* Warm up the caches */
       test_some(0, NULL, 100);
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 724d4378336..08f0950d475 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -353,17 +353,9 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
 
    if(lpt->dt) {
       struct llvmpipe_winsys *winsys = screen->winsys;
-      unsigned flags = 0;
 
-      if (transfer->usage != PIPE_TRANSFER_READ) {
-         flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-      }
-
-      if (transfer->usage != PIPE_TRANSFER_WRITE) {
-         flags |= PIPE_BUFFER_USAGE_CPU_READ;
-      }
-
-      map = winsys->displaytarget_map(winsys, lpt->dt, flags);
+      map = winsys->displaytarget_map(winsys, lpt->dt,
+                                      pipe_transfer_buffer_flags(transfer));
       if (map == NULL)
          return NULL;
    }
@@ -373,7 +365,7 @@ llvmpipe_transfer_map( struct pipe_screen *_screen,
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) 
+   if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE))
    {
       /* Do something to notify sharing contexts of a texture change.
        * In llvmpipe, that would mean flushing the texture cache.
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
index 2e576e6039d..0c06b659a1f 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
@@ -44,10 +44,53 @@
 #include "lp_tile_cache.h"
 
 
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+enum llvmpipe_tile_status
+{
+   LP_TILE_STATUS_UNDEFINED = 0,
+   LP_TILE_STATUS_CLEAR = 1,
+   LP_TILE_STATUS_DEFINED = 2
+};
+
+
+struct llvmpipe_cached_tile
+{
+   enum llvmpipe_tile_status status;
+
+   /** color in SOA format */
+   uint8_t *color;
+};
+
+
+struct llvmpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
+
+   uint8_t clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+
+   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
 struct llvmpipe_tile_cache *
 lp_create_tile_cache( struct pipe_screen *screen )
 {
    struct llvmpipe_tile_cache *tc;
+   int maxLevels, maxTexSize;
+
+   /* sanity checking: make sure MAX_WIDTH/HEIGHT >= largest texture image */
+   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   maxTexSize = 1 << (maxLevels - 1);
+   assert(MAX_WIDTH >= maxTexSize);
 
    tc = CALLOC_STRUCT( llvmpipe_tile_cache );
    if(!tc)
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
index 6d8ba5ece7a..161bab37991 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.h
+++ b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
@@ -33,42 +33,7 @@
 #include "lp_tile_soa.h"
 
 
-enum llvmpipe_tile_status
-{
-   LP_TILE_STATUS_UNDEFINED = 0,
-   LP_TILE_STATUS_CLEAR = 1,
-   LP_TILE_STATUS_DEFINED = 2
-};
-
-
-struct llvmpipe_cached_tile
-{
-   enum llvmpipe_tile_status status;
-
-   /** color in SOA format */
-   uint8_t *color;
-};
-
-
-/** XXX move these */
-#define MAX_WIDTH 2048
-#define MAX_HEIGHT 2048
-
-
-struct llvmpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
-
-   uint8_t clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-
-   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
-};
+struct llvmpipe_tile_cache;  /* opaque */
 
 
 extern struct llvmpipe_tile_cache *
diff --git a/src/gallium/drivers/nv04/nv04_transfer.c b/src/gallium/drivers/nv04/nv04_transfer.c
index 854b855d64a..6618660743d 100644
--- a/src/gallium/drivers/nv04/nv04_transfer.c
+++ b/src/gallium/drivers/nv04/nv04_transfer.c
@@ -13,22 +13,6 @@ struct nv04_transfer {
 	bool direct;
 };
 
-static unsigned nv04_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv04_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               0, 0, 0,
-	                                               nv04_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
-	                                       nv04_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv04_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv04_screen *nvscreen = nv04_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv04_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv04_transfer *tx = (struct nv04_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv04_screen *nvscreen = nv04_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv04_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv04_miptree *mt = (struct nv04_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv04_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv10/nv10_transfer.c b/src/gallium/drivers/nv10/nv10_transfer.c
index c06b8d34c72..8feb85e4bda 100644
--- a/src/gallium/drivers/nv10/nv10_transfer.c
+++ b/src/gallium/drivers/nv10/nv10_transfer.c
@@ -13,22 +13,6 @@ struct nv10_transfer {
 	bool direct;
 };
 
-static unsigned nv10_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv10_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               0, 0, 0,
-	                                               nv10_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
-	                                       nv10_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv10_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv10_screen *nvscreen = nv10_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv10_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv10_transfer *tx = (struct nv10_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv10_screen *nvscreen = nv10_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv10_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv10_miptree *mt = (struct nv10_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv10_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv20/nv20_transfer.c b/src/gallium/drivers/nv20/nv20_transfer.c
index 5018995596c..81b4f1a9177 100644
--- a/src/gallium/drivers/nv20/nv20_transfer.c
+++ b/src/gallium/drivers/nv20/nv20_transfer.c
@@ -13,22 +13,6 @@ struct nv20_transfer {
 	bool direct;
 };
 
-static unsigned nv20_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv20_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               0, 0, 0,
-	                                               nv20_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       face, level, zslice,
-	                                       nv20_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv20_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv20_screen *nvscreen = nv20_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv20_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv20_transfer *tx = (struct nv20_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv20_screen *nvscreen = nv20_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv20_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv20_miptree *mt = (struct nv20_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv20_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv30/nv30_transfer.c b/src/gallium/drivers/nv30/nv30_transfer.c
index 23675718781..98011decf7c 100644
--- a/src/gallium/drivers/nv30/nv30_transfer.c
+++ b/src/gallium/drivers/nv30/nv30_transfer.c
@@ -13,22 +13,6 @@ struct nv30_transfer {
 	bool direct;
 };
 
-static unsigned nv30_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv30_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               face, level, zslice,
-	                                               nv30_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       0, 0, 0,
-	                                       nv30_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv30_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv30_screen *nvscreen = nv30_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv30_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv30_transfer *tx = (struct nv30_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv30_screen *nvscreen = nv30_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv30_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv30_miptree *mt = (struct nv30_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv30_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv40/nv40_transfer.c b/src/gallium/drivers/nv40/nv40_transfer.c
index 6d92ac3db9c..92caee6f382 100644
--- a/src/gallium/drivers/nv40/nv40_transfer.c
+++ b/src/gallium/drivers/nv40/nv40_transfer.c
@@ -13,22 +13,6 @@ struct nv40_transfer {
 	bool direct;
 };
 
-static unsigned nv40_usage_tx_to_buf(unsigned tx_usage)
-{
-	switch (tx_usage) {
-		case PIPE_TRANSFER_READ:
-			return PIPE_BUFFER_USAGE_CPU_READ;
-		case PIPE_TRANSFER_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_WRITE;
-		case PIPE_TRANSFER_READ_WRITE:
-			return PIPE_BUFFER_USAGE_CPU_READ_WRITE;
-		default:
-			assert(0);
-	}
-
-	return -1;
-}
-
 static void
 nv40_compatible_transfer_tex(struct pipe_texture *pt, unsigned level,
                              struct pipe_texture *template)
@@ -86,7 +70,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		tx->direct = true;
 		tx->surface = pscreen->get_tex_surface(pscreen, pt,
 	                                               face, level, zslice,
-	                                               nv40_usage_tx_to_buf(usage));
+	                                               pipe_transfer_buffer_flags(&tx->base));
 		return &tx->base;
 	}
 
@@ -103,7 +87,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 
 	tx->surface = pscreen->get_tex_surface(pscreen, tx_tex,
 	                                       0, 0, 0,
-	                                       nv40_usage_tx_to_buf(usage));
+	                                       pipe_transfer_buffer_flags(&tx->base));
 
 	pipe_texture_reference(&tx_tex, NULL);
 
@@ -114,7 +98,7 @@ nv40_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		struct nv40_screen *nvscreen = nv40_screen(pscreen);
 		struct pipe_surface *src;
 
@@ -140,7 +124,7 @@ nv40_transfer_del(struct pipe_transfer *ptx)
 {
 	struct nv40_transfer *tx = (struct nv40_transfer *)ptx;
 
-	if (!tx->direct && ptx->usage != PIPE_TRANSFER_READ) {
+	if (!tx->direct && (ptx->usage & PIPE_TRANSFER_WRITE)) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		struct nv40_screen *nvscreen = nv40_screen(pscreen);
 		struct pipe_surface *dst;
@@ -170,7 +154,7 @@ nv40_transfer_map(struct pipe_screen *pscreen, struct pipe_transfer *ptx)
 	struct nv04_surface *ns = (struct nv04_surface *)tx->surface;
 	struct nv40_miptree *mt = (struct nv40_miptree *)tx->surface->texture;
 	void *map = pipe_buffer_map(pscreen, mt->buffer,
-	                            nv40_usage_tx_to_buf(ptx->usage));
+	                            pipe_transfer_buffer_flags(ptx));
 
 	return map + ns->base.offset +
 	       ptx->y * ns->pitch + ptx->x * ptx->block.size;
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index eb90d5e66f9..576d075318f 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -90,6 +90,10 @@ struct nv50_reg {
 	int acc; /* instruction where this reg is last read (first insn == 1) */
 };
 
+/* arbitrary limits */
+#define MAX_IF_DEPTH 4
+#define MAX_LOOP_DEPTH 4
+
 struct nv50_pc {
 	struct nv50_program *p;
 
@@ -121,6 +125,13 @@ struct nv50_pc {
 	struct nv50_reg *iv_p;
 	struct nv50_reg *iv_c;
 
+	struct nv50_program_exec *if_cond;
+	struct nv50_program_exec *if_insn[MAX_IF_DEPTH];
+	struct nv50_program_exec *br_join[MAX_IF_DEPTH];
+	struct nv50_program_exec *br_loop[MAX_LOOP_DEPTH]; /* for BRK branch */
+	int if_lvl, loop_lvl;
+	unsigned loop_pos[MAX_LOOP_DEPTH];
+
 	/* current instruction and total number of insns */
 	unsigned insn_cur;
 	unsigned insn_nr;
@@ -196,6 +207,10 @@ alloc_reg(struct nv50_pc *pc, struct nv50_reg *reg)
 	assert(0);
 }
 
+/* XXX: For shaders that aren't executed linearly (e.g. shaders that
+ * contain loops), we need to assign all hw regs to TGSI TEMPs early,
+ * lest we risk temp_temps overwriting regs alloc'd "later".
+ */
 static struct nv50_reg *
 alloc_temp(struct nv50_pc *pc, struct nv50_reg *dst)
 {
@@ -559,6 +574,22 @@ check_swap_src_0_1(struct nv50_pc *pc,
 }
 
 static void
+set_src_0_restricted(struct nv50_pc *pc, struct nv50_reg *src,
+		     struct nv50_program_exec *e)
+{
+	struct nv50_reg *temp;
+
+	if (src->type != P_TEMP) {
+		temp = temp_temp(pc);
+		emit_mov(pc, temp, src);
+		src = temp;
+	}
+
+	alloc_reg(pc, src);
+	e->inst[0] |= (src->hw << 9);
+}
+
+static void
 set_src_0(struct nv50_pc *pc, struct nv50_reg *src, struct nv50_program_exec *e)
 {
 	if (src->type == P_ATTR) {
@@ -760,7 +791,11 @@ emit_flop(struct nv50_pc *pc, unsigned sub,
 	}
 
 	set_dst(pc, dst, e);
-	set_src_0(pc, src, e);
+
+	if (sub == 0 || sub == 2)
+		set_src_0_restricted(pc, src, e);
+	else
+		set_src_0(pc, src, e);
 
 	emit(pc, e);
 }
@@ -810,7 +845,8 @@ emit_precossin(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 #define CVT_F32_U32 0x64
 #define CVT_S32_F32 0x8c
 #define CVT_S32_S32 0x0c
-#define CVT_F32_F32_ROP 0xcc
+#define CVT_NEG     0x20
+#define CVT_RI      0x08
 
 static void
 emit_cvt(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src,
@@ -890,6 +926,7 @@ emit_set(struct nv50_pc *pc, unsigned ccode, struct nv50_reg *dst, int wp,
 	set_src_1(pc, src1, e);
 
 	emit(pc, e);
+	pc->if_cond = pc->p->exec_tail; /* record for OPCODE_IF */
 
 	/* cvt.f32.u32/s32 (?) if we didn't only write the predicate */
 	if (rdst)
@@ -917,7 +954,7 @@ map_tgsi_setop_cc(unsigned op)
 static INLINE void
 emit_flr(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 {
-	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32_ROP);
+	emit_cvt(pc, dst, src, -1, CVTOP_FLOOR, CVT_F32_F32 | CVT_RI);
 }
 
 static void
@@ -1129,10 +1166,11 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 	emit(pc, e);
 
 #if 1
-	if (mask & 1) emit_mov(pc, dst[0], t[0]);
-	if (mask & 2) emit_mov(pc, dst[1], t[1]);
-	if (mask & 4) emit_mov(pc, dst[2], t[2]);
-	if (mask & 8) emit_mov(pc, dst[3], t[3]);
+	c = 0;
+	if (mask & 1) emit_mov(pc, dst[0], t[c++]);
+	if (mask & 2) emit_mov(pc, dst[1], t[c++]);
+	if (mask & 4) emit_mov(pc, dst[2], t[c++]);
+	if (mask & 8) emit_mov(pc, dst[3], t[c]);
 
 	free_temp4(pc, t);
 #else
@@ -1149,6 +1187,38 @@ emit_tex(struct nv50_pc *pc, struct nv50_reg **dst, unsigned mask,
 }
 
 static void
+emit_branch(struct nv50_pc *pc, int pred, unsigned cc,
+	    struct nv50_program_exec **join)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	if (join) {
+		set_long(pc, e);
+		e->inst[0] |= 0xa0000002;
+		emit(pc, e);
+		*join = e;
+		e = exec(pc);
+	}
+
+	set_long(pc, e);
+	e->inst[0] |= 0x10000002;
+	if (pred >= 0)
+		set_pred(pc, cc, pred, e);
+	emit(pc, e);
+}
+
+static void
+emit_nop(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e = exec(pc);
+
+	e->inst[0] = 0xf0000000;
+	set_long(pc, e);
+	e->inst[1] = 0xe0000000;
+	emit(pc, e);
+}
+
+static void
 convert_to_long(struct nv50_pc *pc, struct nv50_program_exec *e)
 {
 	unsigned q = 0, m = ~0;
@@ -1444,6 +1514,55 @@ nv50_tgsi_dst_revdep(unsigned op, int s, int c)
 	}
 }
 
+static INLINE boolean
+has_pred(struct nv50_program_exec *e, unsigned cc)
+{
+	if (!is_long(e) || is_immd(e))
+		return FALSE;
+	return ((e->inst[1] & 0x780) == (cc << 7));
+}
+
+/* on ENDIF see if we can do "@p0.neu single_op" instead of:
+ *        join_at ENDIF
+ *        @p0.eq bra ENDIF
+ *        single_op
+ * ENDIF: nop.join
+ */
+static boolean
+nv50_kill_branch(struct nv50_pc *pc)
+{
+	int lvl = pc->if_lvl;
+
+	if (pc->if_insn[lvl]->next != pc->p->exec_tail)
+		return FALSE;
+
+	/* if ccode == 'true', the BRA is from an ELSE and the predicate
+	 * reg may no longer be valid, since we currently always use $p0
+	 */
+	if (has_pred(pc->if_insn[lvl], 0xf))
+		return FALSE;
+	assert(pc->if_insn[lvl] && pc->br_join[lvl]);
+
+	/* We'll use the exec allocated for JOIN_AT (as we can't easily
+	 * update prev's next); if exec_tail is BRK, update the pointer.
+	 */
+	if (pc->loop_lvl && pc->br_loop[pc->loop_lvl - 1] == pc->p->exec_tail)
+		pc->br_loop[pc->loop_lvl - 1] = pc->br_join[lvl];
+
+	pc->p->exec_size -= 4; /* remove JOIN_AT and BRA */
+
+	*pc->br_join[lvl] = *pc->p->exec_tail;
+
+	FREE(pc->if_insn[lvl]);
+	FREE(pc->p->exec_tail);
+
+	pc->p->exec_tail = pc->br_join[lvl];
+	pc->p->exec_tail->next = NULL;
+	set_pred(pc, 0xd, 0, pc->p->exec_tail);
+
+	return TRUE;
+}
+
 static boolean
 nv50_program_tx_insn(struct nv50_pc *pc,
 		     const struct tgsi_full_instruction *inst)
@@ -1513,12 +1632,20 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_add(pc, dst[c], src[0][c], src[1][c]);
 		}
 		break;
+	case TGSI_OPCODE_BGNLOOP:
+		pc->loop_pos[pc->loop_lvl++] = pc->p->exec_size;
+		break;
+	case TGSI_OPCODE_BRK:
+		emit_branch(pc, -1, 0, NULL);
+		assert(pc->loop_lvl > 0);
+		pc->br_loop[pc->loop_lvl - 1] = pc->p->exec_tail;
+		break;
 	case TGSI_OPCODE_CEIL:
 		for (c = 0; c < 4; c++) {
 			if (!(mask & (1 << c)))
 				continue;
 			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVTOP_CEIL, CVT_F32_F32);
+				 CVTOP_CEIL, CVT_F32_F32 | CVT_RI);
 		}
 		break;
 	case TGSI_OPCODE_COS:
@@ -1560,6 +1687,33 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 		if (mask & (1 << 0))
 			emit_mov_immdval(pc, dst[0], 1.0f);
 		break;
+	case TGSI_OPCODE_ELSE:
+		emit_branch(pc, -1, 0, NULL);
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		break;
+	case TGSI_OPCODE_ENDIF:
+		pc->if_insn[--pc->if_lvl]->param.index = pc->p->exec_size;
+
+		/* try to replace branch over 1 insn with a predicated insn */
+		if (nv50_kill_branch(pc) == TRUE)
+			break;
+
+		if (pc->br_join[pc->if_lvl]) {
+			pc->br_join[pc->if_lvl]->param.index = pc->p->exec_size;
+			pc->br_join[pc->if_lvl] = NULL;
+		}
+		/* emit a NOP as join point, we could set it on the next
+		 * one, but would have to make sure it is long and !immd
+		 */
+		emit_nop(pc);
+		pc->p->exec_tail->inst[1] |= 2;
+		break;
+	case TGSI_OPCODE_ENDLOOP:
+		emit_branch(pc, -1, 0, NULL);
+		pc->p->exec_tail->param.index = pc->loop_pos[--pc->loop_lvl];
+		pc->br_loop[pc->loop_lvl]->param.index = pc->p->exec_size;
+		break;
 	case TGSI_OPCODE_EX2:
 		emit_preex2(pc, temp, src[0][0]);
 		emit_flop(pc, 6, brdc, temp);
@@ -1580,6 +1734,13 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			emit_sub(pc, dst[c], src[0][c], temp);
 		}
 		break;
+	case TGSI_OPCODE_IF:
+		/* emitting a join_at may not be necessary */
+		assert(pc->if_lvl < MAX_IF_DEPTH);
+		set_pred_wr(pc, 1, 0, pc->if_cond);
+		emit_branch(pc, 0, 2, &pc->br_join[pc->if_lvl]);
+		pc->if_insn[pc->if_lvl++] = pc->p->exec_tail;
+		break;
 	case TGSI_OPCODE_KIL:
 		emit_kil(pc, src[0][0]);
 		emit_kil(pc, src[0][1]);
@@ -1704,7 +1865,7 @@ nv50_program_tx_insn(struct nv50_pc *pc,
 			if (!(mask & (1 << c)))
 				continue;
 			emit_cvt(pc, dst[c], src[0][c], -1,
-				 CVTOP_TRUNC, CVT_F32_F32);
+				 CVTOP_TRUNC, CVT_F32_F32 | CVT_RI);
 		}
 		break;
 	case TGSI_OPCODE_XPD:
@@ -2237,6 +2398,8 @@ nv50_program_tx_prep(struct nv50_pc *pc)
 			pc->result[i].rhw = rid++;
 		if (p->info.writes_z)
 			pc->result[2].rhw = rid;
+
+		p->cfg.high_result = rid;
 	}
 
 	if (pc->immd_nr) {
@@ -2362,12 +2525,75 @@ ctor_nv50_pc(struct nv50_pc *pc, struct nv50_program *p)
 	return TRUE;
 }
 
+static void
+nv50_fp_move_results(struct nv50_pc *pc)
+{
+	struct nv50_reg reg;
+	unsigned i;
+
+	ctor_reg(&reg, P_TEMP, -1, -1);
+
+	for (i = 0; i < pc->result_nr * 4; ++i) {
+		if (pc->result[i].rhw < 0 || pc->result[i].hw < 0)
+			continue;
+		if (pc->result[i].rhw != pc->result[i].hw) {
+			reg.hw = pc->result[i].rhw;
+			emit_mov(pc, &reg, &pc->result[i]);
+		}
+	}
+}
+
+static void
+nv50_program_fixup_insns(struct nv50_pc *pc)
+{
+	struct nv50_program_exec *e, *prev = NULL, **bra_list;
+	unsigned i, n, pos;
+
+	bra_list = CALLOC(pc->p->exec_size, sizeof(struct nv50_program_exec *));
+
+	/* Collect branch instructions; their target offsets must be adjusted
+	 * whenever a 32 bit instruction is converted to a 64 bit one.
+	 */
+	for (n = 0, e = pc->p->exec_head; e; e = e->next)
+		if (e->param.index >= 0 && !e->param.mask)
+			bra_list[n++] = e;
+
+	/* Make sure we don't have any single 32 bit instructions. */
+	for (e = pc->p->exec_head, pos = 0; e; e = e->next) {
+		pos += is_long(e) ? 2 : 1;
+
+		if ((pos & 1) && (!e->next || is_long(e->next))) {
+			for (i = 0; i < n; ++i)
+				if (bra_list[i]->param.index >= pos)
+					bra_list[i]->param.index += 1;
+			convert_to_long(pc, e);
+			++pos;
+		}
+		if (e->next)
+			prev = e;
+	}
+
+	assert(!is_immd(pc->p->exec_head));
+	assert(!is_immd(pc->p->exec_tail));
+
+	/* last instruction must be long so it can have the end bit set */
+	if (!is_long(pc->p->exec_tail)) {
+		convert_to_long(pc, pc->p->exec_tail);
+		if (prev)
+			convert_to_long(pc, prev);
+	}
+	assert(!(pc->p->exec_tail->inst[1] & 2));
+	/* set the end-bit */
+	pc->p->exec_tail->inst[1] |= 1;
+
+	FREE(bra_list);
+}
+
 static boolean
 nv50_program_tx(struct nv50_program *p)
 {
 	struct tgsi_parse_context parse;
 	struct nv50_pc *pc;
-	unsigned k;
 	boolean ret;
 
 	pc = CALLOC_STRUCT(nv50_pc);
@@ -2405,48 +2631,10 @@ nv50_program_tx(struct nv50_program *p)
 		}
 	}
 
-	if (p->type == PIPE_SHADER_FRAGMENT) {
-		struct nv50_reg out;
-		ctor_reg(&out, P_TEMP, -1, -1);
-
-		for (k = 0; k < pc->result_nr * 4; k++) {
-			if (pc->result[k].rhw == -1)
-				continue;
-			if (pc->result[k].hw != pc->result[k].rhw) {
-				out.hw = pc->result[k].rhw;
-				emit_mov(pc, &out, &pc->result[k]);
-			}
-			if (pc->p->cfg.high_result < (pc->result[k].rhw + 1))
-				pc->p->cfg.high_result = pc->result[k].rhw + 1;
-		}
-	}
-
-	/* look for single half instructions and make them long */
-	struct nv50_program_exec *e, *e_prev;
+	if (pc->p->type == PIPE_SHADER_FRAGMENT)
+		nv50_fp_move_results(pc);
 
-	for (k = 0, e = pc->p->exec_head, e_prev = NULL; e; e = e->next) {
-		if (!is_long(e))
-			k++;
-
-		if (!e->next || is_long(e->next)) {
-			if (k & 1)
-				convert_to_long(pc, e);
-			k = 0;
-		}
-
-		if (e->next)
-			e_prev = e;
-	}
-
-	if (!is_long(pc->p->exec_tail)) {
-		/* this may occur if moving FP results */
-		assert(e_prev && !is_long(e_prev));
-		convert_to_long(pc, e_prev);
-		convert_to_long(pc, pc->p->exec_tail);
-	}
-
-	assert(is_long(pc->p->exec_tail) && !is_immd(pc->p->exec_head));
-	pc->p->exec_tail->inst[1] |= 0x00000001;
+	nv50_program_fixup_insns(pc);
 
 	p->param_nr = pc->param_nr * 4;
 	p->immd_nr = pc->immd_nr * 4;
@@ -2558,6 +2746,17 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 
 		if (e->param.index < 0)
 			continue;
+
+		if (e->param.mask == 0) {
+			assert(!(e->param.index & 1));
+			/* branch targets seem to be in 8 byte steps */
+			ei = (e->param.index >> 1) + 0 /* START_ID */;
+
+			e->inst[0] &= 0xf0000fff;
+			e->inst[0] |= ei << 12;
+			continue;
+		}
+
 		bs = (e->inst[1] >> 22) & 0x07;
 		assert(bs < 2);
 		ei = e->param.shift >> 5;
diff --git a/src/gallium/drivers/nv50/nv50_state_validate.c b/src/gallium/drivers/nv50/nv50_state_validate.c
index 5a3559ed181..4ed76973c4b 100644
--- a/src/gallium/drivers/nv50/nv50_state_validate.c
+++ b/src/gallium/drivers/nv50/nv50_state_validate.c
@@ -312,7 +312,7 @@ scissor_uptodate:
 			goto viewport_uptodate;
 		nv50->state.viewport_bypass = bypass;
 
-		so = so_new(12, 0);
+		so = so_new(14, 0);
 		if (!bypass) {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSLATE(0), 3);
 			so_data  (so, fui(nv50->viewport.translate[0]));
@@ -325,12 +325,21 @@ scissor_uptodate:
 
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 1);
+			/* 0x0000 = remove whole primitive only (xyz)
+			 * 0x1018 = remove whole primitive only (xy), clamp z
+			 * 0x1080 = clip primitive (xyz)
+			 * 0x1098 = clip primitive (xy), clamp z
+			 */
+			so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1);
+			so_data  (so, 0x1080);
 			/* no idea what 0f90 does */
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 0);
 		} else {
 			so_method(so, tesla, NV50TCL_VIEWPORT_TRANSFORM_EN, 1);
 			so_data  (so, 0);
+			so_method(so, tesla, NV50TCL_VIEW_VOLUME_CLIP_CTRL, 1);
+			so_data  (so, 0x0000);
 			so_method(so, tesla, 0x0f90, 1);
 			so_data  (so, 1);
 		}
diff --git a/src/gallium/drivers/nv50/nv50_transfer.c b/src/gallium/drivers/nv50/nv50_transfer.c
index bb7731855cd..9c289026bbb 100644
--- a/src/gallium/drivers/nv50/nv50_transfer.c
+++ b/src/gallium/drivers/nv50/nv50_transfer.c
@@ -161,7 +161,7 @@ nv50_transfer_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 		return NULL;
 	}
 
-	if (usage != PIPE_TRANSFER_WRITE) {
+	if (usage & PIPE_TRANSFER_READ) {
 		nv50_transfer_rect_m2mf(pscreen, mt->base.bo, tx->level_offset,
 					tx->level_pitch, tx->level_tiling,
 					x, y,
@@ -183,7 +183,7 @@ nv50_transfer_del(struct pipe_transfer *ptx)
 	struct nv50_transfer *tx = (struct nv50_transfer *)ptx;
 	struct nv50_miptree *mt = nv50_miptree(ptx->texture);
 
-	if (ptx->usage != PIPE_TRANSFER_READ) {
+	if (ptx->usage & PIPE_TRANSFER_WRITE) {
 		struct pipe_screen *pscreen = ptx->texture->screen;
 		nv50_transfer_rect_m2mf(pscreen, tx->bo, 0, tx->base.stride,
 					tx->bo->tile_mode, 0, 0,
diff --git a/src/gallium/drivers/r300/r300_cs.h b/src/gallium/drivers/r300/r300_cs.h
index 0a7e4703636..883f0a02dc5 100644
--- a/src/gallium/drivers/r300/r300_cs.h
+++ b/src/gallium/drivers/r300/r300_cs.h
@@ -68,11 +68,17 @@
 } while (0)
 
 #define OUT_CS(value) do { \
+    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
+        DBG(cs_context_copy, DBG_CS, "r300: writing %08x\n", value); \
+    } \
     cs_winsys->write_cs_dword(cs_winsys, (value)); \
     cs_count--; \
 } while (0)
 
 #define OUT_CS_32F(value) do { \
+    if (VERY_VERBOSE_CS || VERY_VERBOSE_REGISTERS) { \
+        DBG(cs_context_copy, DBG_CS, "r300: writing %f\n", value); \
+    } \
     cs_winsys->write_cs_dword(cs_winsys, fui(value)); \
     cs_count--; \
 } while (0)
@@ -82,8 +88,9 @@
         DBG(cs_context_copy, DBG_CS, "r300: writing 0x%08X to register 0x%04X\n", \
             value, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, 0)); \
-    OUT_CS(value); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0(register, 0)); \
+    cs_winsys->write_cs_dword(cs_winsys, value); \
+    cs_count -= 2; \
 } while (0)
 
 /* Note: This expects count to be the number of registers,
@@ -93,7 +100,8 @@
         DBG(cs_context_copy, DBG_CS, "r300: writing register sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, ((count) - 1))); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1))); \
+    cs_count--; \
 } while (0)
 
 #define OUT_CS_RELOC(bo, offset, rd, wd, flags) do { \
@@ -101,9 +109,9 @@
             "domains (%d, %d, %d)\n", \
         bo, offset, rd, wd, flags); \
     assert(bo); \
-    OUT_CS(offset); \
+    cs_winsys->write_cs_dword(cs_winsys, offset); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
-    cs_count -= 2; \
+    cs_count -= 3; \
 } while (0)
 
 #define END_CS do { \
@@ -131,24 +139,26 @@
         DBG(cs_context_copy, DBG_CS, "r300: writing data sequence of %d to 0x%04X\n", \
             count, register); \
     assert(register); \
-    OUT_CS(CP_PACKET0(register, ((count) - 1)) | RADEON_ONE_REG_WR); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET0((register), ((count) - 1)) | RADEON_ONE_REG_WR); \
+    cs_count--; \
 } while (0)
 
 #define CP_PACKET3(op, count) \
     (RADEON_CP_PACKET3 | (op) | ((count) << 16))
 
 #define OUT_CS_PKT3(op, count) do { \
-    OUT_CS(CP_PACKET3(op, count)); \
+    cs_winsys->write_cs_dword(cs_winsys, CP_PACKET3(op, count)); \
+    cs_count--; \
 } while (0)
 
 #define OUT_CS_INDEX_RELOC(bo, offset, count, rd, wd, flags) do { \
     DBG(cs_context_copy, DBG_CS, "r300: writing relocation for index buffer %p," \
             "offset %d\n", bo, offset); \
     assert(bo); \
-    OUT_CS(offset); \
-    OUT_CS(count); \
+    cs_winsys->write_cs_dword(cs_winsys, offset); \
+    cs_winsys->write_cs_dword(cs_winsys, count); \
     cs_winsys->write_cs_reloc(cs_winsys, bo, rd, wd, flags); \
-    cs_count -= 2; \
+    cs_count -= 4; \
 } while (0)
 
 #endif /* R300_CS_H */
diff --git a/src/gallium/drivers/r300/r300_emit.c b/src/gallium/drivers/r300/r300_emit.c
index a1b36ba2ed1..77ce431cdc3 100644
--- a/src/gallium/drivers/r300/r300_emit.c
+++ b/src/gallium/drivers/r300/r300_emit.c
@@ -381,6 +381,7 @@ void r300_emit_query_end(struct r300_context* r300,
             OUT_CS_REG_SEQ(R300_ZB_ZPASS_ADDR, 1);
             OUT_CS_RELOC(r300->oqbo, query->offset + (sizeof(uint32_t) * 0),
                     0, RADEON_GEM_DOMAIN_GTT, 0);
+	    break;
         default:
             debug_printf("r300: Implementation error: Chipset reports %d"
                     " pixel pipes!\n", caps->num_frag_pipes);
diff --git a/src/gallium/drivers/r300/r300_fs.c b/src/gallium/drivers/r300/r300_fs.c
index a0e848a59ac..546ad545a53 100644
--- a/src/gallium/drivers/r300/r300_fs.c
+++ b/src/gallium/drivers/r300/r300_fs.c
@@ -126,9 +126,14 @@ void r300_translate_fragment_shader(struct r300_context* r300,
     /* Invoke the compiler */
     r3xx_compile_fragment_program(&compiler);
     if (compiler.Base.Error) {
-        /* Todo: Fail gracefully */
-        fprintf(stderr, "r300 FP: Compiler error\n");
-        abort();
+        /* TODO: Fall back to software rendering gracefully? */
+        fprintf(stderr, "r300 FP: Compiler error: %s\n", compiler.Base.ErrorMsg);
+
+        if (compiler.is_r500) {
+            memcpy(compiler.code, &r5xx_passthrough_fragment_shader, sizeof(r5xx_passthrough_fragment_shader));
+        } else {
+            memcpy(compiler.code, &r3xx_passthrough_fragment_shader, sizeof(r3xx_passthrough_fragment_shader));
+        }
     }
 
     /* And, finally... */
diff --git a/src/gallium/drivers/r300/r300_fs.h b/src/gallium/drivers/r300/r300_fs.h
index 9fab7894024..967e9f697e9 100644
--- a/src/gallium/drivers/r300/r300_fs.h
+++ b/src/gallium/drivers/r300/r300_fs.h
@@ -48,4 +48,4 @@ struct r300_fragment_shader {
 void r300_translate_fragment_shader(struct r300_context* r300,
                                     struct r300_fragment_shader* fs);
 
-    #endif /* R300_FS_H */
+#endif /* R300_FS_H */
diff --git a/src/gallium/drivers/r300/r300_reg.h b/src/gallium/drivers/r300/r300_reg.h
index 03cd219cde9..3abff5db622 100644
--- a/src/gallium/drivers/r300/r300_reg.h
+++ b/src/gallium/drivers/r300/r300_reg.h
@@ -1478,6 +1478,7 @@ USE OR OTHER DEALINGS IN THE SOFTWARE.
 #       define R300_TX_PITCH_EN                  (1 << 31)
 #       define R300_TX_WIDTH(x)                  ((x) << 0)
 #       define R300_TX_HEIGHT(x)                 ((x) << 11)
+#       define R300_TX_DEPTH(x)                  ((x) << 22)
 #       define R300_TX_NUM_LEVELS(x)             ((x) << 26)
 
 #define R300_TX_FORMAT1_0                   0x44C0
diff --git a/src/gallium/drivers/r300/r300_render.c b/src/gallium/drivers/r300/r300_render.c
index 737396d8d97..ca44e0f6615 100644
--- a/src/gallium/drivers/r300/r300_render.c
+++ b/src/gallium/drivers/r300/r300_render.c
@@ -79,12 +79,14 @@ static boolean r300_render_allocate_vertices(struct vbuf_render* render,
     struct pipe_screen* screen = r300->context.screen;
     size_t size = (size_t)vertex_size * (size_t)count;
 
-    if (size + r300render->vbo_offset > r300render->vbo_size) 
+    if (size + r300render->vbo_offset > r300render->vbo_size)
     {
+        pipe_buffer_reference(&r300->vbo, NULL);
         r300render->vbo = pipe_buffer_create(screen,
                                              64,
                                              PIPE_BUFFER_USAGE_VERTEX,
                                              R300_MAX_VBO_SIZE);
+        r300render->vbo_offset = 0;
         r300render->vbo_size = R300_MAX_VBO_SIZE;
     }
 
@@ -117,7 +119,7 @@ static void r300_render_unmap_vertices(struct vbuf_render* render,
     OUT_CS_REG(R300_VAP_VF_MAX_VTX_INDX, max);
     END_CS;
 
-    r300render->vbo_max_used = MAX2(r300render->vbo_max_used, 
+    r300render->vbo_max_used = MAX2(r300render->vbo_max_used,
                                     r300render->vertex_size * (max + 1));
     pipe_buffer_unmap(screen, r300render->vbo);
 }
@@ -129,7 +131,6 @@ static void r300_render_release_vertices(struct vbuf_render* render)
 
     r300render->vbo_offset += r300render->vbo_max_used;
     r300render->vbo_max_used = 0;
-    r300->vbo = NULL;
 }
 
 static boolean r300_render_set_primitive(struct vbuf_render* render,
@@ -222,13 +223,6 @@ static void r300_render_draw(struct vbuf_render* render,
 
     r300_prepare_render(r300render, count);
 
-    /* Send our indices into an index buffer. */
-    index_buffer = pipe_buffer_create(screen, 64, PIPE_BUFFER_USAGE_VERTEX,
-                                      count * 2);
-    if (!index_buffer) {
-        return;
-    }
-
     BEGIN_CS(2 + (count+1)/2);
     OUT_CS_PKT3(R300_PACKET3_3D_DRAW_INDX_2, (count+1)/2);
     OUT_CS(R300_VAP_VF_CNTL__PRIM_WALK_INDICES | (count << 16) |
diff --git a/src/gallium/drivers/r300/r300_screen.c b/src/gallium/drivers/r300/r300_screen.c
index 3b5b1bbd37f..f2659ca61f7 100644
--- a/src/gallium/drivers/r300/r300_screen.c
+++ b/src/gallium/drivers/r300/r300_screen.c
@@ -101,11 +101,9 @@ static int r300_get_param(struct pipe_screen* pscreen, int param)
         case PIPE_CAP_MAX_RENDER_TARGETS:
             return 4;
         case PIPE_CAP_OCCLUSION_QUERY:
-            /* IN THEORY */
-            return 0;
+            return 1;
         case PIPE_CAP_TEXTURE_SHADOW_MAP:
-            /* IN THEORY */
-            return 0;
+            return 1;
         case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
             if (r300screen->caps->is_r500) {
                 /* 13 == 4096x4096 */
@@ -347,16 +345,9 @@ static void* r300_transfer_map(struct pipe_screen* screen,
 {
     struct r300_texture* tex = (struct r300_texture*)transfer->texture;
     char* map;
-    unsigned flags = 0;
-
-    if (transfer->usage != PIPE_TRANSFER_WRITE) {
-        flags |= PIPE_BUFFER_USAGE_CPU_READ;
-    }
-    if (transfer->usage != PIPE_TRANSFER_READ) {
-        flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-    }
 
-    map = pipe_buffer_map(screen, tex->buffer, flags);
+    map = pipe_buffer_map(screen, tex->buffer,
+                          pipe_transfer_buffer_flags(transfer));
 
     if (!map) {
         return NULL;
diff --git a/src/gallium/drivers/r300/r300_state_derived.c b/src/gallium/drivers/r300/r300_state_derived.c
index 5f6b225d340..02b7ab91076 100644
--- a/src/gallium/drivers/r300/r300_state_derived.c
+++ b/src/gallium/drivers/r300/r300_state_derived.c
@@ -53,7 +53,7 @@ static void r300_vs_tab_routes(struct r300_context* r300,
     if (!r300screen->caps->has_tcl || !r300->rs_state->enable_vte)
     {
         for (i = 0; i < info->num_inputs; i++) {
-            switch (info->input_semantic_name[i]) {
+            switch (r300->vs->code.inputs[i]) {
                 case TGSI_SEMANTIC_POSITION:
                     pos = TRUE;
                     tab[i] = 0;
@@ -63,10 +63,12 @@ static void r300_vs_tab_routes(struct r300_context* r300,
                     cols++;
                     break;
                 case TGSI_SEMANTIC_PSIZE:
+                    assert(psize == FALSE);
                     psize = TRUE;
                     tab[i] = 15;
                     break;
                 case TGSI_SEMANTIC_FOG:
+                    assert(fog == FALSE);
                     fog = TRUE;
                     /* Fall through */
                 case TGSI_SEMANTIC_GENERIC:
@@ -125,7 +127,9 @@ static void r300_vs_tab_routes(struct r300_context* r300,
 
     vinfo->hwfmt[0] = 0x5555; /* XXX this is classic Mesa bonghits */
 
-    if (!pos) {
+    /* The vertex position attribute only needs to be added in the SW TCL
+     * case; with HW TCL it can be generated by the vertex shader. */
+    if (!pos && !r300screen->caps->has_tcl) {
         debug_printf("r300: Forcing vertex position attribute emit...\n");
         /* Make room for the position attribute
          * at the beginning of the tab. */
@@ -164,7 +168,7 @@ static void r300_vs_tab_routes(struct r300_context* r300,
         vinfo->hwfmt[3] |= (4 << (3 * i));
     }
 
-    for (i; i < texs; i++) {
+    for (; i < texs; i++) {
         draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE,
             draw_find_vs_output(r300->draw, TGSI_SEMANTIC_GENERIC, i));
         vinfo->hwfmt[1] |= (R300_INPUT_CNTL_TC0 << i);
@@ -334,48 +338,37 @@ static void r300_update_rs_block(struct r300_context* r300)
     struct r300_rs_block* rs = r300->rs_block;
     struct tgsi_shader_info* info = &r300->fs->info;
     int* tab = r300->vertex_info.fs_tab;
-    int col_count = 0, fp_offset = 0, i, memory_pos, tex_count = 0;
-
+    int col_count = 0, fp_offset = 0, i, tex_count = 0;
+    int rs_tex_comp = 0;
     memset(rs, 0, sizeof(struct r300_rs_block));
 
     if (r300_screen(r300->context.screen)->caps->is_r500) {
         for (i = 0; i < info->num_inputs; i++) {
             assert(tab[i] != -1);
-            memory_pos = tab[i] * 4;
             switch (info->input_semantic_name[i]) {
                 case TGSI_SEMANTIC_COLOR:
                     rs->ip[col_count] |=
-                        R500_RS_COL_PTR(memory_pos) |
+                        R500_RS_COL_PTR(col_count) |
                         R500_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
                     col_count++;
                     break;
                 case TGSI_SEMANTIC_GENERIC:
                     rs->ip[tex_count] |=
-                        R500_RS_SEL_S(memory_pos) |
-                        R500_RS_SEL_T(memory_pos + 1) |
-                        R500_RS_SEL_R(memory_pos + 2) |
-                        R500_RS_SEL_Q(memory_pos + 3);
+                        R500_RS_SEL_S(rs_tex_comp) |
+                        R500_RS_SEL_T(rs_tex_comp + 1) |
+                        R500_RS_SEL_R(rs_tex_comp + 2) |
+                        R500_RS_SEL_Q(rs_tex_comp + 3);
                     tex_count++;
+                    rs_tex_comp += 4;
                     break;
                 default:
                     break;
             }
         }
 
-        if (col_count == 0) {
-            rs->ip[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
-        }
-
-        if (tex_count == 0) {
-            rs->ip[0] |=
-                R500_RS_SEL_S(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_T(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_R(R500_RS_IP_PTR_K0) |
-                R500_RS_SEL_Q(R500_RS_IP_PTR_K1);
-        }
-
         /* Rasterize at least one color, or bad things happen. */
         if ((col_count == 0) && (tex_count == 0)) {
+            rs->ip[0] |= R500_RS_COL_FMT(R300_RS_COL_FMT_0001);
             col_count++;
         }
 
@@ -393,22 +386,22 @@ static void r300_update_rs_block(struct r300_context* r300)
     } else {
         for (i = 0; i < info->num_inputs; i++) {
             assert(tab[i] != -1);
-            memory_pos = tab[i] * 4;
             switch (info->input_semantic_name[i]) {
                 case TGSI_SEMANTIC_COLOR:
                     rs->ip[col_count] |=
-                        R300_RS_COL_PTR(memory_pos) |
+                        R300_RS_COL_PTR(col_count) |
                         R300_RS_COL_FMT(R300_RS_COL_FMT_RGBA);
                     col_count++;
                     break;
                 case TGSI_SEMANTIC_GENERIC:
                     rs->ip[tex_count] |=
-                        R300_RS_TEX_PTR(memory_pos) |
+                        R300_RS_TEX_PTR(rs_tex_comp) |
                         R300_RS_SEL_S(R300_RS_SEL_C0) |
                         R300_RS_SEL_T(R300_RS_SEL_C1) |
                         R300_RS_SEL_R(R300_RS_SEL_C2) |
                         R300_RS_SEL_Q(R300_RS_SEL_C3);
                     tex_count++;
+                    rs_tex_comp+=4;
                     break;
                 default:
                     break;
@@ -445,7 +438,7 @@ static void r300_update_rs_block(struct r300_context* r300)
         }
     }
 
-    rs->count = (tex_count * 4) | (col_count << R300_IC_COUNT_SHIFT) |
+    rs->count = (rs_tex_comp) | (col_count << R300_IC_COUNT_SHIFT) |
         R300_HIRES_EN;
 
     rs->inst_count = MAX2(MAX2(col_count - 1, tex_count - 1), 0);
diff --git a/src/gallium/drivers/r300/r300_texture.c b/src/gallium/drivers/r300/r300_texture.c
index 6e8c3683200..ce60ded7caf 100644
--- a/src/gallium/drivers/r300/r300_texture.c
+++ b/src/gallium/drivers/r300/r300_texture.c
@@ -22,34 +22,39 @@
 
 #include "r300_texture.h"
 
-static void r300_setup_texture_state(struct r300_texture* tex,
-                                     unsigned width,
-                                     unsigned height,
-                                     unsigned levels)
+static void r300_setup_texture_state(struct r300_texture* tex)
 {
     struct r300_texture_state* state = &tex->state;
+    struct pipe_texture *pt = &tex->tex;
 
-    state->format0 = R300_TX_WIDTH((width - 1) & 0x7ff) |
-        R300_TX_HEIGHT((height - 1) & 0x7ff) |
-        R300_TX_NUM_LEVELS(levels) |
+    state->format0 = R300_TX_WIDTH((pt->width[0] - 1) & 0x7ff) |
+        R300_TX_HEIGHT((pt->height[0] - 1) & 0x7ff) |
+        R300_TX_DEPTH(util_logbase2(pt->depth[0]) & 0xf) |
+        R300_TX_NUM_LEVELS(pt->last_level) |
         R300_TX_PITCH_EN;
 
     /* XXX */
-    state->format1 = r300_translate_texformat(tex->tex.format);
+    state->format1 = r300_translate_texformat(pt->format);
+    if (pt->target == PIPE_TEXTURE_CUBE) {
+	state->format1 |= R300_TX_FORMAT_CUBIC_MAP;
+    }
+    if (pt->target == PIPE_TEXTURE_3D) {
+	state->format1 |= R300_TX_FORMAT_3D;
+    }
 
-    state->format2 = r300_texture_get_stride(tex, 0);
+    state->format2 = (r300_texture_get_stride(tex, 0) / pt->block.size) - 1;
 
     /* Assume (somewhat foolishly) that oversized textures will
      * not be permitted by the state tracker. */
-    if (width > 2048) {
+    if (pt->width[0] > 2048) {
         state->format2 |= R500_TXWIDTH_BIT11;
     }
-    if (height > 2048) {
+    if (pt->height[0] > 2048) {
         state->format2 |= R500_TXHEIGHT_BIT11;
     }
 
-    debug_printf("r300: Set texture state (%dx%d, pitch %d, %d levels)\n",
-            width, height, levels);
+    debug_printf("r300: Set texture state (%dx%d, %d levels)\n",
+		 pt->width[0], pt->height[0], pt->last_level);
 }
 
 /**
@@ -62,7 +67,7 @@ unsigned r300_texture_get_stride(struct r300_texture* tex, unsigned level)
         return tex->stride_override;
 
     if (level > tex->tex.last_level) {
-        debug_printf("%s: level (%u) > last_level (%u)\n", level, tex->tex.last_level);
+        debug_printf("%s: level (%u) > last_level (%u)\n", __FUNCTION__, level, tex->tex.last_level);
         return 0;
     }
 
@@ -120,8 +125,7 @@ static struct pipe_texture*
 
     r300_setup_miptree(tex);
 
-    r300_setup_texture_state(tex, template->width[0], template->height[0],
-                             template->last_level);
+    r300_setup_texture_state(tex);
 
     tex->buffer = screen->buffer_create(screen, 1024,
                                         PIPE_BUFFER_USAGE_PIXEL,
@@ -204,7 +208,7 @@ static struct pipe_texture*
     tex->stride_override = *stride;
 
     /* XXX */
-    r300_setup_texture_state(tex, tex->tex.width[0], tex->tex.height[0], 0);
+    r300_setup_texture_state(tex);
 
     pipe_buffer_reference(&tex->buffer, buffer);
 
diff --git a/src/gallium/drivers/r300/r300_texture.h b/src/gallium/drivers/r300/r300_texture.h
index 3109af5baca..78ee0f1611a 100644
--- a/src/gallium/drivers/r300/r300_texture.h
+++ b/src/gallium/drivers/r300/r300_texture.h
@@ -72,6 +72,11 @@ static INLINE uint32_t r300_translate_texformat(enum pipe_format format)
         /* W24_FP */
         case PIPE_FORMAT_Z24S8_UNORM:
             return R300_EASY_TX_FORMAT(X, X, X, X, W24_FP);
+	/* Z5_Y6_X5 */
+        case PIPE_FORMAT_R16_SNORM:
+            return R300_EASY_TX_FORMAT(X, X, X, X, Z5Y6X5);
+        case PIPE_FORMAT_Z16_UNORM:
+	    return R300_EASY_TX_FORMAT(X, X, X, X, X16);
         default:
             debug_printf("r300: Implementation error: "
                 "Got unsupported texture format %s in %s\n",
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 516e3992fdd..bcb887a0b26 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -6,26 +6,17 @@ LIBNAME = softpipe
 C_SOURCES = \
 	sp_fs_exec.c \
 	sp_fs_sse.c \
-	sp_fs_llvm.c \
 	sp_clear.c \
 	sp_flush.c \
 	sp_query.c \
 	sp_context.c \
 	sp_draw_arrays.c \
-	sp_prim_setup.c \
 	sp_prim_vbuf.c \
 	sp_quad_pipe.c \
-	sp_quad_alpha_test.c \
-	sp_quad_blend.c \
-	sp_quad_colormask.c \
-	sp_quad_coverage.c \
+	sp_quad_stipple.c \
 	sp_quad_depth_test.c \
-	sp_quad_earlyz.c \
 	sp_quad_fs.c \
-	sp_quad_occlusion.c \
-	sp_quad_output.c \
-	sp_quad_stencil.c \
-	sp_quad_stipple.c \
+	sp_quad_blend.c \
 	sp_screen.c \
         sp_setup.c \
 	sp_state_blend.c \
@@ -38,7 +29,9 @@ C_SOURCES = \
 	sp_state_vertex.c \
 	sp_texture.c \
 	sp_tex_sample.c \
+	sp_tex_tile_cache.c \
 	sp_tile_cache.c \
-	sp_surface.c 
+	sp_surface.c \
+	sp_video_context.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index f8720638a76..aac9edf44e6 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -7,25 +7,16 @@ softpipe = env.ConvenienceLibrary(
 	source = [
 		'sp_fs_exec.c',
 		'sp_fs_sse.c',
-		'sp_fs_llvm.c',
 		'sp_clear.c',
 		'sp_context.c',
 		'sp_draw_arrays.c',
 		'sp_flush.c',
-		'sp_prim_setup.c',
 		'sp_prim_vbuf.c',
 		'sp_setup.c',
-		'sp_quad_alpha_test.c',
 		'sp_quad_blend.c',
 		'sp_quad_pipe.c',
-		'sp_quad_colormask.c',
-		'sp_quad_coverage.c',
 		'sp_quad_depth_test.c',
-		'sp_quad_earlyz.c',
 		'sp_quad_fs.c',
-		'sp_quad_occlusion.c',
-		'sp_quad_output.c',
-		'sp_quad_stencil.c',
 		'sp_quad_stipple.c',
 		'sp_query.c',
 		'sp_screen.c',
@@ -39,8 +30,10 @@ softpipe = env.ConvenienceLibrary(
 		'sp_state_vertex.c',
 		'sp_surface.c',
 		'sp_tex_sample.c',
+		'sp_tex_tile_cache.c',
 		'sp_texture.c',
 		'sp_tile_cache.c',
+		'sp_video_context.c',
 	])
 
-Export('softpipe')
-\ No newline at end of file
+Export('softpipe')
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index c699c433e9c..94d000a5acc 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -31,17 +31,18 @@
  */
 
 #include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_flush.h"
-#include "sp_prim_setup.h"
 #include "sp_prim_vbuf.h"
 #include "sp_state.h"
 #include "sp_surface.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_texture.h"
 #include "sp_winsys.h"
 #include "sp_query.h"
@@ -72,13 +73,10 @@ softpipe_unmap_transfers(struct softpipe_context *sp)
 {
    uint i;
 
-   for (i = 0; i < sp->framebuffer.nr_cbufs; i++)
-      sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
-   sp_flush_tile_cache(sp, sp->zsbuf_cache);
-
    for (i = 0; i < sp->framebuffer.nr_cbufs; i++) {
       sp_tile_cache_unmap_transfers(sp->cbuf_cache[i]);
    }
+
    sp_tile_cache_unmap_transfers(sp->zsbuf_cache);
 }
 
@@ -92,26 +90,21 @@ softpipe_destroy( struct pipe_context *pipe )
    if (softpipe->draw)
       draw_destroy( softpipe->draw );
 
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple->destroy( softpipe->quad[i].polygon_stipple );
-      softpipe->quad[i].earlyz->destroy( softpipe->quad[i].earlyz );
-      softpipe->quad[i].shade->destroy( softpipe->quad[i].shade );
-      softpipe->quad[i].alpha_test->destroy( softpipe->quad[i].alpha_test );
-      softpipe->quad[i].depth_test->destroy( softpipe->quad[i].depth_test );
-      softpipe->quad[i].stencil_test->destroy( softpipe->quad[i].stencil_test );
-      softpipe->quad[i].occlusion->destroy( softpipe->quad[i].occlusion );
-      softpipe->quad[i].coverage->destroy( softpipe->quad[i].coverage );
-      softpipe->quad[i].blend->destroy( softpipe->quad[i].blend );
-      softpipe->quad[i].colormask->destroy( softpipe->quad[i].colormask );
-      softpipe->quad[i].output->destroy( softpipe->quad[i].output );
-   }
+      softpipe->quad.shade->destroy( softpipe->quad.shade );
+      softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
+      softpipe->quad.blend->destroy( softpipe->quad.blend );
 
-   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
       sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
+      pipe_surface_reference(&softpipe->framebuffer.cbufs[i], NULL);
+   }
    sp_destroy_tile_cache(softpipe->zsbuf_cache);
+   pipe_surface_reference(&softpipe->framebuffer.zsbuf, NULL);
 
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-      sp_destroy_tile_cache(softpipe->tex_cache[i]);
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      sp_destroy_tex_tile_cache(softpipe->tex_cache[i]);
+      pipe_texture_reference(&softpipe->texture[i], NULL);
+   }
 
    for (i = 0; i < Elements(softpipe->constants); i++) {
       if (softpipe->constants[i].buffer) {
@@ -246,41 +239,14 @@ softpipe_create( struct pipe_screen *screen )
    softpipe->zsbuf_cache = sp_create_tile_cache( screen );
 
    for (i = 0; i < PIPE_MAX_SAMPLERS; i++)
-      softpipe->tex_cache[i] = sp_create_tile_cache( screen );
+      softpipe->tex_cache[i] = sp_create_tex_tile_cache( screen );
 
 
    /* setup quad rendering stages */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      softpipe->quad[i].polygon_stipple = sp_quad_polygon_stipple_stage(softpipe);
-      softpipe->quad[i].earlyz = sp_quad_earlyz_stage(softpipe);
-      softpipe->quad[i].shade = sp_quad_shade_stage(softpipe);
-      softpipe->quad[i].alpha_test = sp_quad_alpha_test_stage(softpipe);
-      softpipe->quad[i].depth_test = sp_quad_depth_test_stage(softpipe);
-      softpipe->quad[i].stencil_test = sp_quad_stencil_test_stage(softpipe);
-      softpipe->quad[i].occlusion = sp_quad_occlusion_stage(softpipe);
-      softpipe->quad[i].coverage = sp_quad_coverage_stage(softpipe);
-      softpipe->quad[i].blend = sp_quad_blend_stage(softpipe);
-      softpipe->quad[i].colormask = sp_quad_colormask_stage(softpipe);
-      softpipe->quad[i].output = sp_quad_output_stage(softpipe);
-   }
+      softpipe->quad.shade = sp_quad_shade_stage(softpipe);
+      softpipe->quad.depth_test = sp_quad_depth_test_stage(softpipe);
+      softpipe->quad.blend = sp_quad_blend_stage(softpipe);
 
-   /* vertex shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.vert_samplers[i].base.get_samples = sp_get_samples_vertex;
-      softpipe->tgsi.vert_samplers[i].unit = i;
-      softpipe->tgsi.vert_samplers[i].sp = softpipe;
-      softpipe->tgsi.vert_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.vert_samplers_list[i] = &softpipe->tgsi.vert_samplers[i];
-   }
-
-   /* fragment shader samplers */
-   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
-      softpipe->tgsi.frag_samplers[i].base.get_samples = sp_get_samples_fragment;
-      softpipe->tgsi.frag_samplers[i].unit = i;
-      softpipe->tgsi.frag_samplers[i].sp = softpipe;
-      softpipe->tgsi.frag_samplers[i].cache = softpipe->tex_cache[i];
-      softpipe->tgsi.frag_samplers_list[i] = &softpipe->tgsi.frag_samplers[i];
-   }
 
    /*
     * Create drawing context and plug our rendering stage into it.
@@ -294,30 +260,28 @@ softpipe_create( struct pipe_screen *screen )
                          (struct tgsi_sampler **)
                             softpipe->tgsi.vert_samplers_list);
 
-   softpipe->setup = sp_draw_render_stage(softpipe);
-   if (!softpipe->setup)
-      goto fail;
-
    if (debug_get_bool_option( "SP_NO_RAST", FALSE ))
       softpipe->no_rast = TRUE;
 
-   if (debug_get_bool_option( "SP_NO_VBUF", FALSE )) {
-      /* Deprecated path -- vbuf is the intended interface to the draw module:
-       */
-      draw_set_rasterize_stage(softpipe->draw, softpipe->setup);
-   }
-   else {
-      sp_init_vbuf(softpipe);
-   }
+   softpipe->vbuf_backend = sp_create_vbuf_backend(softpipe);
+   if (!softpipe->vbuf_backend)
+      goto fail;
+
+   softpipe->vbuf = draw_vbuf_stage(softpipe->draw, softpipe->vbuf_backend);
+   if (!softpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(softpipe->draw, softpipe->vbuf);
+   draw_set_render(softpipe->draw, softpipe->vbuf_backend);
+
+
 
    /* plug in AA line/point stages */
    draw_install_aaline_stage(softpipe->draw, &softpipe->pipe);
    draw_install_aapoint_stage(softpipe->draw, &softpipe->pipe);
 
-#if USE_DRAW_STAGE_PSTIPPLE
    /* Do polygon stipple w/ texture map + frag prog? */
    draw_install_pstipple_stage(softpipe->draw, &softpipe->pipe);
-#endif
 
    sp_init_surface_functions(softpipe);
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 7888c2f644b..43a195c8ef5 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -36,24 +36,13 @@
 #include "draw/draw_vertex.h"
 
 #include "sp_quad_pipe.h"
-#include "sp_tex_sample.h"
 
 
-/**
- * This is a temporary variable for testing draw-stage polygon stipple.
- * If zero, do stipple in sp_quad_stipple.c
- */
-#define USE_DRAW_STAGE_PSTIPPLE 1
-
-/* Number of threads working on individual quads.
- * Setting to 1 disables this feature.
- */
-#define SP_NUM_QUAD_THREADS 1
-
 struct softpipe_vbuf_render;
 struct draw_context;
 struct draw_stage;
 struct softpipe_tile_cache;
+struct softpipe_tex_tile_cache;
 struct sp_fragment_shader;
 struct sp_vertex_shader;
 
@@ -62,12 +51,12 @@ struct softpipe_context {
    struct pipe_context pipe;  /**< base class */
 
    /** Constant state objects */
-   const struct pipe_blend_state *blend;
-   const struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
-   const struct pipe_depth_stencil_alpha_state *depth_stencil;
-   const struct pipe_rasterizer_state *rasterizer;
-   const struct sp_fragment_shader *fs;
-   const struct sp_vertex_shader *vs;
+   struct pipe_blend_state *blend;
+   struct pipe_sampler_state *sampler[PIPE_MAX_SAMPLERS];
+   struct pipe_depth_stencil_alpha_state *depth_stencil;
+   struct pipe_rasterizer_state *rasterizer;
+   struct sp_fragment_shader *fs;
+   struct sp_vertex_shader *vs;
 
    /** Other rendering state */
    struct pipe_blend_color blend_color;
@@ -107,7 +96,16 @@ struct softpipe_context {
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
-   unsigned reduced_api_prim;  /**< PIPE_PRIM_POINTS, _LINES or _TRIANGLES */
+   /* The reduced version of the primitive supplied by the state
+    * tracker.
+    */
+   unsigned reduced_api_prim;
+
+   /* The reduced primitive after unfilled triangles, wide-line
+    * decomposition, etc, are taken into account.  This is the
+    * primitive actually rasterized.
+    */
+   unsigned reduced_prim;
 
    /** Derived from scissor and surface bounds: */
    struct pipe_scissor_state cliprect;
@@ -116,41 +114,33 @@ struct softpipe_context {
 
    /** Software quad rendering pipeline */
    struct {
-      struct quad_stage *polygon_stipple;
-      struct quad_stage *earlyz;
       struct quad_stage *shade;
-      struct quad_stage *alpha_test;
-      struct quad_stage *stencil_test;
       struct quad_stage *depth_test;
-      struct quad_stage *occlusion;
-      struct quad_stage *coverage;
       struct quad_stage *blend;
-      struct quad_stage *colormask;
-      struct quad_stage *output;
 
       struct quad_stage *first; /**< points to one of the above stages */
-   } quad[SP_NUM_QUAD_THREADS];
+   } quad;
 
    /** TGSI exec things */
    struct {
-      struct sp_shader_sampler vert_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *vert_samplers_list[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler frag_samplers[PIPE_MAX_SAMPLERS];
-      struct sp_shader_sampler *frag_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *vert_samplers_list[PIPE_MAX_SAMPLERS];
+      struct sp_sampler_varient *frag_samplers_list[PIPE_MAX_SAMPLERS];
    } tgsi;
 
    /** The primitive drawing context */
    struct draw_context *draw;
-   struct draw_stage *setup;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
    struct draw_stage *vbuf;
-   struct softpipe_vbuf_render *vbuf_render;
 
    boolean dirty_render_cache;
    
    struct softpipe_tile_cache *cbuf_cache[PIPE_MAX_COLOR_BUFS];
    struct softpipe_tile_cache *zsbuf_cache;
-
-   struct softpipe_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
+   
+   unsigned tex_timestamp;
+   struct softpipe_tex_tile_cache *tex_cache[PIPE_MAX_SAMPLERS];
 
    unsigned use_sse : 1;
    unsigned dump_fs : 1;
@@ -164,5 +154,9 @@ softpipe_context( struct pipe_context *pipe )
    return (struct softpipe_context *)pipe;
 }
 
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe);
+
+
 #endif /* SP_CONTEXT_H */
 
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index 4a14d49686e..e38b767cf2c 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -37,6 +37,7 @@
 #include "sp_surface.h"
 #include "sp_state.h"
 #include "sp_tile_cache.h"
+#include "sp_tex_tile_cache.h"
 #include "sp_winsys.h"
 
 
@@ -52,17 +53,19 @@ softpipe_flush( struct pipe_context *pipe,
 
    if (flags & PIPE_FLUSH_TEXTURE_CACHE) {
       for (i = 0; i < softpipe->num_textures; i++) {
-         sp_flush_tile_cache(softpipe, softpipe->tex_cache[i]);
+         sp_flush_tex_tile_cache(softpipe->tex_cache[i]);
       }
    }
 
-   if (flags & PIPE_FLUSH_RENDER_CACHE) {
+   if (flags & PIPE_FLUSH_SWAPBUFFERS) {
+      /* If this is a swapbuffers, just flush color buffers.
+       *
+       * The zbuffer changes are not discarded, but held in the cache
+       * in the hope that a later clear will wipe them out.
+       */
       for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
          if (softpipe->cbuf_cache[i])
-            sp_flush_tile_cache(softpipe, softpipe->cbuf_cache[i]);
-
-      if (softpipe->zsbuf_cache)
-         sp_flush_tile_cache(softpipe, softpipe->zsbuf_cache);
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
 
       /* Need this call for hardware buffers before swapbuffers.
        *
@@ -71,7 +74,15 @@ softpipe_flush( struct pipe_context *pipe,
        * to unmap surfaces when flushing.
        */
       softpipe_unmap_transfers(softpipe);
-      
+   }
+   else if (flags & PIPE_FLUSH_RENDER_CACHE) {
+      for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++)
+         if (softpipe->cbuf_cache[i])
+            sp_flush_tile_cache(softpipe->cbuf_cache[i]);
+
+      if (softpipe->zsbuf_cache)
+         sp_flush_tile_cache(softpipe->zsbuf_cache);
+     
       softpipe->dirty_render_cache = FALSE;
    }
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index 9ee86fe7878..4076114d392 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -59,15 +59,34 @@ sp_exec_fragment_shader(const struct sp_fragment_shader *base)
 }
 
 
+static void
+exec_prepare( const struct sp_fragment_shader *base,
+	      struct tgsi_exec_machine *machine,
+	      struct tgsi_sampler **samplers )
+{
+   /*
+    * Bind tokens and samplers to the interpreter machine.  Skipped when the
+    * machine already holds these tokens, so samplers are not rebound then.
+    */
+   if (machine->Tokens != base->shader.tokens) {
+      tgsi_exec_machine_bind_shader( machine,
+                                     base->shader.tokens,
+                                     PIPE_MAX_SAMPLERS,
+                                     samplers );
+   }
+}
+
+
+
 /**
  * Compute quad X,Y,Z,W for the four fragments in a quad.
  *
  * This should really be part of the compiled shader.
  */
-void
-sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
-		    float x, float y,
-		    struct tgsi_exec_vector *quadpos)
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+                 float x, float y,
+                 struct tgsi_exec_vector *quadpos)
 {
    uint chan;
    /* do X */
@@ -95,24 +114,6 @@ sp_setup_pos_vector(const struct tgsi_interp_coef *coef,
 }
 
 
-static void
-exec_prepare( const struct sp_fragment_shader *base,
-	      struct tgsi_exec_machine *machine,
-	      struct tgsi_sampler **samplers )
-{
-   /*
-    * Bind tokens/shader to the interpreter's machine state.
-    * Avoid redundant binding.
-    */
-   if (machine->Tokens != base->shader.tokens) {
-      tgsi_exec_machine_bind_shader( machine,
-                                     base->shader.tokens,
-                                     PIPE_MAX_SAMPLERS,
-                                     samplers );
-   }
-}
-
-
 /* TODO: hide the machine struct in here somewhere, remove from this
  * interface:
  */
@@ -122,11 +123,43 @@ exec_run( const struct sp_fragment_shader *base,
 	  struct quad_header *quad )
 {
    /* Compute X, Y, Z, W vals for this quad */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       &machine->QuadPos);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    &machine->QuadPos);
    
-   return tgsi_exec_machine_run( machine );
+   quad->inout.mask &= tgsi_exec_machine_run( machine );
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR:
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION:
+            {
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_fs_llvm.c b/src/gallium/drivers/softpipe/sp_fs_llvm.c
deleted file mode 100644
index 95c0d982d12..00000000000
--- a/src/gallium/drivers/softpipe/sp_fs_llvm.c
+++ /dev/null
@@ -1,205 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Execute fragment shader using LLVM code generation.
- * Authors:
- *   Zack Rusin
- */
-
-#include "sp_context.h"
-#include "sp_state.h"
-#include "sp_fs.h"
-
-#include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_sse2.h"
-
-#if 0
-
-/**
- * Subclass of sp_fragment_shader
- */
-struct sp_llvm_fragment_shader
-{
-   struct sp_fragment_shader base;
-   struct gallivm_prog *llvm_prog;
-};
-
-
-static void
-shade_quad_llvm(struct quad_stage *qs,
-                struct quad_header *quad)
-{
-   struct quad_shade_stage *qss = quad_shade_stage(qs);
-   struct softpipe_context *softpipe = qs->softpipe;
-   float dests[4][16][4] ALIGN16_ATTRIB;
-   float inputs[4][16][4] ALIGN16_ATTRIB;
-   const float fx = (float) quad->x0;
-   const float fy = (float) quad->y0;
-   struct gallivm_prog *llvm = qss->llvm_prog;
-
-   inputs[0][0][0] = fx;
-   inputs[1][0][0] = fx + 1.0f;
-   inputs[2][0][0] = fx;
-   inputs[3][0][0] = fx + 1.0f;
-
-   inputs[0][0][1] = fy;
-   inputs[1][0][1] = fy;
-   inputs[2][0][1] = fy + 1.0f;
-   inputs[3][0][1] = fy + 1.0f;
-
-
-   gallivm_prog_inputs_interpolate(llvm, inputs, quad->coef);
-
-#if DLLVM
-   debug_printf("MASK = %d\n", quad->mask);
-   for (int i = 0; i < 4; ++i) {
-      for (int j = 0; j < 2; ++j) {
-         debug_printf("IN(%d,%d) [%f %f %f %f]\n", i, j, 
-                inputs[i][j][0], inputs[i][j][1], inputs[i][j][2], inputs[i][j][3]);
-      }
-   }
-#endif
-
-   quad->mask &=
-      gallivm_fragment_shader_exec(llvm, fx, fy, dests, inputs,
-                                   softpipe->mapped_constants[PIPE_SHADER_FRAGMENT],
-                                   qss->samplers);
-#if DLLVM
-   debug_printf("OUT LLVM = 1[%f %f %f %f], 2[%f %f %f %f]\n",
-          dests[0][0][0], dests[0][0][1], dests[0][0][2], dests[0][0][3], 
-          dests[0][1][0], dests[0][1][1], dests[0][1][2], dests[0][1][3]);
-#endif
-
-   /* store result color */
-   if (qss->colorOutSlot >= 0) {
-      unsigned i;
-      /* XXX need to handle multiple color outputs someday */
-      allvmrt(qss->stage.softpipe->fs->info.output_semantic_name[qss->colorOutSlot]
-             == TGSI_SEMANTIC_COLOR);
-      for (i = 0; i < QUAD_SIZE; ++i) {
-         quad->outputs.color[0][0][i] = dests[i][qss->colorOutSlot][0];
-         quad->outputs.color[0][1][i] = dests[i][qss->colorOutSlot][1];
-         quad->outputs.color[0][2][i] = dests[i][qss->colorOutSlot][2];
-         quad->outputs.color[0][3][i] = dests[i][qss->colorOutSlot][3];
-      }
-   }
-#if DLLVM
-   for (int i = 0; i < QUAD_SIZE; ++i) {
-      debug_printf("QLLVM%d(%d) [%f, %f, %f, %f]\n", i, qss->colorOutSlot,
-             quad->outputs.color[0][0][i],
-             quad->outputs.color[0][1][i],
-             quad->outputs.color[0][2][i],
-             quad->outputs.color[0][3][i]);
-   }
-#endif
-
-   /* store result Z */
-   if (qss->depthOutSlot >= 0) {
-      /* output[slot] is new Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = dests[i][0][2];
-      }
-   }
-   else {
-      /* copy input Z (which was interpolated by the executor) to output Z */
-      uint i;
-      for (i = 0; i < 4; i++) {
-         quad->outputs.depth[i] = inputs[i][0][2];
-      }
-   }
-#if DLLVM
-   debug_printf("D [%f, %f, %f, %f] mask = %d\n",
-             quad->outputs.depth[0],
-             quad->outputs.depth[1],
-             quad->outputs.depth[2],
-             quad->outputs.depth[3], quad->mask);
-#endif
-
-   /* shader may cull fragments */
-   if( quad->mask ) {
-      qs->next->run( qs->next, quad );
-   }
-}
-
-
-unsigned 
-run_llvm_fs( const struct sp_fragment_shader *base,
-	     struct foo *machine )
-{
-}
-
-
-void 
-delete_llvm_fs( struct sp_fragment_shader *base )
-{
-   FREE(base);
-}
-
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-                        const struct pipe_shader_state *templ)
-{
-   struct sp_llvm_fragment_shader *shader = NULL;
-
-   /* LLVM fragment shaders currently disabled:
-    */
-   state = CALLOC_STRUCT(sp_llvm_shader_state);
-   if (!state)
-      return NULL;
-
-   state->llvm_prog = 0;
-
-   if (!gallivm_global_cpu_engine()) {
-      gallivm_cpu_engine_create(state->llvm_prog);
-   }
-   else
-      gallivm_cpu_jit_compile(gallivm_global_cpu_engine(), state->llvm_prog);
-   
-   if (shader) {
-      shader->base.run = run_llvm_fs;
-      shader->base.delete = delete_llvm_fs;
-   }
-
-   return shader;
-}
-
-
-#else
-
-struct sp_fragment_shader *
-softpipe_create_fs_llvm(struct softpipe_context *softpipe,
-		       const struct pipe_shader_state *templ)
-{
-   return NULL;
-}
-
-#endif
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index f4fa0905d74..9d3e4670eef 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -76,6 +76,43 @@ fs_sse_prepare( const struct sp_fragment_shader *base,
 }
 
 
+
+/**
+ * Fill quadpos with X,Y,Z,W for the quad's four fragments, ordered
+ * (x,y), (x+1,y), (x,y+1), (x+1,y+1).  Z and W use coef channels 2 and 3.
+ * This should really be part of the compiled shader.
+ */
+static void
+setup_pos_vector(const struct tgsi_interp_coef *coef,
+		    float x, float y,
+		    struct tgsi_exec_vector *quadpos)
+{
+   uint chan;
+   /* X: fragments 0 and 2 sit at x, fragments 1 and 3 at x + 1 */
+   quadpos->xyzw[0].f[0] = x;
+   quadpos->xyzw[0].f[1] = x + 1;
+   quadpos->xyzw[0].f[2] = x;
+   quadpos->xyzw[0].f[3] = x + 1;
+
+   /* Y: fragments 0 and 1 sit at y, fragments 2 and 3 at y + 1 */
+   quadpos->xyzw[1].f[0] = y;
+   quadpos->xyzw[1].f[1] = y;
+   quadpos->xyzw[1].f[2] = y + 1;
+   quadpos->xyzw[1].f[3] = y + 1;
+
+   /* Z and W: evaluate a0 + dadx * x + dady * y at each fragment */
+   for (chan = 2; chan < 4; chan++) {
+      const float dadx = coef->dadx[chan];
+      const float dady = coef->dady[chan];
+      const float a0 = coef->a0[chan] + dadx * x + dady * y;
+      quadpos->xyzw[chan].f[0] = a0;
+      quadpos->xyzw[chan].f[1] = a0 + dadx;
+      quadpos->xyzw[chan].f[2] = a0 + dady;
+      quadpos->xyzw[chan].f[3] = a0 + dadx + dady;
+   }
+}
+
+
 /* TODO: codegenerate the whole run function, skip this wrapper.
  * TODO: break dependency on tgsi_exec_machine struct
  * TODO: push Position calculation into the generated shader
@@ -89,9 +126,9 @@ fs_sse_run( const struct sp_fragment_shader *base,
    struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
 
    /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
-   sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->input.x0, (float)quad->input.y0, 
-		       machine->Temps);
+   setup_pos_vector(quad->posCoef, 
+                    (float)quad->input.x0, (float)quad->input.y0, 
+                    machine->Temps);
 
    /* init kill mask */
    tgsi_set_kill_mask(machine, 0x0);
@@ -104,7 +141,39 @@ fs_sse_run( const struct sp_fragment_shader *base,
 		 //	 , &machine->QuadPos
       );
 
-   return ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
+   if (quad->inout.mask == 0)
+      return FALSE;
+
+   /* store outputs: copy each shader output register into the quad by semantic */
+   {
+      const ubyte *sem_name = base->info.output_semantic_name;
+      const ubyte *sem_index = base->info.output_semantic_index;
+      const uint n = base->info.num_outputs;
+      uint i;
+      for (i = 0; i < n; i++) {
+         switch (sem_name[i]) {
+         case TGSI_SEMANTIC_COLOR: /* semantic index selects the color buffer */
+            {
+               uint cbuf = sem_index[i];
+               memcpy(quad->output.color[cbuf],
+                      &machine->Outputs[i].xyzw[0].f[0],
+                      sizeof(quad->output.color[0]) );
+            }
+            break;
+         case TGSI_SEMANTIC_POSITION: /* depth = Z channel of output register i */
+            {
+               uint j;
+               for (j = 0; j < 4; j++) {
+                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
+               }
+            }
+            break;
+         }
+      }
+   }
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.c b/src/gallium/drivers/softpipe/sp_prim_setup.c
deleted file mode 100644
index 038ff04d4f1..00000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief A draw stage that drives our triangle setup routines from
- * within the draw pipeline.  One of two ways to drive setup, the
- * other being in sp_prim_vbuf.c.
- *
- * \author  Keith Whitwell <[email protected]>
- * \author  Brian Paul
- */
-
-
-#include "sp_context.h"
-#include "sp_setup.h"
-#include "sp_state.h"
-#include "sp_prim_setup.h"
-#include "draw/draw_pipe.h"
-#include "draw/draw_vertex.h"
-#include "util/u_memory.h"
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_stage {
-   struct draw_stage stage; /**< This must be first (base class) */
-
-   struct setup_context *setup;
-};
-
-
-
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-
-
-typedef const float (*cptrf4)[4];
-
-static void
-do_tri(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-   
-   setup_tri( setup->setup,
-              (cptrf4)prim->v[0]->data,
-              (cptrf4)prim->v[1]->data,
-              (cptrf4)prim->v[2]->data );
-}
-
-static void
-do_line(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_line( setup->setup,
-               (cptrf4)prim->v[0]->data,
-               (cptrf4)prim->v[1]->data );
-}
-
-static void
-do_point(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   setup_point( setup->setup,
-                (cptrf4)prim->v[0]->data );
-}
-
-
-
-
-static void setup_begin( struct draw_stage *stage )
-{
-   struct setup_stage *setup = setup_stage(stage);
-
-   setup_prepare( setup->setup );
-
-   stage->point = do_point;
-   stage->line = do_line;
-   stage->tri = do_tri;
-}
-
-
-static void setup_first_point( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->point( stage, header );
-}
-
-static void setup_first_line( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->line( stage, header );
-}
-
-
-static void setup_first_tri( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->tri( stage, header );
-}
-
-
-
-static void setup_flush( struct draw_stage *stage,
-			 unsigned flags )
-{
-   stage->point = setup_first_point;
-   stage->line = setup_first_line;
-   stage->tri = setup_first_tri;
-}
-
-
-static void reset_stipple_counter( struct draw_stage *stage )
-{
-}
-
-
-static void render_destroy( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   setup_destroy_context(ssetup->setup);
-   FREE( stage );
-}
-
-
-/**
- * Create a new primitive setup/render stage.
- */
-struct draw_stage *sp_draw_render_stage( struct softpipe_context *softpipe )
-{
-   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
-
-   sstage->setup = setup_create_context(softpipe);
-   sstage->stage.draw = softpipe->draw;
-   sstage->stage.point = setup_first_point;
-   sstage->stage.line = setup_first_line;
-   sstage->stage.tri = setup_first_tri;
-   sstage->stage.flush = setup_flush;
-   sstage->stage.reset_stipple_counter = reset_stipple_counter;
-   sstage->stage.destroy = render_destroy;
-
-   return (struct draw_stage *)sstage;
-}
-
-struct setup_context *
-sp_draw_setup_context( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   return ssetup->setup;
-}
-
-void
-sp_draw_flush( struct draw_stage *stage )
-{
-   stage->flush( stage, 0 );
-}
diff --git a/src/gallium/drivers/softpipe/sp_prim_setup.h b/src/gallium/drivers/softpipe/sp_prim_setup.h
deleted file mode 100644
index 49bdd98ed87..00000000000
--- a/src/gallium/drivers/softpipe/sp_prim_setup.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef SP_PRIM_SETUP_H
-#define SP_PRIM_SETUP_H
-
-
-/**
- * vbuf is a special stage to gather the stream of triangles, lines, points
- * together and reconstruct vertex buffers for hardware upload.
- *
- * First attempt, work in progress.
- * 
- * TODO:
- *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
- *    - tell vbuf stage how to build hw vertices directly
- *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
- *
- *
- *
- * Vertices are just an array of floats, with all the attributes
- * packed.  We currently assume a layout like:
- *
- * attr[0][0..3] - window position
- * attr[1..n][0..3] - remaining attributes.
- *
- * Attributes are assumed to be 4 floats wide but are packed so that
- * all the enabled attributes run contiguously.
- */
-
-
-struct draw_stage;
-struct softpipe_context;
-
-
-typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
-                                unsigned prim,
-                                const ushort *elements,
-                                unsigned nr_elements,
-                                const void *vertex_buffer,
-                                unsigned nr_vertices );
-
-
-extern struct draw_stage *
-sp_draw_render_stage( struct softpipe_context *softpipe );
-
-extern struct setup_context *
-sp_draw_setup_context( struct draw_stage * );
-
-extern void
-sp_draw_flush( struct draw_stage * );
-
-
-extern struct draw_stage *
-sp_draw_vbuf_stage( struct draw_context *draw_context,
-                    struct pipe_context *pipe,
-                    vbuf_draw_func draw );
-
-
-#endif /* SP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.c b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
index 42021789ea8..e603c20fc4c 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.c
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.c
@@ -37,13 +37,13 @@
 
 
 #include "sp_context.h"
+#include "sp_setup.h"
 #include "sp_state.h"
 #include "sp_prim_vbuf.h"
-#include "sp_prim_setup.h"
-#include "sp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
 #include "util/u_memory.h"
+#include "util/u_prim.h"
 
 
 #define SP_MAX_VBUF_INDEXES 1024
@@ -58,6 +58,8 @@ struct softpipe_vbuf_render
 {
    struct vbuf_render base;
    struct softpipe_context *softpipe;
+   struct setup_context *setup;
+
    uint prim;
    uint vertex_size;
    uint nr_vertices;
@@ -74,6 +76,11 @@ softpipe_vbuf_render(struct vbuf_render *vbr)
 }
 
 
+
+
+
+
+
 static const struct vertex_info *
 sp_vbuf_get_vertex_info(struct vbuf_render *vbr)
 {
@@ -104,36 +111,6 @@ sp_vbuf_allocate_vertices(struct vbuf_render *vbr,
 static void
 sp_vbuf_release_vertices(struct vbuf_render *vbr)
 {
-#if 0
-   {
-      struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-      const struct vertex_info *info = 
-         softpipe_get_vbuf_vertex_info(cvbr->softpipe);
-      const float *vtx = (const float *) cvbr->vertex_buffer;
-      uint i, j;
-      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
-             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
-      for (i = 0; i < cvbr->nr_vertices; i++) {
-         for (j = 0; j < info->num_attribs; j++) {
-            uint k;
-            switch (info->attrib[j].emit) {
-            case EMIT_4F:  k = 4;   break;
-            case EMIT_3F:  k = 3;   break;
-            case EMIT_2F:  k = 2;   break;
-            case EMIT_1F:  k = 1;   break;
-            default: assert(0);
-            }
-            debug_printf("Vert %u attr %u: ", i, j);
-            while (k-- > 0) {
-               debug_printf("%g ", vtx[0]);
-               vtx++;
-            }
-            debug_printf("\n");
-         }
-      }
-   }
-#endif
-
    /* keep the old allocation for next time */
 }
 
@@ -159,14 +136,11 @@ static boolean
 sp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct setup_context *setup_ctx = sp_draw_setup_context(cvbr->softpipe->setup);
+   struct setup_context *setup_ctx = cvbr->setup;
    
    setup_prepare( setup_ctx );
 
+   cvbr->softpipe->reduced_prim = u_reduced_prim(prim);
    cvbr->prim = prim;
    return TRUE;
 
@@ -191,14 +165,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    struct softpipe_context *softpipe = cvbr->softpipe;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup_ctx = cvbr->setup;
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -237,14 +206,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-1], stride),
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -254,14 +225,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-1], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)], stride),
                        get_vert(vertex_buffer, indices[i-2], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i+(i&1)-2], stride),
                        get_vert(vertex_buffer, indices[i-(i&1)-1], stride),
@@ -271,14 +244,16 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -288,8 +263,8 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-2], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -299,7 +274,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
@@ -314,8 +291,8 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-1], stride),
@@ -325,7 +302,9 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
                        get_vert(vertex_buffer, indices[i-0], stride),
                        get_vert(vertex_buffer, indices[i-3], stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, indices[i-3], stride),
                        get_vert(vertex_buffer, indices[i-2], stride),
@@ -355,11 +334,6 @@ sp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    default:
       assert(0);
    }
-
-   /* XXX: why are we calling this???  If we had to call something, it
-    * would be a function in sp_setup.c:
-    */
-   sp_draw_flush( setup );
 }
 
 
@@ -372,17 +346,12 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
    struct softpipe_context *softpipe = cvbr->softpipe;
+   struct setup_context *setup_ctx = cvbr->setup;
    const unsigned stride = softpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * softpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = softpipe->setup;
-   struct setup_context *setup_ctx = sp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -421,14 +390,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 2; i < nr; i += 3) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-1, stride),
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 3) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -438,14 +409,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
-      for (i = 2; i < nr; i++) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i++) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-1, stride),
                        get_vert(vertex_buffer, i-(i&1), stride),
                        get_vert(vertex_buffer, i-2, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i++) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i+(i&1)-2, stride),
                        get_vert(vertex_buffer, i-(i&1)-1, stride),
@@ -455,14 +428,16 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_TRIANGLE_FAN:
-      for (i = 2; i < nr; i += 1) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 2; i < nr; i += 1) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, 0, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -472,8 +447,8 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 3; i < nr; i += 4) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-2, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -483,7 +458,9 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 4) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
@@ -497,8 +474,8 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 3; i < nr; i += 2) {
-         if (softpipe->rasterizer->flatshade_first) {
+      if (softpipe->rasterizer->flatshade_first) {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-1, stride),
@@ -508,7 +485,9 @@ sp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
                        get_vert(vertex_buffer, i-0, stride),
                        get_vert(vertex_buffer, i-3, stride) );
          }
-         else {
+      }
+      else {
+         for (i = 3; i < nr; i += 2) {
             setup_tri( setup_ctx,
                        get_vert(vertex_buffer, i-3, stride),
                        get_vert(vertex_buffer, i-2, stride),
@@ -546,40 +525,47 @@ static void
 sp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct softpipe_vbuf_render *cvbr = softpipe_vbuf_render(vbr);
-   cvbr->softpipe->vbuf_render = NULL;
+   setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
 
 
 /**
- * Initialize the post-transform vertex buffer information for the given
- * context.
+ * Create the post-transform vertex handler for the given context.
+ * Returns NULL if the backend or its setup context cannot be allocated.
  */
-void
-sp_init_vbuf(struct softpipe_context *sp)
+struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *sp)
 {
-   assert(sp->draw);
+   struct softpipe_vbuf_render *cvbr = CALLOC_STRUCT(softpipe_vbuf_render);
 
-   sp->vbuf_render = CALLOC_STRUCT(softpipe_vbuf_render);
+   assert(sp->draw);
 
-   sp->vbuf_render->base.max_indices = SP_MAX_VBUF_INDEXES;
-   sp->vbuf_render->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
+   /* CALLOC_STRUCT can fail; do not write through a NULL pointer */
+   if (!cvbr)
+      return NULL;
 
-   sp->vbuf_render->base.get_vertex_info = sp_vbuf_get_vertex_info;
-   sp->vbuf_render->base.allocate_vertices = sp_vbuf_allocate_vertices;
-   sp->vbuf_render->base.map_vertices = sp_vbuf_map_vertices;
-   sp->vbuf_render->base.unmap_vertices = sp_vbuf_unmap_vertices;
-   sp->vbuf_render->base.set_primitive = sp_vbuf_set_primitive;
-   sp->vbuf_render->base.draw = sp_vbuf_draw;
-   sp->vbuf_render->base.draw_arrays = sp_vbuf_draw_arrays;
-   sp->vbuf_render->base.release_vertices = sp_vbuf_release_vertices;
-   sp->vbuf_render->base.destroy = sp_vbuf_destroy;
+   cvbr->base.max_indices = SP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = SP_MAX_VBUF_SIZE;
 
-   sp->vbuf_render->softpipe = sp;
+   cvbr->base.get_vertex_info = sp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = sp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = sp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = sp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = sp_vbuf_set_primitive;
+   cvbr->base.draw = sp_vbuf_draw;
+   cvbr->base.draw_arrays = sp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = sp_vbuf_release_vertices;
+   cvbr->base.destroy = sp_vbuf_destroy;
 
-   sp->vbuf = draw_vbuf_stage(sp->draw, &sp->vbuf_render->base);
+   cvbr->softpipe = sp;
 
-   draw_set_rasterize_stage(sp->draw, sp->vbuf);
+   cvbr->setup = setup_create_context(cvbr->softpipe);
+   if (!cvbr->setup) {
+      /* NOTE(review): assumes setup_create_context() returns NULL on failure */
+      FREE(cvbr);
+      return NULL;
+   }
 
-   draw_set_render(sp->draw, &sp->vbuf_render->base);
+   return &cvbr->base;
 }
diff --git a/src/gallium/drivers/softpipe/sp_prim_vbuf.h b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
index 1de9cc2a894..ad01cc2f289 100644
--- a/src/gallium/drivers/softpipe/sp_prim_vbuf.h
+++ b/src/gallium/drivers/softpipe/sp_prim_vbuf.h
@@ -31,8 +31,8 @@
 
 struct softpipe_context;
 
-extern void
-sp_init_vbuf(struct softpipe_context *softpipe);
+extern struct vbuf_render *
+sp_create_vbuf_backend(struct softpipe_context *softpipe);
 
 
 #endif /* SP_VBUF_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad.h b/src/gallium/drivers/softpipe/sp_quad.h
index bd6c6cb9123..a3236bd1169 100644
--- a/src/gallium/drivers/softpipe/sp_quad.h
+++ b/src/gallium/drivers/softpipe/sp_quad.h
@@ -97,10 +97,10 @@ struct quad_header {
    struct quad_header_inout inout;
    struct quad_header_output output;
 
-   const struct tgsi_interp_coef *coef;
+   /* Redundant/duplicated:
+    */
    const struct tgsi_interp_coef *posCoef;
-
-   unsigned nr_attrs;
+   const struct tgsi_interp_coef *coef;
 };
 
 #endif /* SP_QUAD_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
deleted file mode 100644
index 0845bae0e68..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
+++ /dev/null
@@ -1,108 +0,0 @@
-
-/**
- * quad alpha test
- */
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-static void
-alpha_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const float ref = softpipe->depth_stencil->alpha.ref_value;
-   unsigned passMask = 0x0, j;
-   const uint cbuf = 0; /* only output[0].alpha is tested */
-   const float *aaaa = quad->output.color[cbuf][3];
-
-   switch (softpipe->depth_stencil->alpha.func) {
-   case PIPE_FUNC_NEVER:
-      break;
-   case PIPE_FUNC_LESS:
-      /*
-       * If mask were an array [4] we could do this SIMD-style:
-       * passMask = (quad->outputs.color[0][3] <= vec4(ref));
-       */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] < ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] == ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] <= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] > ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] != ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (aaaa[j] >= ref) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   quad->inout.mask &= passMask;
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void alpha_test_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void alpha_test_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *
-sp_quad_alpha_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = alpha_test_begin;
-   stage->run = alpha_test_quad;
-   stage->destroy = alpha_test_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index b1e18805c70..e243c63fa23 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -117,644 +117,865 @@ do { \
 
 
 static void
-logicop_quad(struct quad_stage *qs, struct quad_header *quad)
+logicop_quad(struct quad_stage *qs, 
+             float (*quadColor)[4],
+             float (*dest)[4])
 {
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   ubyte src[4][4], dst[4][4], res[4][4];
+   uint *src4 = (uint *) src;
+   uint *dst4 = (uint *) dst;
+   uint *res4 = (uint *) res;
+   uint j;
+
+
+   /* convert to ubyte */
+   for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
+      dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
+      dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
+      dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
+      dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
+
+      src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
+      src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
+      src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
+      src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
+   }
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      ubyte src[4][4], dst[4][4], res[4][4];
-      uint *src4 = (uint *) src;
-      uint *dst4 = (uint *) dst;
-      uint *res4 = (uint *) res;
-      struct softpipe_cached_tile *
-         tile = sp_get_cached_tile(softpipe,
-                                   softpipe->cbuf_cache[cbuf],
-                                   quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
+   switch (softpipe->blend->logicop_func) {
+   case PIPE_LOGICOP_CLEAR:
+      for (j = 0; j < 4; j++)
+         res4[j] = 0;
+      break;
+   case PIPE_LOGICOP_NOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] | dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j];
+      break;
+   case PIPE_LOGICOP_AND_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & ~dst4[j];
+      break;
+   case PIPE_LOGICOP_INVERT:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~dst4[j];
+      break;
+   case PIPE_LOGICOP_XOR:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j] ^ src4[j];
+      break;
+   case PIPE_LOGICOP_NAND:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] & dst4[j]);
+      break;
+   case PIPE_LOGICOP_AND:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] & dst4[j];
+      break;
+   case PIPE_LOGICOP_EQUIV:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~(src4[j] ^ dst4[j]);
+      break;
+   case PIPE_LOGICOP_NOOP:
+      for (j = 0; j < 4; j++)
+         res4[j] = dst4[j];
+      break;
+   case PIPE_LOGICOP_OR_INVERTED:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_COPY:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j];
+      break;
+   case PIPE_LOGICOP_OR_REVERSE:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | ~dst4[j];
+      break;
+   case PIPE_LOGICOP_OR:
+      for (j = 0; j < 4; j++)
+         res4[j] = src4[j] | dst4[j];
+      break;
+   case PIPE_LOGICOP_SET:
+      for (j = 0; j < 4; j++)
+         res4[j] = ~0;
+      break;
+   default:
+      assert(0);
+   }
 
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
+   for (j = 0; j < 4; j++) {
+      quadColor[j][0] = ubyte_to_float(res[j][0]);
+      quadColor[j][1] = ubyte_to_float(res[j][1]);
+      quadColor[j][2] = ubyte_to_float(res[j][2]);
+      quadColor[j][3] = ubyte_to_float(res[j][3]);
+   }
+}
 
-      /* convert to ubyte */
-      for (j = 0; j < 4; j++) { /* loop over R,G,B,A channels */
-         dst[j][0] = float_to_ubyte(dest[j][0]); /* P0 */
-         dst[j][1] = float_to_ubyte(dest[j][1]); /* P1 */
-         dst[j][2] = float_to_ubyte(dest[j][2]); /* P2 */
-         dst[j][3] = float_to_ubyte(dest[j][3]); /* P3 */
-
-         src[j][0] = float_to_ubyte(quadColor[j][0]); /* P0 */
-         src[j][1] = float_to_ubyte(quadColor[j][1]); /* P1 */
-         src[j][2] = float_to_ubyte(quadColor[j][2]); /* P2 */
-         src[j][3] = float_to_ubyte(quadColor[j][3]); /* P3 */
-      }
 
-      switch (softpipe->blend->logicop_func) {
-      case PIPE_LOGICOP_CLEAR:
-         for (j = 0; j < 4; j++)
-            res4[j] = 0;
-         break;
-      case PIPE_LOGICOP_NOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] | dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j];
-         break;
-      case PIPE_LOGICOP_AND_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & ~dst4[j];
-         break;
-      case PIPE_LOGICOP_INVERT:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~dst4[j];
-         break;
-      case PIPE_LOGICOP_XOR:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j] ^ src4[j];
-         break;
-      case PIPE_LOGICOP_NAND:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] & dst4[j]);
-         break;
-      case PIPE_LOGICOP_AND:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] & dst4[j];
-         break;
-      case PIPE_LOGICOP_EQUIV:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~(src4[j] ^ dst4[j]);
-         break;
-      case PIPE_LOGICOP_NOOP:
-         for (j = 0; j < 4; j++)
-            res4[j] = dst4[j];
-         break;
-      case PIPE_LOGICOP_OR_INVERTED:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_COPY:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j];
-         break;
-      case PIPE_LOGICOP_OR_REVERSE:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | ~dst4[j];
-         break;
-      case PIPE_LOGICOP_OR:
-         for (j = 0; j < 4; j++)
-            res4[j] = src4[j] | dst4[j];
-         break;
-      case PIPE_LOGICOP_SET:
-         for (j = 0; j < 4; j++)
-            res4[j] = ~0;
-         break;
-      default:
-         assert(0);
-      }
 
-      for (j = 0; j < 4; j++) {
-         quadColor[j][0] = ubyte_to_float(res[j][0]);
-         quadColor[j][1] = ubyte_to_float(res[j][1]);
-         quadColor[j][2] = ubyte_to_float(res[j][2]);
-         quadColor[j][3] = ubyte_to_float(res[j][3]);
-      }
+static void
+blend_quad(struct quad_stage *qs, 
+           float (*quadColor)[4],
+           float (*dest)[4])
+{
+   static const float zero[4] = { 0, 0, 0, 0 };
+   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
+   float source[4][QUAD_SIZE];
+
+   /*
+    * Compute src/first term RGB
+    */
+   switch (softpipe->blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[0], quadColor[0]); /* R */
+      VEC4_COPY(source[1], quadColor[1]); /* G */
+      VEC4_COPY(source[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
+      VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
+      VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+   {
+      const float *alpha = dest[3];
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+   {
+      const float *alpha = quadColor[3];
+      float diff[4], temp[4];
+      VEC4_SUB(diff, one, dest[3]);
+      VEC4_MIN(temp, alpha, diff);
+      VEC4_MUL(source[0], quadColor[0], temp); /* R */
+      VEC4_MUL(source[1], quadColor[1], temp); /* G */
+      VEC4_MUL(source[2], quadColor[2], temp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float alpha[4];
+      VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[0], zero); /* R */
+      VEC4_COPY(source[1], zero); /* G */
+      VEC4_COPY(source[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(source[0], quadColor[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(source[1], quadColor[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(source[2], quadColor[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+      assert(0); /* to do */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      assert(0); /* to do */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Compute src/first term A
+    */
+   switch (softpipe->blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+   {
+      const float *alpha = quadColor[3];
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      /* multiply alpha by 1.0 */
+      VEC4_COPY(source[3], quadColor[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(source[3], quadColor[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(source[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, quadColor[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_alpha[4];
+      VEC4_SUB(inv_alpha, one, dest[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      /* A */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(source[3], quadColor[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
    }
 
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
 
+   /*
+    * Compute dest/second term RGB
+    */
+   switch (softpipe->blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
+      VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
+      VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
+      VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
+      VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      assert(0); /* illegal */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[0], dest[0], comp); /* R */
+      VEC4_MUL(dest[1], dest[1], comp); /* G */
+      VEC4_MUL(dest[2], dest[2], comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[0], zero); /* R */
+      VEC4_COPY(dest[1], zero); /* G */
+      VEC4_COPY(dest[2], zero); /* B */
+      break;
+   case PIPE_BLENDFACTOR_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_SRC1_ALPHA:
+      /* XXX dual-source blend factors (second shader color): to do */
+      assert(0);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
+      VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
+      VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[0]); /* R */
+      VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
+      VEC4_SUB(inv_comp, one, dest[1]); /* G */
+      VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
+      VEC4_SUB(inv_comp, one, dest[2]); /* B */
+      VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+   {
+      float inv_comp[4];
+      /* R */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      /* G */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      /* B */
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[0], dest[0], inv_comp);
+      VEC4_MUL(dest[1], dest[1], inv_comp);
+      VEC4_MUL(dest[2], dest[2], inv_comp);
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
+   case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
+      /* XXX inverse dual-source blend factors: to do */
+      assert(0);
+      break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Compute dest/second term A
+    */
+   switch (softpipe->blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      /* dest = dest * 1   NO-OP, leave dest as-is */
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
+      break;
+   case PIPE_BLENDFACTOR_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_DST_ALPHA:
+      VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
+      assert(0); /* illegal */
+      break;
+   case PIPE_BLENDFACTOR_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_CONST_ALPHA:
+   {
+      float comp[4];
+      VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
+      VEC4_MUL(dest[3], dest[3], comp); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_ZERO:
+      VEC4_COPY(dest[3], zero); /* A */
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+   {
+      float one_minus_alpha[QUAD_SIZE];
+      VEC4_SUB(one_minus_alpha, one, quadColor[3]);
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_DST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_DST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SUB(inv_comp, one, dest[3]); /* A */
+      VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
+   }
+   break;
+   case PIPE_BLENDFACTOR_INV_CONST_COLOR:
+      /* fall-through */
+   case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
+   {
+      float inv_comp[4];
+      VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
+      VEC4_MUL(dest[3], dest[3], inv_comp);
+   }
+   break;
+   default:
+      assert(0);
+   }
 
+   /*
+    * Combine RGB terms
+    */
+   switch (softpipe->blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
+      VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
+      VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
+      break;
+   default:
+      assert(0);
+   }
+
+   /*
+    * Combine A terms
+    */
+   switch (softpipe->blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_REVERSE_SUBTRACT:
+      VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
+      break;
+   case PIPE_BLEND_MIN:
+      VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   case PIPE_BLEND_MAX:
+      VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
+      break;
+   default:
+      assert(0);
+   }
+}
 
 static void
-blend_quad(struct quad_stage *qs, struct quad_header *quad)
+colormask_quad(struct quad_stage *qs,
+               float (*quadColor)[4],
+               float (*dest)[4])
 {
-   static const float zero[4] = { 0, 0, 0, 0 };
-   static const float one[4] = { 1, 1, 1, 1 };
+   struct softpipe_context *softpipe = qs->softpipe;
 
+   /* R */
+   if (!(softpipe->blend->colormask & PIPE_MASK_R))
+      COPY_4V(quadColor[0], dest[0]);
+
+   /* G */
+   if (!(softpipe->blend->colormask & PIPE_MASK_G))
+      COPY_4V(quadColor[1], dest[1]);
+
+   /* B */
+   if (!(softpipe->blend->colormask & PIPE_MASK_B))
+      COPY_4V(quadColor[2], dest[2]);
+
+   /* A */
+   if (!(softpipe->blend->colormask & PIPE_MASK_A))
+      COPY_4V(quadColor[3], dest[3]);
+}
+
+
+static void
+blend_fallback(struct quad_stage *qs, 
+               struct quad_header *quads[],
+               unsigned nr)
+{
    struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
+   const struct pipe_blend_state *blend = softpipe->blend;
+   unsigned cbuf;
+
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) 
+   {
+      float dest[4][QUAD_SIZE];
+      struct softpipe_cached_tile *tile
+         = sp_get_cached_tile(softpipe->cbuf_cache[cbuf],
+                              quads[0]->input.x0, 
+                              quads[0]->input.y0);
+      uint q, i, j;
+
+      for (q = 0; q < nr; q++) {
+         struct quad_header *quad = quads[q];
+         float (*quadColor)[4] = quad->output.color[cbuf];
+         const int itx = (quad->input.x0 & (TILE_SIZE-1));
+         const int ity = (quad->input.y0 & (TILE_SIZE-1));
+
+         /* get/swizzle dest colors 
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) {
+               dest[i][j] = tile->data.color[y][x][i];
+            }
+         }
 
-   if (softpipe->blend->logicop_enable) {
-      logicop_quad(qs, quad);
-      return;
+
+         if (blend->logicop_enable) {
+            logicop_quad( qs, quadColor, dest );
+         }
+         else if (blend->blend_enable) {
+            blend_quad( qs, quadColor, dest );
+         }
+
+         if (blend->colormask != 0xf)
+            colormask_quad( qs, quadColor, dest );
+   
+         /* Output color values
+          */
+         for (j = 0; j < QUAD_SIZE; j++) {
+            if (quad->inout.mask & (1 << j)) {
+               int x = itx + (j & 1);
+               int y = ity + (j >> 1);
+               for (i = 0; i < 4; i++) { /* loop over color chans */
+                  tile->data.color[y][x][i] = quadColor[i][j];
+               }
+            }
+         }
+      }
    }
+}
 
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float source[4][QUAD_SIZE], dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
 
+static void
+blend_single_add_src_alpha_inv_src_alpha(struct quad_stage *qs, 
+                                         struct quad_header *quads[],
+                                         unsigned nr)
+{
+   static const float one[4] = { 1, 1, 1, 1 };
+   float one_minus_alpha[QUAD_SIZE];
+   float dest[4][QUAD_SIZE];
+   float source[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const float *alpha = quadColor[3];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
          for (i = 0; i < 4; i++) {
             dest[i][j] = tile->data.color[y][x][i];
          }
       }
 
-      /*
-       * Compute src/first term RGB
-       */
-      switch (softpipe->blend->rgb_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[0], quadColor[0]); /* R */
-         VEC4_COPY(source[1], quadColor[1]); /* G */
-         VEC4_COPY(source[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(source[0], quadColor[0], quadColor[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], quadColor[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(source[0], quadColor[0], dest[0]); /* R */
-         VEC4_MUL(source[1], quadColor[1], dest[1]); /* G */
-         VEC4_MUL(source[2], quadColor[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         {
-            const float *alpha = dest[3];
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         {
-            const float *alpha = quadColor[3];
-            float diff[4], temp[4];
-            VEC4_SUB(diff, one, dest[3]);
-            VEC4_MIN(temp, alpha, diff);
-            VEC4_MUL(source[0], quadColor[0], temp); /* R */
-            VEC4_MUL(source[1], quadColor[1], temp); /* G */
-            VEC4_MUL(source[2], quadColor[2], temp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float alpha[4];
-            VEC4_SCALAR(alpha, softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[0], zero); /* R */
-         VEC4_COPY(source[1], zero); /* G */
-         VEC4_COPY(source[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(source[0], quadColor[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(source[1], quadColor[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(source[2], quadColor[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(source[0], quadColor[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(source[1], quadColor[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(source[2], quadColor[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SCALAR(inv_alpha, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[0], quadColor[0], inv_alpha); /* R */
-            VEC4_MUL(source[1], quadColor[1], inv_alpha); /* G */
-            VEC4_MUL(source[2], quadColor[2], inv_alpha); /* B */
+      VEC4_MUL(source[0], quadColor[0], alpha); /* R */
+      VEC4_MUL(source[1], quadColor[1], alpha); /* G */
+      VEC4_MUL(source[2], quadColor[2], alpha); /* B */
+      VEC4_MUL(source[3], quadColor[3], alpha); /* A */
+
+      VEC4_SUB(one_minus_alpha, one, alpha);
+      VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
+      VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
+      VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
+      VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
+
+      VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-         assert(0); /* to do */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         assert(0); /* to do */
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute src/first term A
-       */
-      switch (softpipe->blend->alpha_src_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         {
-            const float *alpha = quadColor[3];
-            VEC4_MUL(source[3], quadColor[3], alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(source[3], quadColor[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         /* multiply alpha by 1.0 */
-         VEC4_COPY(source[3], quadColor[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(source[3], quadColor[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(source[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, quadColor[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_alpha[4];
-            VEC4_SUB(inv_alpha, one, dest[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            /* A */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(source[3], quadColor[3], inv_comp);
+static void
+blend_single_add_one_one(struct quad_stage *qs, 
+                         struct quad_header *quads[],
+                         unsigned nr)
+{
+   float dest[4][QUAD_SIZE];
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      /* get/swizzle dest colors */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = itx + (j & 1);
+         int y = ity + (j >> 1);
+         for (i = 0; i < 4; i++) {
+            dest[i][j] = tile->data.color[y][x][i];
          }
-         break;
-      default:
-         assert(0);
       }
+     
+      VEC4_ADD_SAT(quadColor[0], quadColor[0], dest[0]); /* R */
+      VEC4_ADD_SAT(quadColor[1], quadColor[1], dest[1]); /* G */
+      VEC4_ADD_SAT(quadColor[2], quadColor[2], dest[2]); /* B */
+      VEC4_ADD_SAT(quadColor[3], quadColor[3], dest[3]); /* A */
 
-
-      /*
-       * Compute dest/second term RGB
-       */
-      switch (softpipe->blend->rgb_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         VEC4_MUL(dest[0], dest[0], quadColor[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], quadColor[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], quadColor[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[0], dest[0], quadColor[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], quadColor[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], quadColor[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[0], dest[0], dest[3]); /* R * A */
-         VEC4_MUL(dest[1], dest[1], dest[3]); /* G * A */
-         VEC4_MUL(dest[2], dest[2], dest[3]); /* B * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         VEC4_MUL(dest[0], dest[0], dest[0]); /* R */
-         VEC4_MUL(dest[1], dest[1], dest[1]); /* G */
-         VEC4_MUL(dest[2], dest[2], dest[2]); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_SCALAR(comp, softpipe->blend_color.color[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[0], dest[0], comp); /* R */
-            VEC4_MUL(dest[1], dest[1], comp); /* G */
-            VEC4_MUL(dest[2], dest[2], comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[0], zero); /* R */
-         VEC4_COPY(dest[1], zero); /* G */
-         VEC4_COPY(dest[2], zero); /* B */
-         break;
-      case PIPE_BLENDFACTOR_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, quadColor[0]); /* R */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_SUB(inv_comp, one, quadColor[1]); /* G */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_SUB(inv_comp, one, quadColor[2]); /* B */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[0], dest[0], one_minus_alpha); /* R */
-            VEC4_MUL(dest[1], dest[1], one_minus_alpha); /* G */
-            VEC4_MUL(dest[2], dest[2], one_minus_alpha); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[0], inv_comp, dest[0]); /* R */
-            VEC4_MUL(dest[1], inv_comp, dest[1]); /* G */
-            VEC4_MUL(dest[2], inv_comp, dest[2]); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[0]); /* R */
-            VEC4_MUL(dest[0], dest[0], inv_comp); /* R */
-            VEC4_SUB(inv_comp, one, dest[1]); /* G */
-            VEC4_MUL(dest[1], dest[1], inv_comp); /* G */
-            VEC4_SUB(inv_comp, one, dest[2]); /* B */
-            VEC4_MUL(dest[2], dest[2], inv_comp); /* B */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         {
-            float inv_comp[4];
-            /* R */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[0]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            /* G */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[1]);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            /* B */
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[2]);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[0], dest[0], inv_comp);
-            VEC4_MUL(dest[1], dest[1], inv_comp);
-            VEC4_MUL(dest[2], dest[2], inv_comp);
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC1_COLOR:
-      case PIPE_BLENDFACTOR_INV_SRC1_ALPHA:
-         /* XXX what are these? */
-         assert(0);
-         break;
-      default:
-         assert(0);
       }
+   }
+}
 
-      /*
-       * Compute dest/second term A
-       */
-      switch (softpipe->blend->alpha_dst_factor) {
-      case PIPE_BLENDFACTOR_ONE:
-         /* dest = dest * 1   NO-OP, leave dest as-is */
-         break;
-      case PIPE_BLENDFACTOR_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_SRC_ALPHA:
-         VEC4_MUL(dest[3], dest[3], quadColor[3]); /* A * A */
-         break;
-      case PIPE_BLENDFACTOR_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_DST_ALPHA:
-         VEC4_MUL(dest[3], dest[3], dest[3]); /* A */
-         break;
-      case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
-         assert(0); /* illegal */
-         break;
-      case PIPE_BLENDFACTOR_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_CONST_ALPHA:
-         {
-            float comp[4];
-            VEC4_SCALAR(comp, softpipe->blend_color.color[3]); /* A */
-            VEC4_MUL(dest[3], dest[3], comp); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_ZERO:
-         VEC4_COPY(dest[3], zero); /* A */
-         break;
-      case PIPE_BLENDFACTOR_INV_SRC_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
-         {
-            float one_minus_alpha[QUAD_SIZE];
-            VEC4_SUB(one_minus_alpha, one, quadColor[3]);
-            VEC4_MUL(dest[3], dest[3], one_minus_alpha); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_DST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_DST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SUB(inv_comp, one, dest[3]); /* A */
-            VEC4_MUL(dest[3], inv_comp, dest[3]); /* A */
-         }
-         break;
-      case PIPE_BLENDFACTOR_INV_CONST_COLOR:
-         /* fall-through */
-      case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
-         {
-            float inv_comp[4];
-            VEC4_SCALAR(inv_comp, 1.0f - softpipe->blend_color.color[3]);
-            VEC4_MUL(dest[3], dest[3], inv_comp);
+
+static void
+single_output_color(struct quad_stage *qs, 
+                    struct quad_header *quads[],
+                    unsigned nr)
+{
+   uint i, j, q;
+
+   struct softpipe_cached_tile *tile
+      = sp_get_cached_tile(qs->softpipe->cbuf_cache[0],
+                           quads[0]->input.x0, 
+                           quads[0]->input.y0);
+
+   for (q = 0; q < nr; q++) {
+      struct quad_header *quad = quads[q];
+      float (*quadColor)[4] = quad->output.color[0];
+      const int itx = (quad->input.x0 & (TILE_SIZE-1));
+      const int ity = (quad->input.y0 & (TILE_SIZE-1));
+      
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (quad->inout.mask & (1 << j)) {
+            int x = itx + (j & 1);
+            int y = ity + (j >> 1);
+            for (i = 0; i < 4; i++) { /* loop over color chans */
+               tile->data.color[y][x][i] = quadColor[i][j];
+            }
          }
-         break;
-      default:
-         assert(0);
       }
+   }
+}
+
+static void
+blend_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+}
 
-      /*
-       * Combine RGB terms
-       */
-      switch (softpipe->blend->rgb_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_ADD_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_ADD_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[0], dest[0], source[0]); /* R */
-         VEC4_SUB_SAT(quadColor[1], dest[1], source[1]); /* G */
-         VEC4_SUB_SAT(quadColor[2], dest[2], source[2]); /* B */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MIN(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MIN(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[0], source[0], dest[0]); /* R */
-         VEC4_MAX(quadColor[1], source[1], dest[1]); /* G */
-         VEC4_MAX(quadColor[2], source[2], dest[2]); /* B */
-         break;
-      default:
-         assert(0);
-      }
 
-      /*
-       * Combine A terms
-       */
-      switch (softpipe->blend->alpha_func) {
-      case PIPE_BLEND_ADD:
-         VEC4_ADD_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_REVERSE_SUBTRACT:
-         VEC4_SUB_SAT(quadColor[3], dest[3], source[3]); /* A */
-         break;
-      case PIPE_BLEND_MIN:
-         VEC4_MIN(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      case PIPE_BLEND_MAX:
-         VEC4_MAX(quadColor[3], source[3], dest[3]); /* A */
-         break;
-      default:
-         assert(0);
+static void
+choose_blend_quad(struct quad_stage *qs, 
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   const struct pipe_blend_state *blend = softpipe->blend;
+
+   qs->run = blend_fallback;
+   
+   if (softpipe->framebuffer.nr_cbufs == 0) {
+      qs->run = blend_noop;
+   }
+   else if (!softpipe->blend->logicop_enable &&
+            softpipe->blend->colormask == 0xf) 
+   {
+      if (!blend->blend_enable) {
+         qs->run = single_output_color;
       }
+      else if (blend->rgb_src_factor == blend->alpha_src_factor &&
+               blend->rgb_dst_factor == blend->alpha_dst_factor &&
+               blend->rgb_func == blend->alpha_func &&
+               softpipe->framebuffer.nr_cbufs == 1)
+      {
+         if (blend->alpha_func == PIPE_BLEND_ADD) {
+            if (blend->rgb_src_factor == PIPE_BLENDFACTOR_ONE &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_ONE) {
+               qs->run = blend_single_add_one_one;
+            }
+            else if (blend->rgb_src_factor == PIPE_BLENDFACTOR_SRC_ALPHA &&
+                blend->rgb_dst_factor == PIPE_BLENDFACTOR_INV_SRC_ALPHA)
+               qs->run = blend_single_add_src_alpha_inv_src_alpha;
 
-   } /* cbuf loop */
+         }
+      }
+   }
 
-   /* pass blended quad to next stage */
-   qs->next->run(qs->next, quad);
+   qs->run(qs, quads, nr);
 }
 
 
 static void blend_begin(struct quad_stage *qs)
 {
-   qs->next->begin(qs->next);
+   qs->run = choose_blend_quad;
 }
 
 
@@ -770,7 +991,7 @@ struct quad_stage *sp_quad_blend_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = blend_begin;
-   stage->run = blend_quad;
+   stage->run = choose_blend_quad;
    stage->destroy = blend_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_bufloop.c b/src/gallium/drivers/softpipe/sp_quad_bufloop.c
deleted file mode 100644
index 953d8516b90..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_bufloop.c
+++ /dev/null
@@ -1,74 +0,0 @@
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Loop over colorbuffers, passing quad to next stage each time.
- */
-static void
-cbuf_loop_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   float tmp[PIPE_MAX_COLOR_BUFS][4][QUAD_SIZE];
-   unsigned i;
-
-   assert(sizeof(quad->outputs.color) == sizeof(tmp));
-   assert(softpipe->framebuffer.nr_cbufs <= PIPE_MAX_COLOR_BUFS);
-
-   /* make copy of original colors since they can get modified
-    * by blending and masking.
-    * XXX we won't have to do this if the fragment program actually emits
-    * N separate colors and we're drawing to N color buffers (MRT).
-    * But if we emitted one color and glDrawBuffer(GL_FRONT_AND_BACK) is
-    * in effect, we need to save/restore colors like this.
-    */
-   memcpy(tmp, quad->outputs.color, sizeof(tmp));
-
-   for (i = 0; i < softpipe->framebuffer.nr_cbufs; i++) {
-      /* set current cbuffer */
-#if 0 /* obsolete & going away */
-      softpipe->current_cbuf = i;
-#endif
-
-      /* pass blended quad to next stage */
-      qs->next->run(qs->next, quad);
-
-      /* restore quad's colors for next buffer */
-      memcpy(quad->outputs.color, tmp, sizeof(tmp));
-   }
-}
-
-
-static void cbuf_loop_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void cbuf_loop_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-/**
- * Create the colorbuffer loop stage.
- * This is used to implement multiple render targets and GL_FRONT_AND_BACK
- * rendering.
- */
-struct quad_stage *sp_quad_bufloop_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = cbuf_loop_begin;
-   stage->run = cbuf_loop_quad;
-   stage->destroy = cbuf_loop_destroy;
-
-   return stage;
-}
-
diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c
deleted file mode 100644
index dc90e5d5e99..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_colormask.c
+++ /dev/null
@@ -1,116 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  quad colormask stage
- * \author Brian Paul
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-
-/**
- * XXX colormask could be rolled into blending...
- */
-static void
-colormask_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      float dest[4][QUAD_SIZE];
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      uint i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
-         for (i = 0; i < 4; i++) {
-            dest[i][j] = tile->data.color[y][x][i];
-         }
-      }
-
-      /* R */
-      if (!(softpipe->blend->colormask & PIPE_MASK_R))
-          COPY_4V(quadColor[0], dest[0]);
-
-      /* G */
-      if (!(softpipe->blend->colormask & PIPE_MASK_G))
-          COPY_4V(quadColor[1], dest[1]);
-
-      /* B */
-      if (!(softpipe->blend->colormask & PIPE_MASK_B))
-          COPY_4V(quadColor[2], dest[2]);
-
-      /* A */
-      if (!(softpipe->blend->colormask & PIPE_MASK_A))
-          COPY_4V(quadColor[3], dest[3]);
-   }
-
-   /* pass quad to next stage */
-   qs->next->run(qs->next, quad);
-}
-
-
-static void colormask_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void colormask_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_colormask_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = colormask_begin;
-   stage->run = colormask_quad;
-   stage->destroy = colormask_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c
deleted file mode 100644
index 4aeee858705..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_coverage.c
+++ /dev/null
@@ -1,94 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-/**
- * \brief  Apply AA coverage to quad alpha valus
- * \author  Brian Paul
- */
-
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * Multiply quad's alpha values by the fragment coverage.
- */
-static void
-coverage_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   const uint prim = quad->input.prim;
-
-   if ((softpipe->rasterizer->poly_smooth && prim == QUAD_PRIM_TRI) ||
-       (softpipe->rasterizer->line_smooth && prim == QUAD_PRIM_LINE) ||
-       (softpipe->rasterizer->point_smooth && prim == QUAD_PRIM_POINT)) {
-      uint cbuf;
-
-      /* loop over colorbuffer outputs */
-      for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-         float (*quadColor)[4] = quad->output.color[cbuf];
-         unsigned j;
-         for (j = 0; j < QUAD_SIZE; j++) {
-            assert(quad->input.coverage[j] >= 0.0);
-            assert(quad->input.coverage[j] <= 1.0);
-         quadColor[3][j] *= quad->input.coverage[j];
-         }
-      }
-   }
-
-   qs->next->run(qs->next, quad);
-}
-
-
-static void coverage_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void coverage_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_coverage_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = coverage_begin;
-   stage->run = coverage_quad;
-   stage->destroy = coverage_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index d463930bae1..0ca86c4e1cb 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -31,61 +31,109 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
+#include "tgsi/tgsi_scan.h"
 #include "sp_context.h"
 #include "sp_quad.h"
 #include "sp_surface.h"
 #include "sp_quad_pipe.h"
 #include "sp_tile_cache.h"
+#include "sp_state.h"           /* for sp_fragment_shader */
 
 
-/**
- * Do depth testing for a quad.
- * Not static since it's used by the stencil code.
- */
+struct depth_data {
+   struct pipe_surface *ps;
+   enum pipe_format format;
+   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
+   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
+   ubyte stencilVals[QUAD_SIZE];
+   struct softpipe_cached_tile *tile;
+};
 
-/*
- * To increase efficiency, we should probably have multiple versions
- * of this function that are specifically for Z16, Z32 and FP Z buffers.
- * Try to effectively do that with codegen...
- */
 
-void
-sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+
+static void
+get_depth_stencil_values( struct depth_data *data,
+                          const struct quad_header *quad )
 {
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   const enum pipe_format format = ps->format;
-   unsigned bzzzz[QUAD_SIZE];  /**< Z values fetched from depth buffer */
-   unsigned qzzzz[QUAD_SIZE];  /**< Z values from the quad */
-   unsigned zmask = 0;
    unsigned j;
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
+   const struct softpipe_cached_tile *tile = data->tile;
+
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth16[y][x];
+      }
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_S8Z24_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+         data->stencilVals[j] = tile->data.depth32[y][x] >> 24;
+      }
+   break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_Z24S8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         data->bzzzz[j] = tile->data.depth32[y][x] >> 8;
+         data->stencilVals[j] = tile->data.depth32[y][x] & 0xff;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-   assert(ps); /* shouldn't get here if there's no zbuffer */
+/* If the shader has not been run, interpolate the depth values
+ * ourselves.
+ */
+static void
+interpolate_quad_depth( struct quad_header *quad )
+{
+   const float fx = (float) quad->input.x0;
+   const float fy = (float) quad->input.y0;
+   const float dzdx = quad->posCoef->dadx[2];
+   const float dzdy = quad->posCoef->dady[2];
+   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
 
-   /*
-    * Convert quad's float depth values to int depth values (qzzzz).
+   quad->output.depth[0] = z0;
+   quad->output.depth[1] = z0 + dzdx;
+   quad->output.depth[2] = z0 + dzdy;
+   quad->output.depth[3] = z0 + dzdx + dzdy;
+}
+
+
+static void
+convert_quad_depth( struct depth_data *data, 
+                    const struct quad_header *quad )
+{
+   unsigned j;
+
+   /* Convert quad's float depth values to int depth values (qzzzz).
     * If the Z buffer stores integer values, we _have_ to do the depth
     * compares with integers (not floats).  Otherwise, the float->int->float
     * conversion of Z values (which isn't an identity function) will cause
     * Z-fighting errors.
-    *
-    * Also, get the zbuffer values (bzzzz) from the cached tile.
     */
-   switch (format) {
+   switch (data->format) {
    case PIPE_FORMAT_Z16_UNORM:
       {
          float scale = 65535.0;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth16[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
@@ -94,47 +142,247 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          double scale = (double) (uint) ~0UL;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x];
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_X8Z24_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_S8Z24_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
-         }
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
       }
       break;
    case PIPE_FORMAT_Z24X8_UNORM:
-      /* fall-through */
    case PIPE_FORMAT_Z24S8_UNORM:
       {
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
+            data->qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
 
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            bzzzz[j] = tile->data.depth32[y][x] >> 8;
+
+
+static void
+write_depth_stencil_values( struct depth_data *data,
+                            struct quad_header *quad )
+{
+   struct softpipe_cached_tile *tile = data->tile;
+   unsigned j;
+
+   /* put updated Z values back into cached tile */
+   switch (data->format) {
+   case PIPE_FORMAT_Z16_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth16[y][x] = (ushort) data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_X8Z24_UNORM:
+   case PIPE_FORMAT_Z32_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->stencilVals[j] << 24) | data->bzzzz[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = (data->bzzzz[j] << 8) | data->stencilVals[j];
+      }
+      break;
+   case PIPE_FORMAT_Z24X8_UNORM:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
+         tile->data.depth32[y][x] = data->bzzzz[j] << 8;
+      }
+      break;
+   default:
+      assert(0);
+   }
+}
+
+
+
+
+/** Only 8-bit stencil supported */
+#define STENCIL_MAX 0xff
+
+
+/**
+ * Do the basic stencil test (compare stencil buffer values against the
+ * reference value).
+ *
+ * \param data->stencilVals  the stencil values from the stencil buffer
+ * \param func  the stencil func (PIPE_FUNC_x)
+ * \param ref  the stencil reference value
+ * \param valMask  the stencil value mask indicating which bits of the stencil
+ *                 values and ref value are to be used.
+ * \return mask indicating which pixels passed the stencil test
+ */
+static unsigned
+do_stencil_test(struct depth_data *data,
+                unsigned func,
+                unsigned ref, unsigned valMask)
+{
+   unsigned passMask = 0x0;
+   unsigned j;
+
+   ref &= valMask;
+
+   switch (func) {
+   case PIPE_FUNC_NEVER:
+      /* passMask = 0x0 */
+      break;
+   case PIPE_FUNC_LESS:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref < (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_EQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref == (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_LEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref <= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GREATER:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref > (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_NOTEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref != (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_GEQUAL:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (ref >= (data->stencilVals[j] & valMask)) {
+            passMask |= (1 << j);
+         }
+      }
+      break;
+   case PIPE_FUNC_ALWAYS:
+      passMask = MASK_ALL;
+      break;
+   default:
+      assert(0);
+   }
+
+   return passMask;
+}
+
+
+/**
+ * Apply the stencil operator to stencil values.
+ *
+ * \param data->stencilVals  the stencil buffer values (read and written)
+ * \param mask  indicates which pixels to update
+ * \param op  the stencil operator (PIPE_STENCIL_OP_x)
+ * \param ref  the stencil reference value
+ * \param wrtMask  writemask controlling which bits are changed in the
+ *                 stencil values
+ */
+static void
+apply_stencil_op(struct depth_data *data,
+                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
+{
+   unsigned j;
+   ubyte newstencil[QUAD_SIZE];
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      newstencil[j] = data->stencilVals[j];
+   }
+
+   switch (op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* no-op */
+      break;
+   case PIPE_STENCIL_OP_ZERO:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = 0;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_REPLACE:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ref;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] < STENCIL_MAX) {
+               newstencil[j] = data->stencilVals[j] + 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            if (data->stencilVals[j] > 0) {
+               newstencil[j] = data->stencilVals[j] - 1;
+            }
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] + 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = data->stencilVals[j] - 1;
+         }
+      }
+      break;
+   case PIPE_STENCIL_OP_INVERT:
+      for (j = 0; j < QUAD_SIZE; j++) {
+         if (mask & (1 << j)) {
+            newstencil[j] = ~data->stencilVals[j];
          }
       }
       break;
@@ -142,6 +390,39 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
       assert(0);
    }
 
+   /*
+    * update the stencil values
+    */
+   if (wrtMask != STENCIL_MAX) {
+      /* apply bit-wise stencil buffer writemask */
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & data->stencilVals[j]);
+      }
+   }
+   else {
+      for (j = 0; j < QUAD_SIZE; j++) {
+         data->stencilVals[j] = newstencil[j];
+      }
+   }
+}
+
+   
+
+/*
+ * To increase efficiency, we should probably have multiple versions
+ * of this function that are specifically for Z16, Z32 and FP Z buffers.
+ * Try to effectively do that with codegen...
+ */
+
+static boolean
+depth_test_quad(struct quad_stage *qs, 
+                struct depth_data *data,
+                struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned zmask = 0;
+   unsigned j;
+
    switch (softpipe->depth_stencil->depth.func) {
    case PIPE_FUNC_NEVER:
       /* zmask = 0 */
@@ -151,37 +432,37 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
        * Like this:  quad->mask &= (quad->outputs.depth < zzzz);
        */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] < bzzzz[j]) 
+	 if (data->qzzzz[j] < data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_EQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] == bzzzz[j]) 
+	 if (data->qzzzz[j] == data->bzzzz[j]) 
 	    zmask |= 1 << j;
       }
       break;
    case PIPE_FUNC_LEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] <= bzzzz[j]) 
+	 if (data->qzzzz[j] <= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GREATER:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] > bzzzz[j]) 
+	 if (data->qzzzz[j] > data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_NOTEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] != bzzzz[j]) 
+	 if (data->qzzzz[j] != data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
    case PIPE_FUNC_GEQUAL:
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (qzzzz[j] >= bzzzz[j]) 
+	 if (data->qzzzz[j] >= data->bzzzz[j]) 
 	    zmask |= (1 << j);
       }
       break;
@@ -193,80 +474,480 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
    }
 
    quad->inout.mask &= zmask;
+   if (quad->inout.mask == 0)
+      return FALSE;
 
+   /* Update our internal copy only if writemask set.  Even if
+    * depth.writemask is FALSE, may still need to write out buffer
+    * data due to stencil changes.
+    */
    if (softpipe->depth_stencil->depth.writemask) {
-      
-      /* This is also efficient with sse / spe instructions: 
-       */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (quad->inout.mask & (1 << j)) {
-	    bzzzz[j] = qzzzz[j];
-	 }
+         if (quad->inout.mask & (1 << j)) {
+            data->bzzzz[j] = data->qzzzz[j];
+         }
       }
+   }
 
-      /* put updated Z values back into cached tile */
-      switch (format) {
-      case PIPE_FORMAT_Z16_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth16[y][x] = (ushort) bzzzz[j];
+   return TRUE;
+}
+
+
+
+/**
+ * Do stencil (and depth) testing for one quad.  Stenciling depends on the
+ * outcome of depth testing.  Narrows quad->inout.mask to surviving pixels.
+ */
+static void
+depth_stencil_test_quad(struct quad_stage *qs, 
+                        struct depth_data *data,
+                        struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned func, zFailOp, zPassOp, failOp;
+   ubyte ref, wrtMask, valMask;
+   uint face = quad->input.facing;
+
+   if (!softpipe->depth_stencil->stencil[1].enabled) {
+      /* single-sided stencil test, use front (face=0) state */
+      face = 0;
+   }
+
+   /* choose front or back face function, operator, etc */
+   /* XXX we could do these initializations once per primitive */
+   func    = softpipe->depth_stencil->stencil[face].func;
+   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
+   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
+   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
+   ref     = softpipe->depth_stencil->stencil[face].ref_value;
+   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
+   valMask = softpipe->depth_stencil->stencil[face].valuemask;
+
+
+   /* do the stencil test first */
+   {
+      unsigned passMask, failMask;
+      passMask = do_stencil_test(data, func, ref, valMask);
+      failMask = quad->inout.mask & ~passMask;
+      quad->inout.mask &= passMask;
+      /* pixels that were live but failed the stencil test get failOp */
+      if (failOp != PIPE_STENCIL_OP_KEEP) {
+         apply_stencil_op(data, failMask, failOp, ref, wrtMask);
+      }
+   }
+
+   if (quad->inout.mask) {
+      /* now the pixels that passed the stencil test are depth tested */
+      if (softpipe->depth_stencil->depth.enabled) {
+         const unsigned origMask = quad->inout.mask;
+
+         depth_test_quad(qs, data, quad);  /* narrows quad->inout.mask; result unused */
+
+         /* update stencil buffer values according to z pass/fail result */
+         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zFailMask = origMask & ~quad->inout.mask;
+            apply_stencil_op(data, zFailMask, zFailOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_X8Z24_UNORM:
-         /* fall-through */
-         /* (yes, this falls through to a different case than above) */
-      case PIPE_FORMAT_Z32_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j];
+
+         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
+            const unsigned zPassMask = origMask & quad->inout.mask;
+            apply_stencil_op(data, zPassMask, zPassOp, ref, wrtMask);
          }
-         break;
-      case PIPE_FORMAT_S8Z24_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint s8z24 = tile->data.depth32[y][x];
-            s8z24 = (s8z24 & 0xff000000) | bzzzz[j];
-            tile->data.depth32[y][x] = s8z24;
+      }
+      else {
+         /* no depth test: apply Zpass op to stencil values (no KEEP shortcut here) */
+         apply_stencil_op(data, quad->inout.mask, zPassOp, ref, wrtMask);
+      }
+   }
+}
+
+/* Defines alpha_test_quads_<FUNC>(): alpha-test nr quads with COMP, return the survivor count. */
+#define ALPHATEST( FUNC, COMP )                                         \
+   static int                                                          \
+   alpha_test_quads_##FUNC( struct quad_stage *qs,                      \
+                           struct quad_header *quads[],                 \
+                           unsigned nr )                                \
+   {                                                                    \
+      const float ref = qs->softpipe->depth_stencil->alpha.ref_value;   \
+      const uint cbuf = 0; /* only output[0].alpha is tested */         \
+      unsigned pass_nr = 0;                                             \
+      unsigned i;                                                       \
+                                                                        \
+      for (i = 0; i < nr; i++) {                                        \
+         const float *aaaa = quads[i]->output.color[cbuf][3];           \
+         unsigned passMask = 0;                                         \
+         /* bit j is set when the alpha of pixel j passes COMP ref */   \
+         if (aaaa[0] COMP ref) passMask |= (1 << 0);                    \
+         if (aaaa[1] COMP ref) passMask |= (1 << 1);                    \
+         if (aaaa[2] COMP ref) passMask |= (1 << 2);                    \
+         if (aaaa[3] COMP ref) passMask |= (1 << 3);                    \
+                                                                        \
+         quads[i]->inout.mask &= passMask;                              \
+         /* keep only quads with a live pixel, packed to the front */   \
+         if (quads[i]->inout.mask)                                      \
+            quads[pass_nr++] = quads[i];                                \
+      }                                                                 \
+                                                                        \
+      return pass_nr;                                                   \
+   }
+
+/* One alpha_test_quads_<FUNC>() per comparison; ALWAYS/NEVER need no loop. */
+ALPHATEST( LESS,     < )
+ALPHATEST( EQUAL,    == )
+ALPHATEST( LEQUAL,   <= )
+ALPHATEST( GREATER,  > )
+ALPHATEST( NOTEQUAL, != )
+ALPHATEST( GEQUAL,   >= )
+
+
+/* Alpha-test nr quads using the function in alpha.func.  Returns how many
+ * quads survive; the survivors are packed at the front of quads[].
+ * XXX: Incorporate into shader using KILP. */
+static int
+alpha_test_quads(struct quad_stage *qs,
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   switch (qs->softpipe->depth_stencil->alpha.func) {
+   case PIPE_FUNC_LESS:
+      return alpha_test_quads_LESS( qs, quads, nr );
+   case PIPE_FUNC_EQUAL:
+      return alpha_test_quads_EQUAL( qs, quads, nr );
+   case PIPE_FUNC_LEQUAL:
+      return alpha_test_quads_LEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GREATER:
+      return alpha_test_quads_GREATER( qs, quads, nr );
+   case PIPE_FUNC_NOTEQUAL:
+      return alpha_test_quads_NOTEQUAL( qs, quads, nr );
+   case PIPE_FUNC_GEQUAL:
+      return alpha_test_quads_GEQUAL( qs, quads, nr );
+   case PIPE_FUNC_ALWAYS:
+      return nr;
+   case PIPE_FUNC_NEVER:
+   default:
+      return 0;
+   }
+}
+
+/* Number of set bits in a 4-bit quad mask; indexed by quad->inout.mask. */
+static const unsigned mask_count[16] = {
+   0,                           /* 0x0 */
+   1,                           /* 0x1 */
+   1,                           /* 0x2 */
+   2,                           /* 0x3 */
+   1,                           /* 0x4 */
+   2,                           /* 0x5 */
+   2,                           /* 0x6 */
+   3,                           /* 0x7 */
+   1,                           /* 0x8 */
+   2,                           /* 0x9 */
+   2,                           /* 0xa */
+   3,                           /* 0xb */
+   2,                           /* 0xc */
+   3,                           /* 0xd */
+   3,                           /* 0xe */
+   4,                           /* 0xf */
+};
+
+/* General path: alpha test, depth/stencil test and write-back, occlusion
+ * counting, then pass the surviving quads (packed in place) to qs->next. */
+static void
+depth_test_quads_fallback(struct quad_stage *qs, 
+                          struct quad_header *quads[],
+                          unsigned nr)
+{
+   unsigned i, pass = 0;
+   const struct sp_fragment_shader *fs = qs->softpipe->fs;
+   boolean interp_depth = !fs->info.writes_z;
+   struct depth_data data;
+
+   /* alpha test first: it may shrink nr and repack quads[] */
+   if (qs->softpipe->depth_stencil->alpha.enabled) {
+      nr = alpha_test_quads(qs, quads, nr);
+   }
+
+   if (qs->softpipe->framebuffer.zsbuf && 
+       (qs->softpipe->depth_stencil->depth.enabled ||
+        qs->softpipe->depth_stencil->stencil[0].enabled)) {
+      /* a single tile lookup per batch, keyed on the first quad's position */
+      data.ps = qs->softpipe->framebuffer.zsbuf;
+      data.format = data.ps->format;
+      data.tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, 
+                                     quads[0]->input.x0, 
+                                     quads[0]->input.y0);
+
+      for (i = 0; i < nr; i++) {
+         get_depth_stencil_values(&data, quads[i]);
+
+         if (qs->softpipe->depth_stencil->depth.enabled) {
+            if (interp_depth)
+               interpolate_quad_depth(quads[i]);
+
+            convert_quad_depth(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24S8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            uint z24s8 = tile->data.depth32[y][x];
-            z24s8 = (z24s8 & 0xff) | (bzzzz[j] << 8);
-            tile->data.depth32[y][x] = z24s8;
+
+         if (qs->softpipe->depth_stencil->stencil[0].enabled) {
+            depth_stencil_test_quad(qs, &data, quads[i]);
+            write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      case PIPE_FORMAT_Z24X8_UNORM:
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->input.x0 % TILE_SIZE + (j & 1);
-            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-            tile->data.depth32[y][x] = bzzzz[j] << 8;
+         else {
+            if (!depth_test_quad(qs, &data, quads[i]))
+               continue;
+
+            if (qs->softpipe->depth_stencil->depth.writemask)
+               write_depth_stencil_values(&data, quads[i]);
          }
-         break;
-      default:
-         assert(0);
+         /* NOTE(review): the stencil branch keeps the quad even if its mask is now 0 */
+
+         quads[pass++] = quads[i];
+      }
+
+      nr = pass;
+   }
+   /* active query: add the live-pixel count of each surviving quad */
+   if (qs->softpipe->active_query_count) {
+      for (i = 0; i < nr; i++) 
+         qs->softpipe->occlusion_count += mask_count[quads[i]->inout.mask];
+   }
+
+   if (nr)
+      qs->next->run(qs->next, quads, nr);
+}
+
+/* Z16 fast path for PIPE_FUNC_LESS with depth writes; Z is interpolated here.
+ * XXX: this function assumes setup function actually emits linear
+ * spans of quads.  It seems a lot more natural to do (early)
+ * depth-testing on spans rather than quads. */
+static void
+depth_interp_z16_less_write(struct quad_stage *qs,
+                            struct quad_header *quads[],
+                            unsigned nr)
+{
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+   /* Convert via int: a direct float -> ushort cast is undefined for negative values. */
+   idepth[0] = (ushort)(int)((z0) * scale);
+   idepth[1] = (ushort)(int)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)(int)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)(int)((z0 + dzdx + dzdy) * scale);
+   /* Z step between consecutive quads (2 pixels in x); a negative slope wraps in ushort arithmetic */
+   depth_step = (ushort)(int)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+   /* view of the tile whose [0][0] element is the first quad's top-left pixel */
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      /* test and write the 2x2 pixels: mask bit j <-> depth16[j >> 1][j & 1] */
+      if ((outmask & 1) && idepth[0] < depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] < depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] < depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] < depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
       }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
    }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
+
 }
 
 
 static void
-depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
+depth_interp_z16_lequal_write(struct quad_stage *qs,
+                            struct quad_header *quads[],
+                            unsigned nr)
 {
-   sp_depth_test_quad(qs, quad);
+   unsigned i, pass = 0;
+   const unsigned ix = quads[0]->input.x0;
+   const unsigned iy = quads[0]->input.y0;
+   const float fx = (float) ix;
+   const float fy = (float) iy;
+   const float dzdx = quads[0]->posCoef->dadx[2];
+   const float dzdy = quads[0]->posCoef->dady[2];
+   const float z0 = quads[0]->posCoef->a0[2] + dzdx * fx + dzdy * fy;
+   struct softpipe_cached_tile *tile;
+   ushort (*depth16)[TILE_SIZE];
+   ushort idepth[4], depth_step;
+   const float scale = 65535.0;
+   /* Z16 fast path for PIPE_FUNC_LEQUAL with depth writes; assumes a span of adjacent quads. */
+   idepth[0] = (ushort)(int)((z0) * scale);
+   idepth[1] = (ushort)(int)((z0 + dzdx) * scale);
+   idepth[2] = (ushort)(int)((z0 + dzdy) * scale);
+   idepth[3] = (ushort)(int)((z0 + dzdx + dzdy) * scale);
+   /* Casts go via int: float -> ushort is undefined for negative values; a negative step must wrap. */
+   depth_step = (ushort)(int)(dzdx * 2 * scale);
+
+   tile = sp_get_cached_tile(qs->softpipe->zsbuf_cache, ix, iy);
+   /* view of the tile whose [0][0] element is the first quad's top-left pixel */
+   depth16 = (ushort (*)[TILE_SIZE])
+      &tile->data.depth16[iy % TILE_SIZE][ix % TILE_SIZE];
+
+   for (i = 0; i < nr; i++) {
+      unsigned outmask = quads[i]->inout.mask;
+      unsigned mask = 0;
+      /* test and write the 2x2 pixels: mask bit j <-> depth16[j >> 1][j & 1] */
+      if ((outmask & 1) && idepth[0] <= depth16[0][0]) {
+         depth16[0][0] = idepth[0];
+         mask |= (1 << 0);
+      }
+
+      if ((outmask & 2) && idepth[1] <= depth16[0][1]) {
+         depth16[0][1] = idepth[1];
+         mask |= (1 << 1);
+      }
+
+      if ((outmask & 4) && idepth[2] <= depth16[1][0]) {
+         depth16[1][0] = idepth[2];
+         mask |= (1 << 2);
+      }
+
+      if ((outmask & 8) && idepth[3] <= depth16[1][1]) {
+         depth16[1][1] = idepth[3];
+         mask |= (1 << 3);
+      }
+
+      idepth[0] += depth_step;
+      idepth[1] += depth_step;
+      idepth[2] += depth_step;
+      idepth[3] += depth_step;
+
+      depth16 = (ushort (*)[TILE_SIZE]) &depth16[0][2];
+
+      quads[i]->inout.mask = mask;
+      if (quads[i]->inout.mask)
+         quads[pass++] = quads[i];
+   }
+
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
 }
 
 
+
+/* Used when alpha, depth and stencil tests are all off: forward the
+ * batch to the next stage untouched. */
+static void
+depth_noop(struct quad_stage *qs, 
+           struct quad_header *quads[],
+           unsigned nr)
+{
+   qs->next->run(qs->next, quads, nr);
+}
+
+
+
+/* Select the run() implementation matching the current alpha / depth /
+ * stencil / query state, store it in qs->run, and run this batch with it. */
+static void
+choose_depth_test(struct quad_stage *qs,
+                  struct quad_header *quads[],
+                  unsigned nr)
+{
+   boolean interp_depth = !qs->softpipe->fs->info.writes_z;
+   boolean alpha = qs->softpipe->depth_stencil->alpha.enabled;
+   boolean depth = (qs->softpipe->framebuffer.zsbuf &&
+                    qs->softpipe->depth_stencil->depth.enabled);
+   unsigned depthfunc = qs->softpipe->depth_stencil->depth.func;
+   boolean stencil = qs->softpipe->depth_stencil->stencil[0].enabled;
+   boolean depthwrite = qs->softpipe->depth_stencil->depth.writemask;
+   /* normalise the query counter to a strict TRUE/FALSE */
+   boolean occlusion = qs->softpipe->active_query_count != 0;
+
+   /* depth_noop does no occlusion counting (only the fallback does), so
+    * it is only valid while no query is active.
+    */
+   if (!alpha &&
+       !depth &&
+       !occlusion &&
+       !stencil) {
+      qs->run = depth_noop;
+   }
+   else if (!alpha &&
+            interp_depth &&
+            depth &&
+            depthwrite &&
+            !occlusion &&
+            !stencil)
+   {
+      /* specialised loops exist only for Z16 with LESS or LEQUAL */
+      switch (depthfunc) {
+      case PIPE_FUNC_LESS:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_less_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      case PIPE_FUNC_LEQUAL:
+         switch (qs->softpipe->framebuffer.zsbuf->format) {
+         case PIPE_FORMAT_Z16_UNORM:
+            qs->run = depth_interp_z16_lequal_write;
+            break;
+         default:
+            qs->run = depth_test_quads_fallback;
+            break;
+         }
+         break;
+      default:
+         qs->run = depth_test_quads_fallback;
+      }
+   }
+   else {
+      qs->run = depth_test_quads_fallback;
+   }
+
+   qs->run( qs, quads, nr );
+}
+
+
+
+
+
 static void depth_test_begin(struct quad_stage *qs)
 {
+   qs->run = choose_depth_test;   /* re-pick the run() variant on the next batch */
    qs->next->begin(qs->next);
 }
 
@@ -283,7 +964,7 @@ struct quad_stage *sp_quad_depth_test_stage( struct softpipe_context *softpipe )
 
    stage->softpipe = softpipe;
    stage->begin = depth_test_begin;
-   stage->run = depth_test_quad;
+   stage->run = choose_depth_test;
    stage->destroy = depth_test_destroy;
 
    return stage;
diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
deleted file mode 100644
index 496fd39ed1a..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c
+++ /dev/null
@@ -1,88 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief  Quad early-z testing
- */
-
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_quad.h"
-#include "sp_quad_pipe.h"
-
-
-/**
- * All this stage does is compute the quad's Z values (which is normally
- * done by the shading stage).
- * The next stage will do the actual depth test.
- */
-static void
-earlyz_quad(
-   struct quad_stage    *qs,
-   struct quad_header   *quad )
-{
-   const float fx = (float) quad->input.x0;
-   const float fy = (float) quad->input.y0;
-   const float dzdx = quad->posCoef->dadx[2];
-   const float dzdy = quad->posCoef->dady[2];
-   const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-   quad->output.depth[0] = z0;
-   quad->output.depth[1] = z0 + dzdx;
-   quad->output.depth[2] = z0 + dzdy;
-   quad->output.depth[3] = z0 + dzdx + dzdy;
-
-   qs->next->run( qs->next, quad );
-}
-
-static void
-earlyz_begin(
-   struct quad_stage *qs )
-{
-   qs->next->begin( qs->next );
-}
-
-static void
-earlyz_destroy(
-   struct quad_stage *qs )
-{
-   FREE( qs );
-}
-
-struct quad_stage *
-sp_quad_earlyz_stage(
-   struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT( quad_stage );
-
-   stage->softpipe = softpipe;
-   stage->begin = earlyz_begin;
-   stage->run = earlyz_quad;
-   stage->destroy = earlyz_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 28f8d1a60ea..1e7533d0f9e 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -68,72 +68,69 @@ quad_shade_stage(struct quad_stage *qs)
 /**
  * Execute fragment shader for the four fragments in the quad.
  */
-static void
+static INLINE boolean
 shade_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct quad_shade_stage *qss = quad_shade_stage( qs );
    struct softpipe_context *softpipe = qs->softpipe;
    struct tgsi_exec_machine *machine = qss->machine;
-   boolean z_written;
-   
-   /* Consts do not require 16 byte alignment. */
-   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
-
-   machine->InterpCoefs = quad->coef;
 
    /* run shader */
-   quad->inout.mask &= softpipe->fs->run( softpipe->fs, machine, quad );
-
-   /* store outputs */
-   z_written = FALSE;
-   {
-      const ubyte *sem_name = softpipe->fs->info.output_semantic_name;
-      const ubyte *sem_index = softpipe->fs->info.output_semantic_index;
-      const uint n = qss->stage.softpipe->fs->info.num_outputs;
-      uint i;
-      for (i = 0; i < n; i++) {
-         switch (sem_name[i]) {
-         case TGSI_SEMANTIC_COLOR:
-            {
-               uint cbuf = sem_index[i];
-               memcpy(quad->output.color[cbuf],
-                      &machine->Outputs[i].xyzw[0].f[0],
-                      sizeof(quad->output.color[0]) );
-            }
-            break;
-         case TGSI_SEMANTIC_POSITION:
-            {
-               uint j;
-               for (j = 0; j < 4; j++) {
-                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
-               }
-               z_written = TRUE;
-            }
-            break;
-         }
+   return softpipe->fs->run( softpipe->fs, machine, quad );   /* zero => shade_quads() drops the quad; NOTE(review): confirm run() updates inout.mask itself */
+}
+
+
+/* Scale channel 3 of every colorbuffer output by the per-pixel coverage. */
+static void
+coverage_quad(struct quad_stage *qs, struct quad_header *quad)
+{
+   struct softpipe_context *softpipe = qs->softpipe;
+   uint cbuf;
+
+   /* loop over colorbuffer outputs */
+   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
+      float (*quadColor)[4] = quad->output.color[cbuf];
+      unsigned j;
+      for (j = 0; j < QUAD_SIZE; j++) {
+         assert(quad->input.coverage[j] >= 0.0);
+         assert(quad->input.coverage[j] <= 1.0);
+         quadColor[3][j] *= quad->input.coverage[j];   /* [3] is the channel the alpha test reads */
       }
    }
+}
+
 
-   if (!z_written) {
-      /* compute Z values now, as in the quad earlyz stage */
-      /* XXX we should really only do this if the earlyz stage is not used */
-      const float fx = (float) quad->input.x0;
-      const float fy = (float) quad->input.y0;
-      const float dzdx = quad->posCoef->dadx[2];
-      const float dzdy = quad->posCoef->dady[2];
-      const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
-
-      quad->output.depth[0] = z0;
-      quad->output.depth[1] = z0 + dzdx;
-      quad->output.depth[2] = z0 + dzdy;
-      quad->output.depth[3] = z0 + dzdx + dzdy;
-   }
 
-   /* shader may cull fragments */
-   if (quad->inout.mask) {
-      qs->next->run( qs->next, quad );
+static void
+shade_quads(struct quad_stage *qs, 
+                 struct quad_header *quads[],
+                 unsigned nr)
+{
+   struct quad_shade_stage *qss = quad_shade_stage( qs );
+   struct softpipe_context *softpipe = qs->softpipe;
+   struct tgsi_exec_machine *machine = qss->machine;
+
+   unsigned i, pass = 0;
+   /* per-batch setup; interpolation coefficients are taken from quads[0] only */
+   machine->Consts = softpipe->mapped_constants[PIPE_SHADER_FRAGMENT];
+   machine->InterpCoefs = quads[0]->coef;
+
+   for (i = 0; i < nr; i++) {
+      if (!shade_quad(qs, quads[i]))
+         continue;
+      /* coverage scaling is compiled in but disabled by the constant 0 below */
+      if (/*do_coverage*/ 0)
+         coverage_quad( qs, quads[i] );
+
+      quads[pass++] = quads[i];
    }
+   /* forward only the quads for which shade_quad() returned non-zero */
+   if (pass)
+      qs->next->run(qs->next, quads, pass);
 }
+   
+
+
 
 
 /**
@@ -174,7 +171,7 @@ sp_quad_shade_stage( struct softpipe_context *softpipe )
 
    qss->stage.softpipe = softpipe;
    qss->stage.begin = shade_begin;
-   qss->stage.run = shade_quad;
+   qss->stage.run = shade_quads;
    qss->stage.destroy = shade_destroy;
 
    qss->machine = tgsi_exec_machine_create();
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
deleted file mode 100644
index 92d5f9f3c1a..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ /dev/null
@@ -1,103 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-#include "sp_tile_cache.h"
-
-
-/**
- * Last step of quad processing: write quad colors to the framebuffer,
- * taking mask into account.
- */
-static void
-output_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   /* in-tile pos: */
-   const int itx = quad->input.x0 % TILE_SIZE;
-   const int ity = quad->input.y0 % TILE_SIZE;
-
-   struct softpipe_context *softpipe = qs->softpipe;
-   uint cbuf;
-
-   /* loop over colorbuffer outputs */
-   for (cbuf = 0; cbuf < softpipe->framebuffer.nr_cbufs; cbuf++) {
-      struct softpipe_cached_tile *tile
-         = sp_get_cached_tile(softpipe,
-                              softpipe->cbuf_cache[cbuf],
-                              quad->input.x0, quad->input.y0);
-      float (*quadColor)[4] = quad->output.color[cbuf];
-      int i, j;
-
-      /* get/swizzle dest colors */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (quad->inout.mask & (1 << j)) {
-            int x = itx + (j & 1);
-            int y = ity + (j >> 1);
-            for (i = 0; i < 4; i++) { /* loop over color chans */
-               tile->data.color[y][x][i] = quadColor[i][j];
-            }
-            if (0) {
-               debug_printf("sp write pixel %d,%d: %g, %g, %g\n",
-                            quad->input.x0 + x,
-                            quad->input.y0 + y,
-                            quadColor[0][j],
-                            quadColor[1][j],
-                            quadColor[2][j]);
-            }
-         }
-      }
-   }
-}
-
-
-static void output_begin(struct quad_stage *qs)
-{
-   assert(qs->next == NULL);
-}
-
-
-static void output_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = output_begin;
-   stage->run = output_quad;
-   stage->destroy = output_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.c b/src/gallium/drivers/softpipe/sp_quad_pipe.c
index b5f69b74264..1b5bab4eca6 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.c
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.c
@@ -31,88 +31,33 @@
 #include "pipe/p_shader_tokens.h"
 
 static void
-sp_push_quad_first(
-   struct softpipe_context *sp,
-   struct quad_stage *quad,
-   uint i )
+sp_push_quad_first( struct softpipe_context *sp,
+                    struct quad_stage *quad )
 {
-   quad->next = sp->quad[i].first;
-   sp->quad[i].first = quad;
+   quad->next = sp->quad.first;   /* prepend: the current head becomes this stage's successor */
+   sp->quad.first = quad;         /* the pushed stage is now the pipeline entry point */
 }
 
-static void
-sp_build_depth_stencil(
-   struct softpipe_context *sp,
-   uint i )
-{
-   if (sp->depth_stencil->stencil[0].enabled ||
-       sp->depth_stencil->stencil[1].enabled) {
-      sp_push_quad_first( sp, sp->quad[i].stencil_test, i );
-   }
-   else if (sp->depth_stencil->depth.enabled &&
-            sp->framebuffer.zsbuf) {
-      sp_push_quad_first( sp, sp->quad[i].depth_test, i );
-   }
-}
 
 void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
-   uint i;
-
    boolean early_depth_test =
-               sp->depth_stencil->depth.enabled &&
-               sp->framebuffer.zsbuf &&
-               !sp->depth_stencil->alpha.enabled &&
-               !sp->fs->info.uses_kill &&
-               !sp->fs->info.writes_z;
-
-   /* build up the pipeline in reverse order... */
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first = sp->quad[i].output;
-
-      if (sp->blend->colormask != 0xf) {
-         sp_push_quad_first( sp, sp->quad[i].colormask, i );
-      }
+      sp->depth_stencil->depth.enabled &&
+      sp->framebuffer.zsbuf &&
+      !sp->depth_stencil->alpha.enabled &&
+      !sp->fs->info.uses_kill &&
+      !sp->fs->info.writes_z;
 
-      if (sp->blend->blend_enable ||
-          sp->blend->logicop_enable) {
-         sp_push_quad_first( sp, sp->quad[i].blend, i );
-      }
+   sp->quad.first = sp->quad.blend;   /* blend is always the tail; stages below are prepended */
 
-      if (sp->active_query_count) {
-         sp_push_quad_first( sp, sp->quad[i].occlusion, i );
-      }
-
-      if (sp->rasterizer->poly_smooth ||
-          sp->rasterizer->line_smooth ||
-          sp->rasterizer->point_smooth) {
-         sp_push_quad_first( sp, sp->quad[i].coverage, i );
-      }
-
-      if (!early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-      }
-
-      if (sp->depth_stencil->alpha.enabled) {
-         sp_push_quad_first( sp, sp->quad[i].alpha_test, i );
-      }
-
-      /* XXX always enable shader? */
-      if (1) {
-         sp_push_quad_first( sp, sp->quad[i].shade, i );
-      }
-
-      if (early_depth_test) {
-         sp_build_depth_stencil( sp, i );
-         sp_push_quad_first( sp, sp->quad[i].earlyz, i );
-      }
-
-#if !USE_DRAW_STAGE_PSTIPPLE
-      if (sp->rasterizer->poly_stipple_enable) {
-         sp_push_quad_first( sp, sp->quad[i].polygon_stipple, i );
-      }
-#endif
+   if (early_depth_test) {   /* resulting order: depth_test, shade, blend */
+      sp_push_quad_first( sp, sp->quad.shade );
+      sp_push_quad_first( sp, sp->quad.depth_test );
+   }
+   else {   /* resulting order: shade, depth_test, blend */
+      sp_push_quad_first( sp, sp->quad.depth_test );
+      sp_push_quad_first( sp, sp->quad.shade );
    }
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_pipe.h b/src/gallium/drivers/softpipe/sp_quad_pipe.h
index 0e40586ffc8..c0aa1348319 100644
--- a/src/gallium/drivers/softpipe/sp_quad_pipe.h
+++ b/src/gallium/drivers/softpipe/sp_quad_pipe.h
@@ -49,7 +49,7 @@ struct quad_stage {
    void (*begin)(struct quad_stage *qs);
 
    /** the stage action */
-   void (*run)(struct quad_stage *qs, struct quad_header *quad);
+   void (*run)(struct quad_stage *qs, struct quad_header *quad[], unsigned nr);
 
    void (*destroy)(struct quad_stage *qs);
 };
@@ -69,6 +69,4 @@ struct quad_stage *sp_quad_output_stage( struct softpipe_context *softpipe );
 
 void sp_build_quad_pipeline(struct softpipe_context *sp);
 
-void sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad);
-
 #endif /* SP_QUAD_PIPE_H */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c
deleted file mode 100644
index 5e9d447737d..00000000000
--- a/src/gallium/drivers/softpipe/sp_quad_stencil.c
+++ /dev/null
@@ -1,352 +0,0 @@
-
-/**
- * \brief Quad stencil testing
- */
-
-
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_tile_cache.h"
-#include "sp_quad_pipe.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-
-
-/** Only 8-bit stencil supported */
-#define STENCIL_MAX 0xff
-
-
-/**
- * Do the basic stencil test (compare stencil buffer values against the
- * reference value.
- *
- * \param stencilVals  the stencil values from the stencil buffer
- * \param func  the stencil func (PIPE_FUNC_x)
- * \param ref  the stencil reference value
- * \param valMask  the stencil value mask indicating which bits of the stencil
- *                 values and ref value are to be used.
- * \return mask indicating which pixels passed the stencil test
- */
-static unsigned
-do_stencil_test(const ubyte stencilVals[QUAD_SIZE], unsigned func,
-                unsigned ref, unsigned valMask)
-{
-   unsigned passMask = 0x0;
-   unsigned j;
-
-   ref &= valMask;
-
-   switch (func) {
-   case PIPE_FUNC_NEVER:
-      /* passMask = 0x0 */
-      break;
-   case PIPE_FUNC_LESS:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref < (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_EQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref == (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_LEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref <= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GREATER:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref > (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref != (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_GEQUAL:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (ref >= (stencilVals[j] & valMask)) {
-            passMask |= (1 << j);
-         }
-      }
-      break;
-   case PIPE_FUNC_ALWAYS:
-      passMask = MASK_ALL;
-      break;
-   default:
-      assert(0);
-   }
-
-   return passMask;
-}
-
-
-/**
- * Apply the stencil operator to stencil values.
- *
- * \param stencilVals  the stencil buffer values (read and written)
- * \param mask  indicates which pixels to update
- * \param op  the stencil operator (PIPE_STENCIL_OP_x)
- * \param ref  the stencil reference value
- * \param wrtMask  writemask controlling which bits are changed in the
- *                 stencil values
- */
-static void
-apply_stencil_op(ubyte stencilVals[QUAD_SIZE],
-                 unsigned mask, unsigned op, ubyte ref, ubyte wrtMask)
-{
-   unsigned j;
-   ubyte newstencil[QUAD_SIZE];
-
-   for (j = 0; j < QUAD_SIZE; j++) {
-      newstencil[j] = stencilVals[j];
-   }
-
-   switch (op) {
-   case PIPE_STENCIL_OP_KEEP:
-      /* no-op */
-      break;
-   case PIPE_STENCIL_OP_ZERO:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = 0;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_REPLACE:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ref;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] < STENCIL_MAX) {
-               newstencil[j] = stencilVals[j] + 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            if (stencilVals[j] > 0) {
-               newstencil[j] = stencilVals[j] - 1;
-            }
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INCR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] + 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_DECR_WRAP:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = stencilVals[j] - 1;
-         }
-      }
-      break;
-   case PIPE_STENCIL_OP_INVERT:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         if (mask & (1 << j)) {
-            newstencil[j] = ~stencilVals[j];
-         }
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /*
-    * update the stencil values
-    */
-   if (wrtMask != STENCIL_MAX) {
-      /* apply bit-wise stencil buffer writemask */
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = (wrtMask & newstencil[j]) | (~wrtMask & stencilVals[j]);
-      }
-   }
-   else {
-      for (j = 0; j < QUAD_SIZE; j++) {
-         stencilVals[j] = newstencil[j];
-      }
-   }
-}
-
-
-/**
- * Do stencil (and depth) testing.  Stenciling depends on the outcome of
- * depth testing.
- */
-static void
-stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-   struct pipe_surface *ps = softpipe->framebuffer.zsbuf;
-   unsigned func, zFailOp, zPassOp, failOp;
-   ubyte ref, wrtMask, valMask;
-   ubyte stencilVals[QUAD_SIZE];
-   struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
-   uint j;
-   uint face = quad->input.facing;
-
-   if (!softpipe->depth_stencil->stencil[1].enabled) {
-      /* single-sided stencil test, use front (face=0) state */
-      face = 0;
-   }
-
-   /* choose front or back face function, operator, etc */
-   /* XXX we could do these initializations once per primitive */
-   func    = softpipe->depth_stencil->stencil[face].func;
-   failOp  = softpipe->depth_stencil->stencil[face].fail_op;
-   zFailOp = softpipe->depth_stencil->stencil[face].zfail_op;
-   zPassOp = softpipe->depth_stencil->stencil[face].zpass_op;
-   ref     = softpipe->depth_stencil->stencil[face].ref_value;
-   wrtMask = softpipe->depth_stencil->stencil[face].writemask;
-   valMask = softpipe->depth_stencil->stencil[face].valuemask;
-
-   assert(ps); /* shouldn't get here if there's no stencil buffer */
-
-   /* get stencil values from cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] >> 24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.depth32[y][x] & 0xff;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         stencilVals[j] = tile->data.stencil8[y][x];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   /* do the stencil test first */
-   {
-      unsigned passMask, failMask;
-      passMask = do_stencil_test(stencilVals, func, ref, valMask);
-      failMask = quad->inout.mask & ~passMask;
-      quad->inout.mask &= passMask;
-
-      if (failOp != PIPE_STENCIL_OP_KEEP) {
-         apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask);
-      }
-   }
-
-   if (quad->inout.mask) {
-      /* now the pixels that passed the stencil test are depth tested */
-      if (softpipe->depth_stencil->depth.enabled) {
-         const unsigned origMask = quad->inout.mask;
-
-         sp_depth_test_quad(qs, quad);  /* quad->mask is updated */
-
-         /* update stencil buffer values according to z pass/fail result */
-         if (zFailOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned failMask = origMask & ~quad->inout.mask;
-            apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask);
-         }
-
-         if (zPassOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned passMask = origMask & quad->inout.mask;
-            apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask);
-         }
-      }
-      else {
-         /* no depth test, apply Zpass operator to stencil buffer values */
-         apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask);
-      }
-
-   }
-
-   /* put new stencil values into cached tile */
-   switch (ps->format) {
-   case PIPE_FORMAT_S8Z24_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint s8z24 = tile->data.depth32[y][x];
-         s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff);
-         tile->data.depth32[y][x] = s8z24;
-      }
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         uint z24s8 = tile->data.depth32[y][x];
-         z24s8 = (z24s8 & 0xffffff00) | stencilVals[j];
-         tile->data.depth32[y][x] = z24s8;
-      }
-      break;
-   case PIPE_FORMAT_S8_UNORM:
-      for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->input.x0 % TILE_SIZE + (j & 1);
-         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
-         tile->data.stencil8[y][x] = stencilVals[j];
-      }
-      break;
-   default:
-      assert(0);
-   }
-
-   if (quad->inout.mask)
-      qs->next->run(qs->next, quad);
-}
-
-
-static void stencil_begin(struct quad_stage *qs)
-{
-   qs->next->begin(qs->next);
-}
-
-
-static void stencil_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_stencil_test_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = stencil_begin;
-   stage->run = stencil_test_quad;
-   stage->destroy = stencil_destroy;
-
-   return stage;
-}
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
index 07162db7b6e..a0527a596a6 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stipple.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -14,14 +14,20 @@
  * Apply polygon stipple to quads produced by triangle rasterization
  */
 static void
-stipple_quad(struct quad_stage *qs, struct quad_header *quad)
+stipple_quad(struct quad_stage *qs, struct quad_header *quads[], unsigned nr)
 {
    static const uint bit31 = 1 << 31;
    static const uint bit30 = 1 << 30;
+   unsigned pass = 0;   /* number of quads that still have a non-zero mask */
+
+   struct softpipe_context *softpipe = qs->softpipe;
+   unsigned q;
+
+   /* Quads that survive the stipple test are compacted to the front of quads[]. */
+
+   for (q = 0; q < nr; q++)  {
+      struct quad_header *quad = quads[q];
 
-   if (quad->input.prim == QUAD_PRIM_TRI) {
-      struct softpipe_context *softpipe = qs->softpipe;
-      /* need to invert Y to index into OpenGL's stipple pattern */
       const int col0 = quad->input.x0 % 32;
       const int y0 = quad->input.y0;
       const int y1 = y0 + 1;
@@ -41,13 +47,11 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
       if ((stipple1 & (bit30 >> col0)) == 0)
          quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
 
-      if (!quad->inout.mask) {
-         /* all fragments failed stipple test, end of quad pipeline */
-         return;
-      }
+      if (quad->inout.mask)   /* keep the quad only if some pixel is still enabled */
+         quads[pass++] = quad;
    }
 
-   qs->next->run(qs->next, quad);
+   qs->next->run(qs->next, quads, pass);   /* forward only the surviving quads */
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
index ce770184158..81fb7aa20c6 100644
--- a/src/gallium/drivers/softpipe/sp_screen.c
+++ b/src/gallium/drivers/softpipe/sp_screen.c
@@ -40,7 +40,7 @@
 static const char *
 softpipe_get_vendor(struct pipe_screen *screen)
 {
-   return "Tungsten Graphics, Inc.";
+   return "VMware, Inc.";   /* constant vendor string; screen is not consulted */
 }
 
 
@@ -82,7 +82,7 @@ softpipe_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
-      return 8;  /* max 128x128x128 */
+      return 9;  /* max 256x256x256 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
       return 13; /* max 4Kx4K */
    case PIPE_CAP_TGSI_CONT_SUPPORTED:
@@ -135,6 +135,9 @@ softpipe_is_format_supported( struct pipe_screen *screen,
           target == PIPE_TEXTURE_CUBE);
 
    switch(format) {
+   case PIPE_FORMAT_L16_UNORM:
+   case PIPE_FORMAT_YCBCR_REV:
+   case PIPE_FORMAT_YCBCR:
    case PIPE_FORMAT_DXT1_RGB:
    case PIPE_FORMAT_DXT1_RGBA:
    case PIPE_FORMAT_DXT3_RGBA:
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index de3ae3c3696..ade125662ad 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -33,7 +33,6 @@
  */
 
 #include "sp_context.h"
-#include "sp_prim_setup.h"
 #include "sp_quad.h"
 #include "sp_quad_pipe.h"
 #include "sp_setup.h"
@@ -61,87 +60,9 @@ struct edge {
    int lines;		/**< number of lines on this edge */
 };
 
-#if SP_NUM_QUAD_THREADS > 1
 
-/* Set to 1 if you want other threads to be instantly
- * notified of pending jobs.
- */
-#define INSTANT_NOTEMPTY_NOTIFY 0
-
-struct thread_info
-{
-   struct setup_context *setup;
-   uint id;
-   pipe_thread handle;
-};
-
-struct quad_job;
-
-typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job );
-
-struct quad_job
-{
-   struct quad_header_input input;
-   struct quad_header_inout inout;
-   quad_job_routine routine;
-};
-
-#define NUM_QUAD_JOBS 64
-
-struct quad_job_que
-{
-   struct quad_job jobs[NUM_QUAD_JOBS];
-   uint first;
-   uint last;
-   pipe_mutex que_mutex;
-   pipe_condvar que_notfull_condvar;
-   pipe_condvar que_notempty_condvar;
-   uint jobs_added;
-   uint jobs_done;
-   pipe_condvar que_done_condvar;
-};
-
-static void
-add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine )
-{
-#if INSTANT_NOTEMPTY_NOTIFY
-   boolean empty;
-#endif
-
-   /* Wait for empty slot, see if the que is empty.
-    */
-   pipe_mutex_lock( que->que_mutex );
-   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) {
-#if !INSTANT_NOTEMPTY_NOTIFY
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-#endif
-      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex );
-   }
-#if INSTANT_NOTEMPTY_NOTIFY
-   empty = que->last == que->first;
-#endif
-   que->jobs_added++;
-   pipe_mutex_unlock( que->que_mutex );
+#define MAX_QUADS 16   /* size of the quad[] and quad_ptrs[] arrays in struct setup_context */
 
-   /* Submit new job.
-    */
-   que->jobs[que->last].input = quad->input;
-   que->jobs[que->last].inout = quad->inout;
-   que->jobs[que->last].routine = routine;
-   que->last = (que->last + 1) % NUM_QUAD_JOBS;
-
-#if INSTANT_NOTEMPTY_NOTIFY
-   /* If the que was empty, notify consumers there's a job to be done.
-    */
-   if (empty) {
-      pipe_mutex_lock( que->que_mutex );
-      pipe_condvar_broadcast( que->que_notempty_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-#endif
-}
-
-#endif
 
 /**
  * Triangle setup info (derived from draw_stage).
@@ -164,22 +85,19 @@ struct setup_context {
    struct edge emaj;
 
    float oneoverarea;
+   int facing;
+
+   struct quad_header quad[MAX_QUADS];
+   struct quad_header *quad_ptrs[MAX_QUADS];
+   unsigned count;
 
    struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
    struct tgsi_interp_coef posCoef;  /* For Z, W */
-   struct quad_header quad;
-
-#if SP_NUM_QUAD_THREADS > 1
-   struct quad_job_que que;
-   struct thread_info threads[SP_NUM_QUAD_THREADS];
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
       int right[2];
       int y;
-      unsigned y_flags;
-      unsigned mask;     /**< mask of MASK_BOTTOM/TOP_LEFT/RIGHT bits */
    } span;
 
 #if DEBUG_FRAGS
@@ -190,67 +108,6 @@ struct setup_context {
    unsigned winding;		/* which winding to cull */
 };
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static PIPE_THREAD_ROUTINE( quad_thread, param )
-{
-   struct thread_info *info = (struct thread_info *) param;
-   struct quad_job_que *que = &info->setup->que;
-
-   for (;;) {
-      struct quad_job job;
-      boolean full;
-
-      /* Wait for an available job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      while (que->last == que->first)
-         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex );
-
-      /* See if the que is full.
-       */
-      full = (que->last + 1) % NUM_QUAD_JOBS == que->first;
-
-      /* Take a job and remove it from que.
-       */
-      job = que->jobs[que->first];
-      que->first = (que->first + 1) % NUM_QUAD_JOBS;
-
-      /* Notify the producer if the que is not full.
-       */
-      if (full)
-         pipe_condvar_signal( que->que_notfull_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-
-      job.routine( info->setup, info->id, &job );
-
-      /* Notify the producer if that's the last finished job.
-       */
-      pipe_mutex_lock( que->que_mutex );
-      que->jobs_done++;
-      if (que->jobs_added == que->jobs_done)
-         pipe_condvar_signal( que->que_done_condvar );
-      pipe_mutex_unlock( que->que_mutex );
-   }
-
-   return NULL;
-}
-
-#define WAIT_FOR_COMPLETION(setup) \
-   do {\
-      pipe_mutex_lock( setup->que.que_mutex );\
-      if (!INSTANT_NOTEMPTY_NOTIFY)\
-         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\
-      while (setup->que.jobs_added != setup->que.jobs_done)\
-         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\
-      pipe_mutex_unlock( setup->que.que_mutex );\
-   } while (0)
-
-#else
-
-#define WAIT_FOR_COMPLETION(setup) ((void) 0)
-
-#endif
 
 
 
@@ -313,98 +170,18 @@ quad_clip( struct setup_context *setup, struct quad_header *quad )
  * Emit a quad (pass to next stage) with clipping.
  */
 static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
 {
    quad_clip( setup, quad );
+   /* Forward the quad only if some mask bit is still set after quad_clip(). */
    if (quad->inout.mask) {
       struct softpipe_context *sp = setup->softpipe;
 
-      sp->quad[thread].first->run( sp->quad[thread].first, quad );
+      sp->quad.first->run( sp->quad.first, &quad, 1 );   /* run() takes an array of quad pointers: pass a batch of one */
    }
 }
 
-#if SP_NUM_QUAD_THREADS > 1
-
-static void
-clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   clip_emit_quad( setup, &quad, thread );
-}
-
-#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job )
-
-#else
-
-#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 )
-
-#endif
-
-/**
- * Emit a quad (pass to next stage).  No clipping is done.
- */
-static INLINE void
-emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
-{
-   struct softpipe_context *sp = setup->softpipe;
-#if DEBUG_FRAGS
-   uint mask = quad->inout.mask;
-#endif
-
-#if DEBUG_FRAGS
-   if (mask & 1) setup->numFragsEmitted++;
-   if (mask & 2) setup->numFragsEmitted++;
-   if (mask & 4) setup->numFragsEmitted++;
-   if (mask & 8) setup->numFragsEmitted++;
-#endif
-   sp->quad[thread].first->run( sp->quad[thread].first, quad );
-#if DEBUG_FRAGS
-   mask = quad->inout.mask;
-   if (mask & 1) setup->numFragsWritten++;
-   if (mask & 2) setup->numFragsWritten++;
-   if (mask & 4) setup->numFragsWritten++;
-   if (mask & 8) setup->numFragsWritten++;
-#endif
-}
-
-#if SP_NUM_QUAD_THREADS > 1
 
-static void
-emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
-{
-   struct quad_header quad;
-
-   quad.input = job->input;
-   quad.inout = job->inout;
-   quad.coef = setup->quad.coef;
-   quad.posCoef = setup->quad.posCoef;
-   quad.nr_attrs = setup->quad.nr_attrs;
-   emit_quad( setup, &quad, thread );
-}
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      add_quad_job( &setup->que, &setup->quad, emit_quad_job );\
-   } while (0)
-
-#else
-
-#define EMIT_QUAD(setup,x,y,mask) do {\
-      setup->quad.input.x0 = x;\
-      setup->quad.input.y0 = y;\
-      setup->quad.inout.mask = mask;\
-      emit_quad( setup, &setup->quad, 0 );\
-   } while (0)
-
-#endif
 
 /**
  * Given an X or Y coordinate, return the block/quad coordinate that it
@@ -412,7 +189,12 @@ emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
  */
 static INLINE int block( int x )
 {
-   return x & ~1;
+   return x & ~(2-1);   /* clear bit 0, i.e. the nearest multiple of 2 at or below x */
+}
+
+static INLINE int block_x( int x )
+{
+   return x & ~(16-1);   /* clear the low 4 bits, i.e. the nearest multiple of 16 at or below x */
 }
 
 
@@ -421,72 +203,63 @@ static INLINE int block( int x )
  */
 static void flush_spans( struct setup_context *setup )
 {
+   const int step = 16;   /* pixels of span width handled per pipe->run() call */
    const int xleft0 = setup->span.left[0];
    const int xleft1 = setup->span.left[1];
    const int xright0 = setup->span.right[0];
    const int xright1 = setup->span.right[1];
-   int minleft, maxright;
+   struct quad_stage *pipe = setup->softpipe->quad.first;   /* first stage of the quad pipeline */
+
+   /* Walk the union of both rows' spans in step-pixel chunks, starting 16-aligned. */
+   int minleft = block_x(MIN2(xleft0, xleft1));
+   int maxright = MAX2(xright0, xright1);
    int x;
 
-   switch (setup->span.y_flags) {
-   case 0x3:
-      /* both odd and even lines written (both quad rows) */
-      minleft = block(MIN2(xleft0, xleft1));
-      maxright = block(MAX2(xright0, xright1));
-      for (x = minleft; x <= maxright; x += 2) {
-         /* determine which of the four pixels is inside the span bounds */
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x1:
-      /* only even line written (quad top row) */
-      minleft = block(xleft0);
-      maxright = block(xright0);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft0 && x < xright0)
-            mask |= MASK_TOP_LEFT;
-         if (x+1 >= xleft0 && x+1 < xright0)
-            mask |= MASK_TOP_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
-
-   case 0x2:
-      /* only odd line written (quad bottom row) */
-      minleft = block(xleft1);
-      maxright = block(xright1);
-      for (x = minleft; x <= maxright; x += 2) {
-         uint mask = 0x0;
-         if (x >= xleft1 && x < xright1)
-            mask |= MASK_BOTTOM_LEFT;
-         if (x+1 >= xleft1 && x+1 < xright1)
-            mask |= MASK_BOTTOM_RIGHT;
-         if (mask)
-            EMIT_QUAD( setup, x, setup->span.y, mask );
-      }
-      break;
+   for (x = minleft; x < maxright; x += step) {
+      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);           /* chunk pixels left of row 0's span start */
+      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);           /* same for row 1 */
+      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);  /* chunk pixels at or past row 0's span end */
+      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);  /* same for row 1 */
+      unsigned lx = x;   /* x0 of the quad currently being examined */
+      unsigned q = 0;    /* quads collected for this chunk so far */
 
-   default:
-      return;
+      unsigned skipmask_left0 = (1U << skip_left0) - 1U;   /* low skip_left0 bits set */
+      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
+
+      /* NOTE: with step == 32 and skip_right == 0 the shift count would be 32, which is
+       * undefined for a 32-bit unsigned; step must stay below the width of unsigned. */
+      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
+      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
+
+      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;   /* bit i set => pixel x+i is inside row 0's span */
+      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;   /* bit i set => pixel x+i is inside row 1's span */
+
+      if (mask0 | mask1) {
+         do {
+            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);   /* bits 0-1: row 0 pixels, bits 2-3: row 1 pixels */
+            if (quadmask) {
+               setup->quad[q].input.x0 = lx;
+               setup->quad[q].input.y0 = setup->span.y;
+               setup->quad[q].input.facing = setup->facing;
+               setup->quad[q].inout.mask = quadmask;
+               setup->quad_ptrs[q] = &setup->quad[q];
+               q++;
+            }
+            mask0 >>= 2;   /* consume the two pixels of each row just handled */
+            mask1 >>= 2;
+            lx += 2;       /* advance to the next 2-pixel-wide quad */
+         } while (mask0 | mask1);
+
+         pipe->run( pipe, setup->quad_ptrs, q );   /* submit this chunk's quads as one batch */
+      }
    }
 
+
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
+   setup->span.left[0] = 1000000;     /* greater than right[0], so row 0 yields an empty mask */
+   setup->span.left[1] = 1000000;     /* greater than right[1], so row 1 yields an empty mask */
 }
 
 
@@ -496,7 +269,7 @@ static void print_vertex(const struct setup_context *setup,
 {
    int i;
    debug_printf("   Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad.nr_attrs; i++) {
+   for (i = 0; i < setup->quad[0].nr_attrs; i++) {
       debug_printf("     %d: %f %f %f %f\n",  i,
               v[i][0], v[i][1], v[i][2], v[i][3]);
       if (util_is_inf_or_nan(v[i][0])) {
@@ -601,7 +374,9 @@ static boolean setup_sort_vertices( struct setup_context *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup->facing = 
+      ((det > 0.0) ^ 
+       (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW));
 
    return TRUE;
 }
@@ -788,7 +563,7 @@ static void setup_tri_coefficients( struct setup_context *setup )
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -844,11 +619,10 @@ static void subtriangle( struct setup_context *setup,
 
    /* clip top/bottom */
    start_y = sy;
-   finish_y = sy + lines;
-
    if (start_y < miny)
       start_y = miny;
 
+   finish_y = sy + lines;
    if (finish_y > maxy)
       finish_y = maxy;
 
@@ -885,7 +659,6 @@ static void subtriangle( struct setup_context *setup,
 
          setup->span.left[_y&1] = left;
          setup->span.right[_y&1] = right;
-         setup->span.y_flags |= 1<<(_y&1);
       }
    }
 
@@ -958,10 +731,9 @@ void setup_tri( struct setup_context *setup,
    setup_tri_coefficients( setup );
    setup_tri_edges( setup );
 
-   setup->quad.input.prim = QUAD_PRIM_TRI;
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
 
    setup->span.y = 0;
-   setup->span.y_flags = 0;
    setup->span.right[0] = 0;
    setup->span.right[1] = 0;
    /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
@@ -983,8 +755,6 @@ void setup_tri( struct setup_context *setup,
 
    flush_spans( setup );
 
-   WAIT_FOR_COMPLETION(setup);
-
 #if DEBUG_FRAGS
    printf("Tri: %u frags emitted, %u written\n",
           setup->numFragsEmitted,
@@ -1101,7 +871,7 @@ setup_line_coefficients(struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
@@ -1122,20 +892,20 @@ plot(struct setup_context *setup, int x, int y)
    const int quadY = y - iy;
    const int mask = (1 << ix) << (2 * iy);
 
-   if (quadX != setup->quad.input.x0 ||
-       quadY != setup->quad.input.y0)
+   if (quadX != setup->quad[0].input.x0 ||
+       quadY != setup->quad[0].input.y0)
    {
       /* flush prev quad, start new quad */
 
-      if (setup->quad.input.x0 != -1)
-         CLIP_EMIT_QUAD(setup);
+      if (setup->quad[0].input.x0 != -1)
+         clip_emit_quad( setup, &setup->quad[0] );
 
-      setup->quad.input.x0 = quadX;
-      setup->quad.input.y0 = quadY;
-      setup->quad.inout.mask = 0x0;
+      setup->quad[0].input.x0 = quadX;
+      setup->quad[0].input.y0 = quadY;
+      setup->quad[0].inout.mask = 0x0;
    }
 
-   setup->quad.inout.mask |= mask;
+   setup->quad[0].inout.mask |= mask;
 }
 
 
@@ -1195,17 +965,18 @@ setup_line(struct setup_context *setup,
 
    assert(dx >= 0);
    assert(dy >= 0);
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_LINES);
+
+   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
+   setup->quad[0].inout.mask = 0x0;
 
-   setup->quad.input.x0 = setup->quad.input.y0 = -1;
-   setup->quad.inout.mask = 0x0;
-   setup->quad.input.prim = QUAD_PRIM_LINE;
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
-   setup->quad.input.coverage[0] =
-   setup->quad.input.coverage[1] =
-   setup->quad.input.coverage[2] =
-   setup->quad.input.coverage[3] = 1.0;
+   setup->quad[0].input.coverage[0] =
+   setup->quad[0].input.coverage[1] =
+   setup->quad[0].input.coverage[2] =
+   setup->quad[0].input.coverage[3] = 1.0;
 
    if (dx > dy) {
       /*** X-major line ***/
@@ -1249,11 +1020,9 @@ setup_line(struct setup_context *setup,
    }
 
    /* draw final quad */
-   if (setup->quad.inout.mask) {
-      CLIP_EMIT_QUAD(setup);
+   if (setup->quad[0].inout.mask) {
+      clip_emit_quad( setup, &setup->quad[0] );
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 
@@ -1300,6 +1069,8 @@ setup_point( struct setup_context *setup,
    if (softpipe->no_rast)
       return;
 
+   assert(setup->softpipe->reduced_prim == PIPE_PRIM_POINTS);
+
    /* For points, all interpolants are constant-valued.
     * However, for point sprites, we'll need to setup texcoords appropriately.
     * XXX: which coefficients are the texcoords???
@@ -1346,22 +1117,21 @@ setup_point( struct setup_context *setup,
       }
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef[fragSlot].a0[0] = 1.0f - setup->quad.input.facing;
+         setup->coef[fragSlot].a0[0] = 1.0f - setup->facing;
          setup->coef[fragSlot].dadx[0] = 0.0;
          setup->coef[fragSlot].dady[0] = 0.0;
       }
    }
 
-   setup->quad.input.prim = QUAD_PRIM_POINT;
 
    if (halfSize <= 0.5 && !round) {
       /* special case for 1-pixel points */
       const int ix = ((int) x) & 1;
       const int iy = ((int) y) & 1;
-      setup->quad.input.x0 = (int) x - ix;
-      setup->quad.input.y0 = (int) y - iy;
-      setup->quad.inout.mask = (1 << ix) << (2 * iy);
-      CLIP_EMIT_QUAD(setup);
+      setup->quad[0].input.x0 = (int) x - ix;
+      setup->quad[0].input.y0 = (int) y - iy;
+      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
+      clip_emit_quad( setup, &setup->quad[0] );
    }
    else {
       if (round) {
@@ -1381,15 +1151,15 @@ setup_point( struct setup_context *setup,
             for (ix = ixmin; ix <= ixmax; ix += 2) {
                float dx, dy, dist2, cover;
 
-               setup->quad.inout.mask = 0x0;
+               setup->quad[0].inout.mask = 0x0;
 
                dx = (ix + 0.5f) - x;
                dy = (iy + 0.5f) - y;
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_LEFT;
+                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1397,8 +1167,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_TOP_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
                }
 
                dx = (ix + 0.5f) - x;
@@ -1406,8 +1176,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1415,14 +1185,14 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT;
+                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
                }
 
-               if (setup->quad.inout.mask) {
-                  setup->quad.input.x0 = ix;
-                  setup->quad.input.y0 = iy;
-                  CLIP_EMIT_QUAD(setup);
+               if (setup->quad[0].inout.mask) {
+                  setup->quad[0].input.x0 = ix;
+                  setup->quad[0].input.y0 = iy;
+                  clip_emit_quad( setup, &setup->quad[0] );
                }
             }
          }
@@ -1466,33 +1236,25 @@ setup_point( struct setup_context *setup,
                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
                }
 
-               setup->quad.inout.mask = mask;
-               setup->quad.input.x0 = ix;
-               setup->quad.input.y0 = iy;
-               CLIP_EMIT_QUAD(setup);
+               setup->quad[0].inout.mask = mask;
+               setup->quad[0].input.x0 = ix;
+               setup->quad[0].input.y0 = iy;
+               clip_emit_quad( setup, &setup->quad[0] );
             }
          }
       }
    }
-
-   WAIT_FOR_COMPLETION(setup);
 }
 
 void setup_prepare( struct setup_context *setup )
 {
    struct softpipe_context *sp = setup->softpipe;
-   unsigned i;
 
    if (sp->dirty) {
       softpipe_update_derived(sp);
    }
 
-   /* Note: nr_attrs is only used for debugging (vertex printing) */
-   setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw);
-
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      sp->quad[i].first->begin( sp->quad[i].first );
-   }
+   sp->quad.first->begin( sp->quad.first );
 
    if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        sp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
@@ -1520,30 +1282,17 @@ void setup_destroy_context( struct setup_context *setup )
 struct setup_context *setup_create_context( struct softpipe_context *softpipe )
 {
    struct setup_context *setup = CALLOC_STRUCT(setup_context);
-#if SP_NUM_QUAD_THREADS > 1
-   uint i;
-#endif
+   unsigned i;
 
    setup->softpipe = softpipe;
 
-   setup->quad.coef = setup->coef;
-   setup->quad.posCoef = &setup->posCoef;
-
-#if SP_NUM_QUAD_THREADS > 1
-   setup->que.first = 0;
-   setup->que.last = 0;
-   pipe_mutex_init( setup->que.que_mutex );
-   pipe_condvar_init( setup->que.que_notfull_condvar );
-   pipe_condvar_init( setup->que.que_notempty_condvar );
-   setup->que.jobs_added = 0;
-   setup->que.jobs_done = 0;
-   pipe_condvar_init( setup->que.que_done_condvar );
-   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
-      setup->threads[i].setup = setup;
-      setup->threads[i].id = i;
-      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] );
+   for (i = 0; i < MAX_QUADS; i++) {
+      setup->quad[i].coef = setup->coef;
+      setup->quad[i].posCoef = &setup->posCoef;
    }
-#endif
+
+   setup->span.left[0] = 1000000;     /* greater than right[0] */
+   setup->span.left[1] = 1000000;     /* greater than right[1] */
 
    return setup;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state.h b/src/gallium/drivers/softpipe/sp_state.h
index 9776e978e3e..77ee3c1136b 100644
--- a/src/gallium/drivers/softpipe/sp_state.h
+++ b/src/gallium/drivers/softpipe/sp_state.h
@@ -87,6 +87,7 @@ struct sp_fragment_shader {
 struct sp_vertex_shader {
    struct pipe_shader_state shader;
    struct draw_vertex_shader *draw_data;
+   int max_sampler;             /* -1 if no samplers */
 };
 
 
diff --git a/src/gallium/drivers/softpipe/sp_state_blend.c b/src/gallium/drivers/softpipe/sp_state_blend.c
index 384fe559afd..efed082f823 100644
--- a/src/gallium/drivers/softpipe/sp_state_blend.c
+++ b/src/gallium/drivers/softpipe/sp_state_blend.c
@@ -45,7 +45,7 @@ void softpipe_bind_blend_state( struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->blend = (const struct pipe_blend_state *)blend;
+   softpipe->blend = (struct pipe_blend_state *)blend;
 
    softpipe->dirty |= SP_NEW_BLEND;
 }
@@ -86,7 +86,7 @@ softpipe_bind_depth_stencil_state(struct pipe_context *pipe,
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->depth_stencil = (const struct pipe_depth_stencil_alpha_state *)depth_stencil;
+   softpipe->depth_stencil = (struct pipe_depth_stencil_alpha_state *)depth_stencil;
 
    softpipe->dirty |= SP_NEW_DEPTH_STENCIL_ALPHA;
 }
diff --git a/src/gallium/drivers/softpipe/sp_state_derived.c b/src/gallium/drivers/softpipe/sp_state_derived.c
index 9258f641091..1faeca1c2a3 100644
--- a/src/gallium/drivers/softpipe/sp_state_derived.c
+++ b/src/gallium/drivers/softpipe/sp_state_derived.c
@@ -32,7 +32,10 @@
 #include "draw/draw_vertex.h"
 #include "draw/draw_private.h"
 #include "sp_context.h"
+#include "sp_screen.h"
 #include "sp_state.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
 
 
 /**
@@ -65,24 +68,19 @@ softpipe_get_vertex_info(struct softpipe_context *softpipe)
       const struct sp_fragment_shader *spfs = softpipe->fs;
       const enum interp_mode colorInterp
          = softpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
+      const uint num = draw_num_vs_outputs(softpipe->draw);
       uint i;
 
-      if (softpipe->vbuf) {
-         /* if using the post-transform vertex buffer, tell draw_vbuf to
-          * simply emit the whole post-xform vertex as-is:
-          */
-         struct vertex_info *vinfo_vbuf = &softpipe->vertex_info_vbuf;
-         const uint num = draw_num_vs_outputs(softpipe->draw);
-         uint i;
-
-         /* No longer any need to try and emit draw vertex_header info.
-          */
-         vinfo_vbuf->num_attribs = 0;
-         for (i = 0; i < num; i++) {
-            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-         }
-         draw_compute_vertex_size(vinfo_vbuf);
+      /* Tell draw_vbuf to simply emit the whole post-xform vertex
+       * as-is.  No longer any need to try and emit draw vertex_header
+       * info.
+       */
+      vinfo_vbuf->num_attribs = 0;
+      for (i = 0; i < num; i++) {
+	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
+      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
        * Loop over fragment shader inputs, searching for the matching output
@@ -181,11 +179,19 @@ softpipe_get_vbuf_vertex_info(struct softpipe_context *softpipe)
 static void
 compute_cliprect(struct softpipe_context *sp)
 {
+   /* SP_NEW_FRAMEBUFFER
+    */
    uint surfWidth = sp->framebuffer.width;
    uint surfHeight = sp->framebuffer.height;
 
+   /* SP_NEW_RASTERIZER
+    */
    if (sp->rasterizer->scissor) {
-      /* clip to scissor rect */
+
+      /* SP_NEW_SCISSOR
+       *
+       * clip to scissor rect:
+       */
       sp->cliprect.minx = MAX2(sp->scissor.minx, 0);
       sp->cliprect.miny = MAX2(sp->scissor.miny, 0);
       sp->cliprect.maxx = MIN2(sp->scissor.maxx, surfWidth);
@@ -201,27 +207,63 @@ compute_cliprect(struct softpipe_context *sp)
 }
 
 
+static void
+update_tgsi_samplers( struct softpipe_context *softpipe )
+{
+   unsigned i;
+
+   softpipe_reset_sampler_varients( softpipe );
+
+   for (i = 0; i < PIPE_MAX_SAMPLERS; i++) {
+      struct softpipe_tex_tile_cache *tc = softpipe->tex_cache[i];
+      if (tc->texture) {
+         struct softpipe_texture *spt = softpipe_texture(tc->texture);
+         if (spt->timestamp != tc->timestamp) {
+	    sp_tex_tile_cache_validate_texture( tc );
+            /*
+            _debug_printf("INV %d %d\n", tc->timestamp, spt->timestamp);
+            */
+            tc->timestamp = spt->timestamp;
+         }
+      }
+   }
+}
+
+
 /* Hopefully this will remain quite simple, otherwise need to pull in
  * something like the state tracker mechanism.
  */
 void softpipe_update_derived( struct softpipe_context *softpipe )
 {
+   struct softpipe_screen *sp_screen = softpipe_screen(softpipe->pipe.screen);
+
+   /* Check for updated textures.
+    */
+   if (softpipe->tex_timestamp != sp_screen->timestamp) {
+      softpipe->tex_timestamp = sp_screen->timestamp;
+      softpipe->dirty |= SP_NEW_TEXTURE;
+   }
+      
+   if (softpipe->dirty & (SP_NEW_SAMPLER |
+                          SP_NEW_TEXTURE |
+                          SP_NEW_FS | 
+                          SP_NEW_VS))
+      update_tgsi_samplers( softpipe );
+
    if (softpipe->dirty & (SP_NEW_RASTERIZER |
                           SP_NEW_FS |
                           SP_NEW_VS))
       invalidate_vertex_layout( softpipe );
 
    if (softpipe->dirty & (SP_NEW_SCISSOR |
-                          SP_NEW_DEPTH_STENCIL_ALPHA |
+                          SP_NEW_RASTERIZER |
                           SP_NEW_FRAMEBUFFER))
       compute_cliprect(softpipe);
 
    if (softpipe->dirty & (SP_NEW_BLEND |
                           SP_NEW_DEPTH_STENCIL_ALPHA |
                           SP_NEW_FRAMEBUFFER |
-                          SP_NEW_RASTERIZER |
-                          SP_NEW_FS | 
-			  SP_NEW_QUERY))
+                          SP_NEW_FS))
       sp_build_quad_pipeline(softpipe);
 
    softpipe->dirty = 0;
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 4330c203935..256faa94b84 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -31,9 +31,8 @@
 
 #include "pipe/p_defines.h"
 #include "util/u_memory.h"
-#include "pipe/internal/p_winsys_screen.h"
-#include "pipe/p_shader_tokens.h"
 #include "draw/draw_context.h"
+#include "draw/draw_vs.h"
 #include "tgsi/tgsi_dump.h"
 #include "tgsi/tgsi_scan.h"
 #include "tgsi/tgsi_parse.h"
@@ -51,12 +50,9 @@ softpipe_create_fs_state(struct pipe_context *pipe,
       tgsi_dump(templ->tokens, 0);
 
    /* codegen */
-   state = softpipe_create_fs_llvm( softpipe, templ );
+   state = softpipe_create_fs_sse( softpipe, templ );
    if (!state) {
-      state = softpipe_create_fs_sse( softpipe, templ );
-      if (!state) {
-         state = softpipe_create_fs_exec( softpipe, templ );
-      }
+      state = softpipe_create_fs_exec( softpipe, templ );
    }
 
    assert(state);
@@ -111,6 +107,8 @@ softpipe_create_vs_state(struct pipe_context *pipe,
    if (state->draw_data == NULL) 
       goto fail;
 
+   state->max_sampler = state->draw_data->info.file_max[TGSI_FILE_SAMPLER];
+
    return state;
 
 fail:
@@ -128,7 +126,7 @@ softpipe_bind_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   softpipe->vs = (const struct sp_vertex_shader *)vs;
+   softpipe->vs = (struct sp_vertex_shader *) vs;
 
    draw_bind_vertex_shader(softpipe->draw,
                            (softpipe->vs ? softpipe->vs->draw_data : NULL));
@@ -142,8 +140,7 @@ softpipe_delete_vs_state(struct pipe_context *pipe, void *vs)
 {
    struct softpipe_context *softpipe = softpipe_context(pipe);
 
-   struct sp_vertex_shader *state =
-      (struct sp_vertex_shader *)vs;
+   struct sp_vertex_shader *state = (struct sp_vertex_shader *) vs;
 
    draw_delete_vertex_shader(softpipe->draw, state->draw_data);
    FREE( state );
diff --git a/src/gallium/drivers/softpipe/sp_state_sampler.c b/src/gallium/drivers/softpipe/sp_state_sampler.c
index cb517b02e44..db0b8ab76b1 100644
--- a/src/gallium/drivers/softpipe/sp_state_sampler.c
+++ b/src/gallium/drivers/softpipe/sp_state_sampler.c
@@ -32,21 +32,37 @@
 #include "util/u_memory.h"
 
 #include "draw/draw_context.h"
+#include "draw/draw_context.h"
 
 #include "sp_context.h"
-#include "sp_context.h"
 #include "sp_state.h"
 #include "sp_texture.h"
-#include "sp_tile_cache.h"
-#include "draw/draw_context.h"
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
+struct sp_sampler {
+   struct pipe_sampler_state base;
+   struct sp_sampler_varient *varients;
+   struct sp_sampler_varient *current;
+};
+
+static struct sp_sampler *sp_sampler( struct pipe_sampler_state *sampler )
+{
+   return (struct sp_sampler *)sampler;
+}
+
 
 void *
 softpipe_create_sampler_state(struct pipe_context *pipe,
                               const struct pipe_sampler_state *sampler)
 {
-   return mem_dup(sampler, sizeof(*sampler));
+   struct sp_sampler *sp_sampler = CALLOC_STRUCT(sp_sampler);
+
+   sp_sampler->base = *sampler;
+   sp_sampler->varients = NULL;
+
+   return (void *)sp_sampler;
 }
 
 
@@ -97,7 +113,7 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
       struct pipe_texture *tex = i < num ? texture[i] : NULL;
 
       pipe_texture_reference(&softpipe->texture[i], tex);
-      sp_tile_cache_set_texture(pipe, softpipe->tex_cache[i], tex);
+      sp_tex_tile_cache_set_texture(softpipe->tex_cache[i], tex);
    }
 
    softpipe->num_textures = num;
@@ -106,10 +122,111 @@ softpipe_set_sampler_textures(struct pipe_context *pipe,
 }
 
 
+/**
+ * Find/create an sp_sampler_varient object for sampling the given texture,
+ * sampler and tex unit.
+ *
+ * Note that the tex unit is significant.  We can't re-use a sampler
+ * varient for multiple texture units because the sampler varient contains
+ * the texture object pointer.  If the texture object pointer were stored
+ * somewhere outside the sampler varient, we could re-use samplers for
+ * multiple texture units.
+ */
+static struct sp_sampler_varient *
+get_sampler_varient( unsigned unit,
+                     struct sp_sampler *sampler,
+                     struct pipe_texture *texture,
+                     unsigned processor )
+{
+   struct softpipe_texture *sp_texture = softpipe_texture(texture);
+   struct sp_sampler_varient *v = NULL;
+   union sp_sampler_key key;
+
+   /* if this fails, widen the key.unit field and update this assertion */
+   assert(PIPE_MAX_SAMPLERS <= 16);
+
+   key.bits.target = sp_texture->base.target;
+   key.bits.is_pot = sp_texture->pot;
+   key.bits.processor = processor;
+   key.bits.unit = unit;
+   key.bits.pad = 0;
+
+   if (sampler->current && 
+       key.value == sampler->current->key.value) {
+      v = sampler->current;
+   }
+
+   if (v == NULL) {
+      for (v = sampler->varients; v; v = v->next)
+         if (v->key.value == key.value)
+            break;
+
+      if (v == NULL) {
+         v = sp_create_sampler_varient( &sampler->base, key );
+         v->next = sampler->varients;
+         sampler->varients = v;
+      }
+   }
+   
+   sampler->current = v;
+   return v;
+}
+
+
+
+
+void
+softpipe_reset_sampler_varients(struct softpipe_context *softpipe)
+{
+   int i;
+
+   /* It's a bit hard to build these samplers ahead of time -- don't
+    * really know which samplers are going to be used for vertex and
+    * fragment programs.
+    */
+   for (i = 0; i <= softpipe->vs->max_sampler; i++) {
+      if (softpipe->sampler[i]) {
+         softpipe->tgsi.vert_samplers_list[i] = 
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->sampler[i]),
+                                 softpipe->texture[i],
+                                 TGSI_PROCESSOR_VERTEX );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.vert_samplers_list[i], 
+                                          softpipe->tex_cache[i],
+                                          softpipe->texture[i] );
+      }
+   }
+
+   for (i = 0; i <= softpipe->fs->info.file_max[TGSI_FILE_SAMPLER]; i++) {
+      if (softpipe->sampler[i]) {
+         softpipe->tgsi.frag_samplers_list[i] =
+            get_sampler_varient( i,
+                                 sp_sampler(softpipe->sampler[i]),
+                                 softpipe->texture[i],
+                                 TGSI_PROCESSOR_FRAGMENT );
+
+         sp_sampler_varient_bind_texture( softpipe->tgsi.frag_samplers_list[i], 
+                                          softpipe->tex_cache[i],
+                                          softpipe->texture[i] );
+      }
+   }
+}
+
+
+
 void
 softpipe_delete_sampler_state(struct pipe_context *pipe,
                               void *sampler)
 {
+   struct sp_sampler *sp_sampler = (struct sp_sampler *)sampler;
+   struct sp_sampler_varient *v, *tmp;
+
+   for (v = sp_sampler->varients; v; v = tmp) {
+      tmp = v->next;
+      sp_sampler_varient_destroy(v);
+   }
+
    FREE( sampler );
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_state_surface.c b/src/gallium/drivers/softpipe/sp_state_surface.c
index 7c06d864a75..bc0e2011300 100644
--- a/src/gallium/drivers/softpipe/sp_state_surface.c
+++ b/src/gallium/drivers/softpipe/sp_state_surface.c
@@ -53,10 +53,10 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
       /* check if changing cbuf */
       if (sp->framebuffer.cbufs[i] != fb->cbufs[i]) {
          /* flush old */
-         sp_flush_tile_cache(sp, sp->cbuf_cache[i]);
+         sp_flush_tile_cache(sp->cbuf_cache[i]);
 
          /* assign new */
-         sp->framebuffer.cbufs[i] = fb->cbufs[i];
+         pipe_surface_reference(&sp->framebuffer.cbufs[i], fb->cbufs[i]);
 
          /* update cache */
          sp_tile_cache_set_surface(sp->cbuf_cache[i], fb->cbufs[i]);
@@ -68,58 +68,28 @@ softpipe_set_framebuffer_state(struct pipe_context *pipe,
    /* zbuf changing? */
    if (sp->framebuffer.zsbuf != fb->zsbuf) {
       /* flush old */
-      sp_flush_tile_cache(sp, sp->zsbuf_cache);
+      sp_flush_tile_cache(sp->zsbuf_cache);
 
       /* assign new */
-      sp->framebuffer.zsbuf = fb->zsbuf;
+      pipe_surface_reference(&sp->framebuffer.zsbuf, fb->zsbuf);
 
       /* update cache */
       sp_tile_cache_set_surface(sp->zsbuf_cache, fb->zsbuf);
-   }
-
-#if 0
-   /* XXX combined depth/stencil here */
-
-   /* sbuf changing? */
-   if (sp->framebuffer.sbuf != fb->sbuf) {
-      /* flush old */
-      sp_flush_tile_cache(sp, sp->sbuf_cache_sep);
-
-      /* assign new */
-      sp->framebuffer.sbuf = fb->sbuf;
-
-      /* update cache */
-      if (fb->sbuf != fb->zbuf) {
-         /* separate stencil buf */
-         sp->sbuf_cache = sp->sbuf_cache_sep;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-      else {
-         /* combined depth/stencil */
-         sp->sbuf_cache = sp->zbuf_cache;
-         sp_tile_cache_set_surface(sp->sbuf_cache, fb->sbuf);
-      }
-   }
-#endif
 
-   /* Tell draw module how deep the Z/depth buffer is */
-   {
-      int depth_bits;
-      double mrd;
+      /* Tell draw module how deep the Z/depth buffer is */
       if (sp->framebuffer.zsbuf) {
+         int depth_bits;
+         double mrd;
          depth_bits = pf_get_component_bits(sp->framebuffer.zsbuf->format,
                                             PIPE_FORMAT_COMP_Z);
+         if (depth_bits > 16) {
+            mrd = 0.0000001;
+         }
+         else {
+            mrd = 0.00002;
+         }
+         draw_set_mrd(sp->draw, mrd);
       }
-      else {
-         depth_bits = 0;
-      }
-      if (depth_bits > 16) {
-         mrd = 0.0000001;
-      }
-      else {
-         mrd = 0.00002;
-      }
-      draw_set_mrd(sp->draw, mrd);
    }
 
    sp->framebuffer.width = fb->width;
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.c b/src/gallium/drivers/softpipe/sp_tex_sample.c
index f99a30277dd..c22ee86b66c 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.c
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.c
@@ -31,29 +31,33 @@
  *
  * Authors:
  *   Brian Paul
+ *   Keith Whitwell
  */
 
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
-#include "sp_tex_sample.h"
-#include "sp_tile_cache.h"
 #include "pipe/p_context.h"
 #include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
+#include "sp_quad.h"   /* only for #define QUAD_* tokens */
+#include "sp_tex_sample.h"
+#include "sp_tex_tile_cache.h"
 
 
 
 /*
- * Note, the FRAC macro has to work perfectly.  Otherwise you'll sometimes
- * see 1-pixel bands of improperly weighted linear-filtered textures.
+ * Return fractional part of 'f'.  Used for computing interpolation weights.
+ * Need to be careful with negative values.
+ * Note, if this function isn't perfect you'll sometimes see 1-pixel bands
+ * of improperly weighted linear-filtered textures.
  * The tests/texwrap.c demo is a good test.
- * Also note, FRAC(x) doesn't truly return the fractional part of x for x < 0.
- * Instead, if x < 0 then FRAC(x) = 1 - true_frac(x).
  */
-#define FRAC(f)  ((f) - util_ifloor(f))
+static INLINE float
+frac(float f)
+{
+   return f - util_ifloor(f);
+}
+
 
 
 /**
@@ -100,10 +104,16 @@ lerp_3d(float a, float b, float c,
 
 
 /**
- * If A is a signed integer, A % B doesn't give the right value for A < 0
- * (in terms of texture repeat).  Just casting to unsigned fixes that.
+ * Compute coord % size for repeat wrap modes.
+ * Note that if coord is a signed integer, coord % size doesn't give
+ * the right value for coord < 0 (in terms of texture repeat).  Just
+ * casting to unsigned fixes that.
  */
-#define REMAINDER(A, B) ((unsigned) (A) % (unsigned) (B))
+static INLINE int
+repeat(int coord, unsigned size)
+{
+   return (int) ((unsigned) coord % size);
+}
 
 
 /**
@@ -115,133 +125,153 @@ lerp_3d(float a, float b, float c,
  * \param icoord  returns the integer texcoords
  * \return  integer texture index
  */
-static INLINE void
-nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
-                   int icoord[4])
+static void
+wrap_nearest_repeat(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      /* s limited to [0,1) */
-      /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch] * size);
-         icoord[ch] = REMAINDER(i, size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP:
+   /* s limited to [0,1) */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch] * size);
+      icoord[ch] = repeat(i, size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [0,1] */
+   /* i limited to [0,size-1] */
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= 0.0F)
+         icoord[ch] = 0;
+      else if (s[ch] >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_edge(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] < min)
+         icoord[ch] = 0;
+      else if (s[ch] > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_clamp_to_border(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [-1, size] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      if (s[ch] <= min)
+         icoord[ch] = -1;
+      else if (s[ch] >= max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(s[ch] * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_repeat(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp(const float s[4], unsigned size, int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
       /* s limited to [0,1] */
       /* i limited to [0,size-1] */
-      for (ch = 0; ch < 4; ch++) {
-         if (s[ch] <= 0.0F)
-            icoord[ch] = 0;
-         else if (s[ch] >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(s[ch] * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] < min)
-               icoord[ch] = 0;
-            else if (s[ch] > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [-1, size] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            if (s[ch] <= min)
-               icoord[ch] = -1;
-            else if (s[ch] >= max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(s[ch] * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      {
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const int flr = util_ifloor(s[ch]);
-            float u;
-            if (flr & 1)
-               u = 1.0F - (s[ch] - (float) flr);
-            else
-               u = s[ch] - (float) flr;
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* s limited to [0,1] */
-         /* i limited to [0,size-1] */
-         const float u = fabsf(s[ch]);
-         if (u <= 0.0F)
-            icoord[ch] = 0;
-         else if (u >= 1.0F)
-            icoord[ch] = size - 1;
-         else
-            icoord[ch] = util_ifloor(u * size);
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = 1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = 0;
-            else if (u > max)
-               icoord[ch] = size - 1;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         /* s limited to [min,max] */
-         /* i limited to [0, size-1] */
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            const float u = fabsf(s[ch]);
-            if (u < min)
-               icoord[ch] = -1;
-            else if (u > max)
-               icoord[ch] = size;
-            else
-               icoord[ch] = util_ifloor(u * size);
-         }
-      }
-      return;
-   default:
-      assert(0);
+      const float u = fabsf(s[ch]);
+      if (u <= 0.0F)
+         icoord[ch] = 0;
+      else if (u >= 1.0F)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                  int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [0, size-1] */
+   const float min = 1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = 0;
+      else if (u > max)
+         icoord[ch] = size - 1;
+      else
+         icoord[ch] = util_ifloor(u * size);
+   }
+}
+
+
+static void
+wrap_nearest_mirror_clamp_to_border(const float s[4], unsigned size,
+                                    int icoord[4])
+{
+   uint ch;
+   /* s limited to [min,max] */
+   /* i limited to [-1, size] */
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   for (ch = 0; ch < 4; ch++) {
+      const float u = fabsf(s[ch]);
+      if (u < min)
+         icoord[ch] = -1;
+      else if (u > max)
+         icoord[ch] = size;
+      else
+         icoord[ch] = util_ifloor(u * size);
    }
 }
 
@@ -256,125 +286,156 @@ nearest_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * \param w  returns blend factor/weight between texture indexes
  * \param icoord  returns the computed integer texture coords
  */
-static INLINE void
-linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
+static void
+wrap_linear_repeat(const float s[4], unsigned size,
+                   int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = s[ch] * size - 0.5F;
+      icoord0[ch] = repeat(util_ifloor(u), size);
+      icoord1[ch] = repeat(icoord0[ch] + 1, size);
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp(const float s[4], unsigned size,
                   int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
 
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         float u = s[ch] * size - 0.5F;
-         icoord0[ch] = REMAINDER(util_ifloor(u), size);
-         icoord1[ch] = REMAINDER(icoord0[ch] + 1, size);
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.0F, 1.0F);
-         u = u * size - 0.5f;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = CLAMP(s[ch], min, max);
-            u = u * size - 0.5f;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_REPEAT:
-      for (ch = 0; ch < 4; ch++) {
-         const int flr = util_ifloor(s[ch]);
-         float u;
-         if (flr & 1)
-            u = 1.0F - (s[ch] - (float) flr);
-         else
-            u = s[ch] - (float) flr;
-         u = u * size - 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
-      for (ch = 0; ch < 4; ch++) {
-         float u = fabsf(s[ch]);
-         if (u >= 1.0F)
-            u = (float) size;
-         else
-            u *= size;
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord0[ch] < 0)
-            icoord0[ch] = 0;
-         if (icoord1[ch] >= (int) size)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;;
-   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
-      {
-         const float min = -1.0F / (2.0F * size);
-         const float max = 1.0F - min;
-         for (ch = 0; ch < 4; ch++) {
-            float u = fabsf(s[ch]);
-            if (u <= min)
-               u = min * size;
-            else if (u >= max)
-               u = max * size;
-            else
-               u *= size;
-            u -= 0.5F;
-            icoord0[ch] = util_ifloor(u);
-            icoord1[ch] = icoord0[ch] + 1;
-            w[ch] = FRAC(u);
-         }
-      }
-      break;;
-   default:
-      assert(0);
+
+static void
+wrap_linear_clamp_to_edge(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.0F, 1.0F);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_clamp_to_border(const float s[4], unsigned size,
+                            int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], min, max);
+      u = u * size - 0.5f;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_repeat(const float s[4], unsigned size,
+                          int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      const int flr = util_ifloor(s[ch]);
+      float u;
+      if (flr & 1)
+         u = 1.0F - (s[ch] - (float) flr);
+      else
+         u = s[ch] - (float) flr;
+      u = u * size - 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp(const float s[4], unsigned size,
+                         int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_edge(const float s[4], unsigned size,
+                                 int icoord0[4], int icoord1[4], float w[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u >= 1.0F)
+         u = (float) size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord0[ch] < 0)
+         icoord0[ch] = 0;
+      if (icoord1[ch] >= (int) size)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+static void
+wrap_linear_mirror_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord0[4], int icoord1[4], float w[4])
+{
+   const float min = -1.0F / (2.0F * size);
+   const float max = 1.0F - min;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = fabsf(s[ch]);
+      if (u <= min)
+         u = min * size;
+      else if (u >= max)
+         u = max * size;
+      else
+         u *= size;
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
@@ -383,27 +444,27 @@ linear_texcoord_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                          int icoord[4])
+static void
+wrap_nearest_unorm_clamp(const float s[4], unsigned size, int icoord[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         int i = util_ifloor(s[ch]);
-         icoord[ch]= CLAMP(i, 0, (int) size-1);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
-      }
-      return;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      int i = util_ifloor(s[ch]);
+      icoord[ch]= CLAMP(i, 0, (int) size-1);
+   }
+}
+
+
+/**
+ * Handles clamp_to_edge and clamp_to_border:
+ */
+static void
+wrap_nearest_unorm_clamp_to_border(const float s[4], unsigned size,
+                                   int icoord[4])
+{
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      icoord[ch]= util_ifloor( CLAMP(s[ch], 0.5F, (float) size - 0.5F) );
    }
 }
 
@@ -412,358 +473,971 @@ nearest_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
  * For RECT textures / unnormalized texcoords.
  * Only a subset of wrap modes supported.
  */
-static INLINE void
-linear_texcoord_unnorm_4(unsigned wrapMode, const float s[4], unsigned size,
-                         int icoord0[4], int icoord1[4], float w[4])
+static void
+wrap_linear_unorm_clamp(const float s[4], unsigned size,
+                        int icoord0[4], int icoord1[4], float w[4])
 {
    uint ch;
-   switch (wrapMode) {
-   case PIPE_TEX_WRAP_CLAMP:
-      for (ch = 0; ch < 4; ch++) {
-         /* Not exactly what the spec says, but it matches NVIDIA output */
-         float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         w[ch] = FRAC(u);
-      }
-      return;
-   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
-      /* fall-through */
-   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
-      for (ch = 0; ch < 4; ch++) {
-         float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
-         u -= 0.5F;
-         icoord0[ch] = util_ifloor(u);
-         icoord1[ch] = icoord0[ch] + 1;
-         if (icoord1[ch] > (int) size - 1)
-            icoord1[ch] = size - 1;
-         w[ch] = FRAC(u);
-      }
-      break;
-   default:
-      assert(0);
+   for (ch = 0; ch < 4; ch++) {
+      /* Not exactly what the spec says, but it matches NVIDIA output */
+      float u = CLAMP(s[ch] - 0.5F, 0.0f, (float) size - 1.0f);
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      w[ch] = frac(u);
    }
 }
 
 
-static unsigned
-choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+static void
+wrap_linear_unorm_clamp_to_border(const float s[4], unsigned size,
+                                  int icoord0[4], int icoord1[4], float w[4])
 {
-   /*
-      major axis
-      direction     target                             sc     tc    ma
-      ----------    -------------------------------    ---    ---   ---
-       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
-       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
-       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
-       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
-       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
-       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
-   */
-   const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
-   unsigned face;
-   float sc, tc, ma;
-
-   if (arx >= ary && arx >= arz) {
-      if (rx >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_X;
-         sc = -rz;
-         tc = -ry;
-         ma = arx;
+   uint ch;
+   for (ch = 0; ch < 4; ch++) {
+      float u = CLAMP(s[ch], 0.5F, (float) size - 0.5F);
+      u -= 0.5F;
+      icoord0[ch] = util_ifloor(u);
+      icoord1[ch] = icoord0[ch] + 1;
+      if (icoord1[ch] > (int) size - 1)
+         icoord1[ch] = size - 1;
+      w[ch] = frac(u);
+   }
+}
+
+
+
+/**
+ * Examine the quad's texture coordinates to compute the partial
+ * derivatives w.r.t X and Y, then compute lambda (level of detail).
+ */
+static float
+compute_lambda_1d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float rho = MAX2(dsdx, dsdy) * texture->width[0];
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_2d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float rho  = MAX2(maxx, maxy);
+   float lambda;
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+static float
+compute_lambda_3d(const struct sp_sampler_varient *samp,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias)
+{
+   const struct pipe_texture *texture = samp->texture;
+   const struct pipe_sampler_state *sampler = samp->sampler;
+   float dsdx = fabsf(s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT]);
+   float dsdy = fabsf(s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT]);
+   float dtdx = fabsf(t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT]);
+   float dtdy = fabsf(t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT]);
+   float dpdx = fabsf(p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT]);
+   float dpdy = fabsf(p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT]);
+   float maxx = MAX2(dsdx, dsdy) * texture->width[0];
+   float maxy = MAX2(dtdx, dtdy) * texture->height[0];
+   float maxz = MAX2(dpdx, dpdy) * texture->depth[0];
+   float rho, lambda;
+
+   rho = MAX2(maxx, maxy);
+   rho = MAX2(rho, maxz);
+
+   lambda = util_fast_log2(rho);
+   lambda += lodbias + sampler->lod_bias;
+   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
+
+   return lambda;
+}
+
+
+/**
+ * Compute lambda for a vertex texture sampler.
+ * Since there aren't derivatives to use, just return the LOD bias.
+ */
+static float
+compute_lambda_vert(const struct sp_sampler_varient *samp,
+                    const float s[QUAD_SIZE],
+                    const float t[QUAD_SIZE],
+                    const float p[QUAD_SIZE],
+                    float lodbias)
+{
+   return lodbias;
+}
+
+
+
+/**
+ * Get a texel from a texture, using the texture tile cache.
+ *
+ * \param addr  the template tex address containing cube, z, face info.
+ * \param x  the x coord of texel within 2D image
+ * \param y  the y coord of texel within 2D image
+ * \return  pointer to the texel's color values within the cached tile
+ *
+ * XXX maybe move this into sp_tex_tile_cache.c and merge with the
+ * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
+ */
+
+
+
+
+static INLINE const float *
+get_texel_2d_no_border(const struct sp_sampler_varient *samp,
+		       union tex_tile_address addr, int x, int y)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_2d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_2d_no_border( samp, addr, x, y );
+   }
+}
+
+
+/* Gather a quad of adjacent texels within a tile:
+ */
+static INLINE void
+get_texel_quad_2d_no_border_single_tile(const struct sp_sampler_varient *samp,
+					union tex_tile_address addr, 
+					unsigned x, unsigned y, 
+					const float *out[4])
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+      
+   out[0] = &tile->data.color[y  ][x  ][0];
+   out[1] = &tile->data.color[y  ][x+1][0];
+   out[2] = &tile->data.color[y+1][x  ][0];
+   out[3] = &tile->data.color[y+1][x+1][0];
+}
+
+
+/* Gather a quad of potentially non-adjacent texels:
+ */
+static INLINE void
+get_texel_quad_2d_no_border(const struct sp_sampler_varient *samp,
+			    union tex_tile_address addr,
+			    int x0, int y0, 
+			    int x1, int y1,
+			    const float *out[4])
+{
+   out[0] = get_texel_2d_no_border( samp, addr, x0, y0 );
+   out[1] = get_texel_2d_no_border( samp, addr, x1, y0 );
+   out[2] = get_texel_2d_no_border( samp, addr, x0, y1 );
+   out[3] = get_texel_2d_no_border( samp, addr, x1, y1 );
+}
+
+/* Can involve a lot of unnecessary checks for border color:
+ */
+static INLINE void
+get_texel_quad_2d(const struct sp_sampler_varient *samp,
+		  union tex_tile_address addr,
+		  int x0, int y0, 
+		  int x1, int y1,
+		  const float *out[4])
+{
+   out[0] = get_texel_2d( samp, addr, x0, y0 );
+   out[1] = get_texel_2d( samp, addr, x1, y0 );
+   out[3] = get_texel_2d( samp, addr, x1, y1 );
+   out[2] = get_texel_2d( samp, addr, x0, y1 );
+}
+
+
+
+/* 3d variants:
+ */
+static INLINE const float *
+get_texel_3d_no_border(const struct sp_sampler_varient *samp,
+                       union tex_tile_address addr, int x, int y, int z)
+{
+   const struct softpipe_tex_cached_tile *tile;
+
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   y %= TILE_SIZE;
+   x %= TILE_SIZE;
+
+   tile = sp_get_cached_tile_tex(samp->cache, addr);
+
+   return &tile->data.color[y][x][0];
+}
+
+
+static INLINE const float *
+get_texel_3d(const struct sp_sampler_varient *samp,
+	     union tex_tile_address addr, int x, int y, int z)
+{
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level = addr.bits.level;
+
+   if (x < 0 || x >= (int) texture->width[level] ||
+       y < 0 || y >= (int) texture->height[level] ||
+       z < 0 || z >= (int) texture->depth[level]) {
+      return samp->sampler->border_color;
+   }
+   else {
+      return get_texel_3d_no_border( samp, addr, x, y, z );
+   }
+}
+
+
+/**
+ * Given the logbase2 of a mipmap's base level size and a mipmap level,
+ * return the size (in texels) of that mipmap level.
+ * For example, if level[0].width = 256 then base_pot will be 8.
+ * If level = 2, then we'll return 64 (the width at level=2).
+ * Return 1 if level > base_pot.
+ */
+static INLINE unsigned
+pot_level_size(unsigned base_pot, unsigned level)
+{
+   return (base_pot >= level) ? (1 << (base_pot - level)) : 1;
+}
+
+
+/* Some image-filter fastpaths:
+ */
+static INLINE void
+img_filter_2d_linear_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                float lodbias,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   unsigned xmax = (xpot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, xpot) - 1; */
+   unsigned ymax = (ypot - 1) & (TILE_SIZE - 1); /* MIN2(TILE_SIZE, ypot) - 1; */
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot - 0.5F;
+      float v = t[j] * ypot - 0.5F;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      float xw = u - (float)uflr;
+      float yw = v - (float)vflr;
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *tx[4];      
+      
+      /* Can we fetch all four at once:
+       */
+      if (x0 < xmax && y0 < ymax) {
+         get_texel_quad_2d_no_border_single_tile(samp, addr, x0, y0, tx);
       }
       else {
-         face = PIPE_TEX_FACE_NEG_X;
-         sc = rz;
-         tc = -ry;
-         ma = arx;
+         unsigned x1 = (x0 + 1) & (xpot - 1);
+         unsigned y1 = (y0 + 1) & (ypot - 1);
+         get_texel_quad_2d_no_border(samp, addr, x0, y0, x1, y1, tx);
+      }
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw, yw, 
+                              tx[0][c], tx[1][c], 
+                              tx[2][c], tx[3][c]);
       }
    }
-   else if (ary >= arx && ary >= arz) {
-      if (ry >= 0.0F) {
-         face = PIPE_TEX_FACE_POS_Y;
-         sc = rx;
-         tc = rz;
-         ma = ary;
+}
+
+
+static INLINE void
+img_filter_2d_nearest_repeat_POT(struct tgsi_sampler *tgsi_sampler,
+                                 const float s[QUAD_SIZE],
+                                 const float t[QUAD_SIZE],
+                                 const float p[QUAD_SIZE],
+                                 float lodbias,
+                                 float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int uflr = util_ifloor(u);
+      int vflr = util_ifloor(v);
+
+      int x0 = uflr & (xpot - 1);
+      int y0 = vflr & (ypot - 1);
+
+      const float *out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Y;
-         sc = rx;
-         tc = -rz;
-         ma = ary;
+   }
+}
+
+
+static INLINE void
+img_filter_2d_nearest_clamp_POT(struct tgsi_sampler *tgsi_sampler,
+                                const float s[QUAD_SIZE],
+                                const float t[QUAD_SIZE],
+                                const float p[QUAD_SIZE],
+                                float lodbias,
+                                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   unsigned  j;
+   unsigned level = samp->level;
+   unsigned xpot = pot_level_size(samp->xpot, level);
+   unsigned ypot = pot_level_size(samp->ypot, level);
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+
+      float u = s[j] * xpot;
+      float v = t[j] * ypot;
+
+      int x0, y0;
+      const float *out;
+
+      x0 = util_ifloor(u);
+      if (x0 < 0) 
+         x0 = 0;
+      else if (x0 > xpot - 1)
+         x0 = xpot - 1;
+
+      y0 = util_ifloor(v);
+      if (y0 < 0) 
+         y0 = 0;
+      else if (y0 > ypot - 1)
+         y0 = ypot - 1;
+      
+      out = get_texel_2d_no_border(samp, addr, x0, y0);
+
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
    }
-   else {
-      if (rz > 0.0F) {
-         face = PIPE_TEX_FACE_POS_Z;
-         sc = rx;
-         tc = -ry;
-         ma = arz;
+}
+
+
+static void
+img_filter_1d_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        float lodbias,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x[4];
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], 0);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
-      else {
-         face = PIPE_TEX_FACE_NEG_Z;
-         sc = -rx;
-         tc = -ry;
-         ma = arz;
+   }
+}
+
+
+static void
+img_filter_2d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      float lodbias,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];
+   union tex_tile_address addr;
+
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, addr, x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
       }
    }
+}
 
-   *newS = ( sc / ma + 1.0F ) * 0.5F;
-   *newT = ( tc / ma + 1.0F ) * 0.5F;
 
-   return face;
+static INLINE union tex_tile_address
+face(union tex_tile_address addr, unsigned face )
+{
+   addr.bits.face = face;
+   return addr;
 }
 
 
-/**
- * Examine the quad's texture coordinates to compute the partial
- * derivatives w.r.t X and Y, then compute lambda (level of detail).
- *
- * This is only done for fragment shaders, not vertex shaders.
- */
-static float
-compute_lambda(const struct pipe_texture *tex,
-               const struct pipe_sampler_state *sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               float lodbias)
+static void
+img_filter_cube_nearest(struct tgsi_sampler *tgsi_sampler,
+                        const float s[QUAD_SIZE],
+                        const float t[QUAD_SIZE],
+                        const float p[QUAD_SIZE],
+                        float lodbias,
+                        float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   float rho, lambda;
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x[4], y[4];
+   union tex_tile_address addr;
 
-   assert(sampler->normalized_coords);
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
 
-   assert(s);
-   {
-      float dsdx = s[QUAD_BOTTOM_RIGHT] - s[QUAD_BOTTOM_LEFT];
-      float dsdy = s[QUAD_TOP_LEFT]     - s[QUAD_BOTTOM_LEFT];
-      dsdx = fabsf(dsdx);
-      dsdy = fabsf(dsdy);
-      rho = MAX2(dsdx, dsdy) * tex->width[0];
-   }
-   if (t) {
-      float dtdx = t[QUAD_BOTTOM_RIGHT] - t[QUAD_BOTTOM_LEFT];
-      float dtdy = t[QUAD_TOP_LEFT]     - t[QUAD_BOTTOM_LEFT];
-      float max;
-      dtdx = fabsf(dtdx);
-      dtdy = fabsf(dtdy);
-      max = MAX2(dtdx, dtdy) * tex->height[0];
-      rho = MAX2(rho, max);
+   assert(width > 0);
+   assert(height > 0);
+ 
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->nearest_texcoord_s(s, width, x);
+   samp->nearest_texcoord_t(t, height, y);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_2d(samp, face(addr, faces[j]), x[j], y[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
+      }
    }
-   if (p) {
-      float dpdx = p[QUAD_BOTTOM_RIGHT] - p[QUAD_BOTTOM_LEFT];
-      float dpdy = p[QUAD_TOP_LEFT]     - p[QUAD_BOTTOM_LEFT];
-      float max;
-      dpdx = fabsf(dpdx);
-      dpdy = fabsf(dpdy);
-      max = MAX2(dpdx, dpdy) * tex->depth[0];
-      rho = MAX2(rho, max);
+}
+
+
+static void
+img_filter_3d_nearest(struct tgsi_sampler *tgsi_sampler,
+                      const float s[QUAD_SIZE],
+                      const float t[QUAD_SIZE],
+                      const float p[QUAD_SIZE],
+                      float lodbias,
+                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x[4], y[4], z[4];
+   union tex_tile_address addr;
+
+   level0 = samp->level;
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->nearest_texcoord_s(s, width,  x);
+   samp->nearest_texcoord_t(t, height, y);
+   samp->nearest_texcoord_p(p, depth,  z);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *out = get_texel_3d(samp, addr, x[j], y[j], z[j]);
+      int c;
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = out[c];
+      }      
    }
+}
 
-   lambda = util_fast_log2(rho);
-   lambda += lodbias + sampler->lod_bias;
-   lambda = CLAMP(lambda, sampler->min_lod, sampler->max_lod);
 
-   return lambda;
+static void
+img_filter_1d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],   /* not read by this filter */
+                     const float p[QUAD_SIZE],   /* not read by this filter */
+                     float lodbias,              /* not read by this filter */
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width;
+   int x0[4], x1[4];   /* the two texel columns blended for each pixel */
+   float xw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;   /* mip level is taken from the sampler variant */
+   width = texture->width[level0];
+
+   assert(width > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width, x0, x1, xw);   /* S wrap callback supplies x0, x1, xw */
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], 0);   /* row index fixed at 0 */
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], 0);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp(xw[j], tx0[c], tx1[c]);
+      }
+   }
 }
 
 
-/**
- * Do several things here:
- * 1. Compute lambda from the texcoords, if needed
- * 2. Determine if we're minifying or magnifying
- * 3. If minifying, choose mipmap levels
- * 4. Return image filter to use within mipmap images
- * \param level0  Returns first mipmap level to sample from
- * \param level1  Returns second mipmap level to sample from
- * \param levelBlend  Returns blend factor between levels, in [0,1]
- * \param imgFilter  Returns either the min or mag filter, depending on lambda
- */
 static void
-choose_mipmap_levels(const struct pipe_texture *texture,
-                     const struct pipe_sampler_state *sampler,
+img_filter_2d_linear(struct tgsi_sampler *tgsi_sampler,
                      const float s[QUAD_SIZE],
                      const float t[QUAD_SIZE],
                      const float p[QUAD_SIZE],
-                     boolean computeLambda,
                      float lodbias,
-                     unsigned *level0, unsigned *level1, float *levelBlend,
-                     unsigned *imgFilter)
-{
-   if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NONE) {
-      /* no mipmap selection needed */
-      *level0 = *level1 = CLAMP((int) sampler->min_lod,
-                                0, (int) texture->last_level);
-
-      if (sampler->min_img_filter != sampler->mag_img_filter) {
-         /* non-mipmapped texture, but still need to determine if doing
-          * minification or magnification.
-          */
-         float lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-         if (lambda <= 0.0) {
-            *imgFilter = sampler->mag_img_filter;
-         }
-         else {
-            *imgFilter = sampler->min_img_filter;
-         }
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{  /* 2x2 linear filter within mip level samp->level; p and lodbias are not read */
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];   /* texel coordinates of the 2x2 footprint per pixel */
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;   /* mip level is taken from the sampler variant */
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);   /* S wrap callback supplies x0, x1, xw */
+   samp->linear_texcoord_t(t, height, y0, y1, yw);   /* T wrap callback supplies y0, y1, yw */
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      const float *tx0 = get_texel_2d(samp, addr, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addr, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addr, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addr, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
-      else {
-         *imgFilter = sampler->mag_img_filter;
+   }
+}
+
+
+static void
+img_filter_cube_linear(struct tgsi_sampler *tgsi_sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],   /* not read by this filter */
+                       float lodbias,              /* not read by this filter */
+                       float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   const unsigned *faces = samp->faces; /* zero when not cube-mapping */
+   unsigned level0, j;
+   int width, height;
+   int x0[4], y0[4], x1[4], y1[4];   /* texel coordinates of the 2x2 footprint per pixel */
+   float xw[4], yw[4]; /* weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;   /* mip level is taken from the sampler variant */
+   width = texture->width[level0];
+   height = texture->height[level0];
+
+   assert(width > 0);
+   assert(height > 0);
+
+   addr.value = 0;
+   addr.bits.level = samp->level;
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      union tex_tile_address addrj = face(addr, faces[j]);   /* per-pixel address; presumably addr with the face selected -- see face() */
+      const float *tx0 = get_texel_2d(samp, addrj, x0[j], y0[j]);
+      const float *tx1 = get_texel_2d(samp, addrj, x1[j], y0[j]);
+      const float *tx2 = get_texel_2d(samp, addrj, x0[j], y1[j]);
+      const float *tx3 = get_texel_2d(samp, addrj, x1[j], y1[j]);
+      int c;
+
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_2d(xw[j], yw[j],
+                              tx0[c], tx1[c],
+                              tx2[c], tx3[c]);
       }
    }
-   else {
-      float lambda;
+}
 
-      if (computeLambda)
-         /* fragment shader */
-         lambda = compute_lambda(texture, sampler, s, t, p, lodbias);
-      else
-         /* vertex shader */
-         lambda = lodbias; /* not really a bias, but absolute LOD */
 
-      if (lambda <= 0.0) { /* XXX threshold depends on the filter */
-         /* magnifying */
-         *imgFilter = sampler->mag_img_filter;
-         *level0 = *level1 = 0;
+static void
+img_filter_3d_linear(struct tgsi_sampler *tgsi_sampler,
+                     const float s[QUAD_SIZE],
+                     const float t[QUAD_SIZE],
+                     const float p[QUAD_SIZE],
+                     float lodbias,              /* not read by this filter */
+                     float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   const struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_texture *texture = samp->texture;
+   unsigned level0, j;
+   int width, height, depth;
+   int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
+   float xw[4], yw[4], zw[4]; /* interpolation weights */
+   union tex_tile_address addr;
+
+   level0 = samp->level;   /* mip level is taken from the sampler variant */
+   width = texture->width[level0];
+   height = texture->height[level0];
+   depth = texture->depth[level0];
+
+   addr.value = 0;
+   addr.bits.level = level0;
+
+   assert(width > 0);
+   assert(height > 0);
+   assert(depth > 0);
+
+   samp->linear_texcoord_s(s, width,  x0, x1, xw);
+   samp->linear_texcoord_t(t, height, y0, y1, yw);
+   samp->linear_texcoord_p(p, depth,  z0, z1, zw);
+
+   for (j = 0; j < QUAD_SIZE; j++) {
+      int c;
+      /* four (x,y) corners fetched with z0 */
+      const float *tx00 = get_texel_3d(samp, addr, x0[j], y0[j], z0[j]);
+      const float *tx01 = get_texel_3d(samp, addr, x1[j], y0[j], z0[j]);
+      const float *tx02 = get_texel_3d(samp, addr, x0[j], y1[j], z0[j]);
+      const float *tx03 = get_texel_3d(samp, addr, x1[j], y1[j], z0[j]);
+      /* the same four (x,y) corners fetched with z1 */
+      const float *tx10 = get_texel_3d(samp, addr, x0[j], y0[j], z1[j]);
+      const float *tx11 = get_texel_3d(samp, addr, x1[j], y0[j], z1[j]);
+      const float *tx12 = get_texel_3d(samp, addr, x0[j], y1[j], z1[j]);
+      const float *tx13 = get_texel_3d(samp, addr, x1[j], y1[j], z1[j]);
+      
+      /* interpolate R, G, B, A */
+      for (c = 0; c < 4; c++) {
+         rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
+                              tx00[c], tx01[c],
+                              tx02[c], tx03[c],
+                              tx10[c], tx11[c],
+                              tx12[c], tx13[c]);
       }
-      else {
-         /* minifying */
-         *imgFilter = sampler->min_img_filter;
-
-         /* choose mipmap level(s) and compute the blend factor between them */
-         if (sampler->min_mip_filter == PIPE_TEX_MIPFILTER_NEAREST) {
-            /* Nearest mipmap level */
-            const int lvl = (int) (lambda + 0.5);
-            *level0 =
-            *level1 = CLAMP(lvl, 0, (int) texture->last_level);
-         }
-         else {
-            /* Linear interpolation between mipmap levels */
-            const int lvl = (int) lambda;
-            *level0 = CLAMP(lvl,     0, (int) texture->last_level);
-            *level1 = CLAMP(lvl + 1, 0, (int) texture->last_level);
-            *levelBlend = FRAC(lambda);  /* blending weight between levels */
+   }
+}
+
+
+static void
+mip_filter_linear(struct tgsi_sampler *tgsi_sampler,
+                  const float s[QUAD_SIZE],
+                  const float t[QUAD_SIZE],
+                  const float p[QUAD_SIZE],
+                  float lodbias,
+                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);   /* non-const: samp->level is written below */
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);   /* one lambda for the whole quad */
+   level0 = (int)lambda;   /* truncates toward zero */
+
+   if (lambda < 0.0) {   /* negative lambda: mag filter on level 0 only */
+      samp->level = 0;
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else if (level0 >= texture->last_level) {   /* at or past the last level: min filter on it alone */
+      samp->level = texture->last_level;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else {
+      float levelBlend = lambda - level0;   /* fractional part, since lambda >= 0 here */
+      float rgba0[4][4];   /* min-filtered result from level0 */
+      float rgba1[4][4];   /* min-filtered result from level0+1 */
+      int c,j;
+
+      samp->level = level0;
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;   /* samp->level is left at level0+1 on return */
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
          }
       }
    }
 }
 
 
-/**
- * Get a texel from a texture, using the texture tile cache.
- *
- * \param face  the cube face in 0..5
- * \param level  the mipmap level
- * \param x  the x coord of texel within 2D image
- * \param y  the y coord of texel within 2D image
- * \param z  which slice of a 3D texture
- * \param rgba  the quad to put the texel/color into
- * \param j  which element of the rgba quad to write to
- *
- * XXX maybe move this into sp_tile_cache.c and merge with the
- * sp_get_cached_tile_tex() function.  Also, get 4 texels instead of 1...
- */
 static void
-get_texel(const struct tgsi_sampler *tgsi_sampler,
-          unsigned face, unsigned level, int x, int y, int z,
-          float rgba[NUM_CHANNELS][QUAD_SIZE], unsigned j)
+mip_filter_nearest(struct tgsi_sampler *tgsi_sampler,
+                   const float s[QUAD_SIZE],
+                   const float t[QUAD_SIZE],
+                   const float p[QUAD_SIZE],
+                   float lodbias,
+                   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);   /* non-const: samp->level is written below */
+   const struct pipe_texture *texture = samp->texture;
+   float lambda;
 
-   if (x < 0 || x >= (int) texture->width[level] ||
-       y < 0 || y >= (int) texture->height[level] ||
-       z < 0 || z >= (int) texture->depth[level]) {
-      rgba[0][j] = sampler->border_color[0];
-      rgba[1][j] = sampler->border_color[1];
-      rgba[2][j] = sampler->border_color[2];
-      rgba[3][j] = sampler->border_color[3];
+   lambda = samp->compute_lambda(samp, s, t, p, lodbias);   /* one lambda for the whole quad */
+
+   if (lambda < 0.0) {   /* negative lambda: mag filter on level 0 */
+      samp->level = 0;
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
    }
    else {
-      const int tx = x % TILE_SIZE;
-      const int ty = y % TILE_SIZE;
-      const struct softpipe_cached_tile *tile
-         = sp_get_cached_tile_tex(sp, samp->cache,
-                                  x, y, z, face, level);
-      rgba[0][j] = tile->data.color[ty][tx][0];
-      rgba[1][j] = tile->data.color[ty][tx][1];
-      rgba[2][j] = tile->data.color[ty][tx][2];
-      rgba[3][j] = tile->data.color[ty][tx][3];
-      if (0)
-      {
-         debug_printf("Get texel %f %f %f %f from %s\n",
-                      rgba[0][j], rgba[1][j], rgba[2][j], rgba[3][j],
-                      pf_name(texture->format));
-      }
+      samp->level = (int)(lambda + 0.5) ;   /* round non-negative lambda to the nearest level */
+      samp->level = MIN2(samp->level, (int)texture->last_level);   /* do not go past the last level */
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+
+#if 0
+   printf("RGBA %g %g %g %g, %g %g %g %g, %g %g %g %g, %g %g %g %g\n",
+          rgba[0][0], rgba[1][0], rgba[2][0], rgba[3][0],
+          rgba[0][1], rgba[1][1], rgba[2][1], rgba[3][1],
+          rgba[0][2], rgba[1][2], rgba[2][2], rgba[3][2],
+          rgba[0][3], rgba[1][3], rgba[2][3], rgba[3][3]);
+#endif
+}
+
+
+static void
+mip_filter_none(struct tgsi_sampler *tgsi_sampler,
+                const float s[QUAD_SIZE],
+                const float t[QUAD_SIZE],
+                const float p[QUAD_SIZE],
+                float lodbias,
+                float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   float lambda = samp->compute_lambda(samp, s, t, p, lodbias);   /* used only to pick mag vs. min filter */
+   /* samp->level is not modified here; the img filter reads whatever value it already holds */
+   if (lambda < 0.0) {   /* negative lambda selects the mag filter */
+      samp->mag_img_filter( tgsi_sampler, s, t, p, 0, rgba );
+   }
+   else {
+      samp->min_img_filter( tgsi_sampler, s, t, p, 0, rgba );
    }
 }
 
 
+
 /**
- * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
- * When we sampled the depth texture, the depth value was put into all
- * RGBA channels.  We look at the red channel here.
- * \param rgba  quad of (depth) texel values
- * \param p  texture 'P' components for four pixels in quad
- * \param j  which pixel in the quad to test [0..3]
+ * Specialized mip_filter_linear: hard-wired 2d lambda calculation and
+ * 2d_linear_repeat_POT img filter.  Negative lambda samples level 0 only.
  */
-static INLINE void
-shadow_compare(const struct pipe_sampler_state *sampler,
-               float rgba[NUM_CHANNELS][QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               uint j)
+static void
+mip_filter_linear_2d_linear_repeat_POT(
+   struct tgsi_sampler *tgsi_sampler,
+   const float s[QUAD_SIZE],
+   const float t[QUAD_SIZE],
+   const float p[QUAD_SIZE],
+   float lodbias,
+   float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
-   int k;
-   switch (sampler->compare_func) {
-   case PIPE_FUNC_LESS:
-      k = p[j] < rgba[0][j];
-      break;
-   case PIPE_FUNC_LEQUAL:
-      k = p[j] <= rgba[0][j];
-      break;
-   case PIPE_FUNC_GREATER:
-      k = p[j] > rgba[0][j];
-      break;
-   case PIPE_FUNC_GEQUAL:
-      k = p[j] >= rgba[0][j];
-      break;
-   case PIPE_FUNC_EQUAL:
-      k = p[j] == rgba[0][j];
-      break;
-   case PIPE_FUNC_NOTEQUAL:
-      k = p[j] != rgba[0][j];
-      break;
-   case PIPE_FUNC_ALWAYS:
-      k = 1;
-      break;
-   case PIPE_FUNC_NEVER:
-      k = 0;
-      break;
-   default:
-      k = 0;
-      assert(0);
-      break;
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);   /* non-const: samp->level is written below */
+   const struct pipe_texture *texture = samp->texture;
+   int level0;
+   float lambda;
+
+   lambda = compute_lambda_2d(samp, s, t, p, lodbias);
+   level0 = (int)lambda;   /* truncates toward zero */
+
+   /* Test lambda < 0 directly: the cast truncates toward zero, so lambda
+    * in (-1,0) yields level0 == 0 and would blend with a negative weight. */
+   if (lambda < 0.0 || (unsigned)level0 >= texture->last_level) {
+      if (lambda < 0.0)
+         samp->level = 0;
+      else
+         samp->level = texture->last_level;
+
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba );
    }
+   else {
+      float levelBlend = lambda - level0;   /* fractional part, since lambda >= 0 here */
+      float rgba0[4][4];   /* filtered result from level0 */
+      float rgba1[4][4];   /* filtered result from level0+1 */
+      int c,j;
 
-   /* XXX returning result for default GL_DEPTH_TEXTURE_MODE = GL_LUMINANCE */
-   rgba[0][j] = rgba[1][j] = rgba[2][j] = (float) k;
-   rgba[3][j] = 1.0F;
+      samp->level = level0;
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba0 );
+
+      samp->level = level0+1;   /* samp->level is left at level0+1 on return */
+      img_filter_2d_linear_repeat_POT( tgsi_sampler, s, t, p, 0, rgba1 );
+
+      for (j = 0; j < QUAD_SIZE; j++) {
+         for (c = 0; c < 4; c++) {
+            rgba[c][j] = lerp(levelBlend, rgba0[c][j], rgba1[c][j]);
+         }
+      }
+   }
 }
 
 
+
 /**
- * As above, but do four z/texture comparisons.
+ * Do shadow/depth comparisons.
  */
-static INLINE void
-shadow_compare4(const struct pipe_sampler_state *sampler,
-                float rgba[NUM_CHANNELS][QUAD_SIZE],
-                const float p[QUAD_SIZE])
+static void
+sample_compare(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE])
 {
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);
+   const struct pipe_sampler_state *sampler = samp->sampler;
    int j, k0, k1, k2, k3;
    float val;
 
+   samp->mip_filter( tgsi_sampler, s, t, p, lodbias, rgba );
+
+   /**
+    * Compare texcoord 'p' (aka R) against texture value 'rgba[0]'
+    * When we sampled the depth texture, the depth value was put into all
+    * RGBA channels.  We look at the red channel here.
+    */
+
    /* compare four texcoords vs. four texture samples */
    switch (sampler->compare_func) {
    case PIPE_FUNC_LESS:
@@ -826,470 +1500,392 @@ shadow_compare4(const struct pipe_sampler_state *sampler,
 
 
 /**
- * Common code for sampling 1D/2D/cube textures.
- * Could probably extend for 3D...
+ * For each pixel of the quad, pick the cube face from the major axis of
+ * (s,t,p), store it in samp->faces[], then call samp->compare with face coords.
  */
 static void
-sp_get_samples_2d_common(const struct tgsi_sampler *tgsi_sampler,
-                         const float s[QUAD_SIZE],
-                         const float t[QUAD_SIZE],
-                         const float p[QUAD_SIZE],
-                         boolean computeLambda,
-                         float lodbias,
-                         float rgba[NUM_CHANNELS][QUAD_SIZE],
-                         const unsigned faces[4])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, faces[j], level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
+sample_cube(struct tgsi_sampler *tgsi_sampler,
+            const float s[QUAD_SIZE],
+            const float t[QUAD_SIZE],
+            const float p[QUAD_SIZE],
+            float lodbias,
+            float rgba[NUM_CHANNELS][QUAD_SIZE])
+{
+   struct sp_sampler_varient *samp = sp_sampler_varient(tgsi_sampler);   /* non-const: samp->faces[] is written below */
+   unsigned j;
+   float ssss[4], tttt[4];   /* per-pixel 2D coordinates within the chosen face */
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x[j], y[j], 0,
-                         rgba2, j);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare(sampler, rgba2, p, j);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
+   /*
+     major axis
+     direction     target                             sc     tc    ma
+     ----------    -------------------------------    ---    ---   ---
+     +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+     -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+     +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+     -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+     +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+     -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   for (j = 0; j < QUAD_SIZE; j++) {
+      float rx = s[j];
+      float ry = t[j];
+      float rz = p[j];
+      const float arx = fabsf(rx), ary = fabsf(ry), arz = fabsf(rz);
+      unsigned face;
+      float sc, tc, ma;   /* face-local s, face-local t, major-axis magnitude */
+
+      if (arx >= ary && arx >= arz) {   /* X is the major axis (ties go to X first) */
+         if (rx >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_X;
+            sc = -rz;
+            tc = -ry;
+            ma = arx;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_X;
+            sc = rz;
+            tc = -ry;
+            ma = arx;
          }
       }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+      else if (ary >= arx && ary >= arz) {   /* Y is the major axis */
+         if (ry >= 0.0F) {
+            face = PIPE_TEX_FACE_POS_Y;
+            sc = rx;
+            tc = rz;
+            ma = ary;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Y;
+            sc = rx;
+            tc = -rz;
+            ma = ary;
+         }
+      }
+      else {   /* otherwise Z is the major axis */
+         if (rz > 0.0F) {
+            face = PIPE_TEX_FACE_POS_Z;
+            sc = rx;
+            tc = -ry;
+            ma = arz;
+         }
+         else {
+            face = PIPE_TEX_FACE_NEG_Z;
+            sc = -rx;
+            tc = -ry;
+            ma = arz;
+         }
+      }
+
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-
-         linear_texcoord_4(sampler->wrap_s, s, width, x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, faces[j], level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, faces[j], level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
-            }
+	 const float ima = 1.0 / ma;   /* NOTE(review): ma is 0 when s = t = p = 0 -- confirm callers never pass that */
+	 ssss[j] = ( sc * ima + 1.0F ) * 0.5F;   /* sc/ma lies in [-1,1]; remap to [0,1] */
+	 tttt[j] = ( tc * ima + 1.0F ) * 0.5F;   /* same remap for tc */
+	 samp->faces[j] = face;
+      }
+   }
 
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1],
-                                    tx[c][2], tx[c][3]);
-            }
+   /* In our little pipeline, the compare stage is next.  If compare
+    * is not active, this will point somewhere deeper into the
+    * pipeline, eg. to mip_filter or even img_filter.
+    */
+   samp->compare(tgsi_sampler, ssss, tttt, NULL, lodbias, rgba);   /* the p argument is passed as NULL */
+}
 
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y0[j], 0, tx, 0);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y0[j], 0, tx, 1);
-               get_texel(tgsi_sampler, faces[j], level1, x0[j], y1[j], 0, tx, 2);
-               get_texel(tgsi_sampler, faces[j], level1, x1[j], y1[j], 0, tx, 3);
-               if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE){
-                  shadow_compare4(sampler, tx, p);
-               }
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_2d(xw[j], yw[j],
-                                        tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
-               }
-
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+
+
+static wrap_nearest_func
+get_nearest_unorm_wrap(unsigned mode)   /* mode: one of the PIPE_TEX_WRAP_x values handled below */
+{
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_unorm_clamp_to_border;   /* CLAMP_TO_EDGE and CLAMP_TO_BORDER share this function */
    default:
       assert(0);
+      return wrap_nearest_unorm_clamp;   /* reached only when assert() is compiled out */
    }
 }
 
 
-static INLINE void
-sp_get_samples_1d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_nearest_func
+get_nearest_wrap(unsigned mode)   /* mode: a PIPE_TEX_WRAP_x value; each has its own wrap function */
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   static const float tzero[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, tzero, NULL,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_nearest_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_nearest_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_nearest_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_nearest_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_nearest_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_nearest_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_nearest_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_nearest_mirror_clamp_to_border;
+   default:
+      assert(0);
+      return wrap_nearest_repeat;   /* reached only when assert() is compiled out */
+   }
 }
 
 
-static INLINE void
-sp_get_samples_2d(const struct tgsi_sampler *sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_unorm_wrap(unsigned mode)   /* mode: one of the PIPE_TEX_WRAP_x values handled below */
 {
-   static const unsigned faces[4] = {0, 0, 0, 0};
-   sp_get_samples_2d_common(sampler, s, t, p,
-                            computeLambda, lodbias, rgba, faces);
+   switch (mode) {
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_unorm_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_unorm_clamp_to_border;   /* CLAMP_TO_EDGE and CLAMP_TO_BORDER share this function */
+   default:
+      assert(0);
+      return wrap_linear_unorm_clamp;   /* reached only when assert() is compiled out */
+   }
 }
 
 
-static INLINE void
-sp_get_samples_3d(const struct tgsi_sampler *tgsi_sampler,
-                  const float s[QUAD_SIZE],
-                  const float t[QUAD_SIZE],
-                  const float p[QUAD_SIZE],
-                  boolean computeLambda,
-                  float lodbias,
-                  float rgba[NUM_CHANNELS][QUAD_SIZE])
+static wrap_linear_func
+get_linear_wrap(unsigned mode)   /* mode: a PIPE_TEX_WRAP_x value; each has its own wrap function */
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   /* get/map pipe_surfaces corresponding to 3D tex slices */
-   unsigned level0, level1, j, imgFilter;
-   int width, height, depth;
-   float levelBlend;
-   const uint face = 0;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   assert(sampler->normalized_coords);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-   depth = texture->depth[level0];
-
-   assert(width > 0);
-   assert(height > 0);
-   assert(depth > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4], z[4];
-         nearest_texcoord_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_4(sampler->wrap_t, t, height, y);
-         nearest_texcoord_4(sampler->wrap_r, p, depth, z);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], z[j], rgba, j);
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               unsigned c;
-               x[j] /= 2;
-               y[j] /= 2;
-               z[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x[j], y[j], z[j], rgba2, j);
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba2[c][j], rgba[c][j]);
-               }
-            }
-         }
-      }
-      break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
-      {
-         int x0[4], x1[4], y0[4], y1[4], z0[4], z1[4];
-         float xw[4], yw[4], zw[4]; /* interpolation weights */
-         linear_texcoord_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_4(sampler->wrap_t, t, height, y0, y1, yw);
-         linear_texcoord_4(sampler->wrap_r, p, depth,  z0, z1, zw);
-
-         for (j = 0; j < QUAD_SIZE; j++) {
-            int c;
-            float tx0[4][4], tx1[4][4];
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z0[j], tx0, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z0[j], tx0, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z0[j], tx0, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z0[j], tx0, 3);
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], z1[j], tx1, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], z1[j], tx1, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], z1[j], tx1, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], z1[j], tx1, 3);
-
-            /* interpolate R, G, B, A */
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                    tx0[c][0], tx0[c][1],
-                                    tx0[c][2], tx0[c][3],
-                                    tx1[c][0], tx1[c][1],
-                                    tx1[c][2], tx1[c][3]);
-            }
-
-            if (level0 != level1) {
-               /* get texels from second mipmap level and blend */
-               float rgba2[4][4];
-               x0[j] /= 2;
-               y0[j] /= 2;
-               z0[j] /= 2;
-               x1[j] /= 2;
-               y1[j] /= 2;
-               z1[j] /= 2;
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z0[j], tx0, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z0[j], tx0, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z0[j], tx0, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z0[j], tx0, 3);
-               get_texel(tgsi_sampler, face, level1, x0[j], y0[j], z1[j], tx1, 0);
-               get_texel(tgsi_sampler, face, level1, x1[j], y0[j], z1[j], tx1, 1);
-               get_texel(tgsi_sampler, face, level1, x0[j], y1[j], z1[j], tx1, 2);
-               get_texel(tgsi_sampler, face, level1, x1[j], y1[j], z1[j], tx1, 3);
-
-               /* interpolate R, G, B, A */
-               for (c = 0; c < 4; c++) {
-                  rgba2[c][j] = lerp_3d(xw[j], yw[j], zw[j],
-                                        tx0[c][0], tx0[c][1],
-                                        tx0[c][2], tx0[c][3],
-                                        tx1[c][0], tx1[c][1],
-                                        tx1[c][2], tx1[c][3]);
-               }
-
-               /* blend mipmap levels */
-               for (c = 0; c < NUM_CHANNELS; c++) {
-                  rgba[c][j] = lerp(levelBlend, rgba[c][j], rgba2[c][j]);
-               }
-            }
-         }
-      }
-      break;
+   switch (mode) {
+   case PIPE_TEX_WRAP_REPEAT:
+      return wrap_linear_repeat;
+   case PIPE_TEX_WRAP_CLAMP:
+      return wrap_linear_clamp;
+   case PIPE_TEX_WRAP_CLAMP_TO_EDGE:
+      return wrap_linear_clamp_to_edge;
+   case PIPE_TEX_WRAP_CLAMP_TO_BORDER:
+      return wrap_linear_clamp_to_border;
+   case PIPE_TEX_WRAP_MIRROR_REPEAT:
+      return wrap_linear_mirror_repeat;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP:
+      return wrap_linear_mirror_clamp;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_EDGE:
+      return wrap_linear_mirror_clamp_to_edge;
+   case PIPE_TEX_WRAP_MIRROR_CLAMP_TO_BORDER:
+      return wrap_linear_mirror_clamp_to_border;
    default:
       assert(0);
+      return wrap_linear_repeat;   /* reached only when assert() is compiled out */
    }
 }
 
 
-static void
-sp_get_samples_cube(const struct tgsi_sampler *sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
+static compute_lambda_func
+get_lambda_func(const union sp_sampler_key key)
 {
-   unsigned faces[QUAD_SIZE], j;
-   float ssss[4], tttt[4];
-   for (j = 0; j < QUAD_SIZE; j++) {
-      faces[j] = choose_cube_face(s[j], t[j], p[j], ssss + j, tttt + j);
+   if (key.bits.processor == TGSI_PROCESSOR_VERTEX)
+      return compute_lambda_vert;
+   
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      return compute_lambda_1d;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_CUBE:
+      return compute_lambda_2d;
+   case PIPE_TEXTURE_3D:
+      return compute_lambda_3d;
+   default:
+      assert(0);
+      return compute_lambda_1d;
    }
-   sp_get_samples_2d_common(sampler, ssss, tttt, NULL,
-                            computeLambda, lodbias, rgba, faces);
 }
 
 
-static void
-sp_get_samples_rect(const struct tgsi_sampler *tgsi_sampler,
-                    const float s[QUAD_SIZE],
-                    const float t[QUAD_SIZE],
-                    const float p[QUAD_SIZE],
-                    boolean computeLambda,
-                    float lodbias,
-                    float rgba[NUM_CHANNELS][QUAD_SIZE])
-{
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-   const uint face = 0;
-   unsigned level0, level1, j, imgFilter;
-   int width, height;
-   float levelBlend;
-
-   choose_mipmap_levels(texture, sampler, s, t, p, computeLambda, lodbias,
-                        &level0, &level1, &levelBlend, &imgFilter);
-
-   /* texture RECTS cannot be mipmapped */
-   assert(level0 == level1);
-
-   width = texture->width[level0];
-   height = texture->height[level0];
-
-   assert(width > 0);
-
-   switch (imgFilter) {
-   case PIPE_TEX_FILTER_NEAREST:
-      {
-         int x[4], y[4];
-         nearest_texcoord_unnorm_4(sampler->wrap_s, s, width, x);
-         nearest_texcoord_unnorm_4(sampler->wrap_t, t, height, y);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            get_texel(tgsi_sampler, face, level0, x[j], y[j], 0, rgba, j);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare(sampler, rgba, p, j);
-            }
-         }
-      }
+static filter_func
+get_img_filter(const union sp_sampler_key key,
+               unsigned filter,
+               const struct pipe_sampler_state *sampler)
+{
+   switch (key.bits.target) {
+   case PIPE_TEXTURE_1D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_1d_nearest;
+      else
+         return img_filter_1d_linear;
       break;
-   case PIPE_TEX_FILTER_LINEAR:
-   case PIPE_TEX_FILTER_ANISO:
+   case PIPE_TEXTURE_2D:
+      /* Try for fast path:
+       */
+      if (key.bits.is_pot &&
+          sampler->wrap_s == sampler->wrap_t &&
+          sampler->normalized_coords) 
       {
-         int x0[4], y0[4], x1[4], y1[4];
-         float xw[4], yw[4]; /* weights */
-         linear_texcoord_unnorm_4(sampler->wrap_s, s, width,  x0, x1, xw);
-         linear_texcoord_unnorm_4(sampler->wrap_t, t, height, y0, y1, yw);
-         for (j = 0; j < QUAD_SIZE; j++) {
-            float tx[4][4]; /* texels */
-            int c;
-            get_texel(tgsi_sampler, face, level0, x0[j], y0[j], 0, tx, 0);
-            get_texel(tgsi_sampler, face, level0, x1[j], y0[j], 0, tx, 1);
-            get_texel(tgsi_sampler, face, level0, x0[j], y1[j], 0, tx, 2);
-            get_texel(tgsi_sampler, face, level0, x1[j], y1[j], 0, tx, 3);
-            if (sampler->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
-               shadow_compare4(sampler, tx, p);
+         switch (sampler->wrap_s) {
+         case PIPE_TEX_WRAP_REPEAT:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_repeat_POT;
+            case PIPE_TEX_FILTER_LINEAR:
+               return img_filter_2d_linear_repeat_POT;
+            default:
+               break;
             }
-            for (c = 0; c < 4; c++) {
-               rgba[c][j] = lerp_2d(xw[j], yw[j],
-                                    tx[c][0], tx[c][1], tx[c][2], tx[c][3]);
+            break;
+         case PIPE_TEX_WRAP_CLAMP:
+            switch (filter) {
+            case PIPE_TEX_FILTER_NEAREST:
+               return img_filter_2d_nearest_clamp_POT;
+            default:
+               break;
             }
          }
       }
+      /* Otherwise use default versions:
+       */
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_2d_nearest;
+      else
+         return img_filter_2d_linear;
+      break;
+   case PIPE_TEXTURE_CUBE:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_cube_nearest;
+      else
+         return img_filter_cube_linear;
+      break;
+   case PIPE_TEXTURE_3D:
+      if (filter == PIPE_TEX_FILTER_NEAREST) 
+         return img_filter_3d_nearest;
+      else
+         return img_filter_3d_linear;
       break;
    default:
       assert(0);
+      return img_filter_1d_nearest;
    }
 }
 
 
 /**
- * Common code for vertex/fragment program texture sampling.
+ * Bind the given texture object and texture cache to the sampler varient.
  */
-static INLINE void
-sp_get_samples(struct tgsi_sampler *tgsi_sampler,
-               const float s[QUAD_SIZE],
-               const float t[QUAD_SIZE],
-               const float p[QUAD_SIZE],
-               boolean computeLambda,
-               float lodbias,
-               float rgba[NUM_CHANNELS][QUAD_SIZE])
+void
+sp_sampler_varient_bind_texture( struct sp_sampler_varient *samp,
+                                 struct softpipe_tex_tile_cache *tex_cache,
+                                 const struct pipe_texture *texture )
 {
-   const struct sp_shader_sampler *samp = sp_shader_sampler(tgsi_sampler);
-   const struct softpipe_context *sp = samp->sp;
-   const uint unit = samp->unit;
-   const struct pipe_texture *texture = sp->texture[unit];
-   const struct pipe_sampler_state *sampler = sp->sampler[unit];
-
-   if (!texture)
-      return;
+   const struct pipe_sampler_state *sampler = samp->sampler;
 
-   switch (texture->target) {
-   case PIPE_TEXTURE_1D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_1d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_2D:
-      if (sampler->normalized_coords)
-         sp_get_samples_2d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      else
-         sp_get_samples_rect(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_3D:
-      assert(sampler->normalized_coords);
-      sp_get_samples_3d(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   case PIPE_TEXTURE_CUBE:
-      assert(sampler->normalized_coords);
-      sp_get_samples_cube(tgsi_sampler, s, t, p, computeLambda, lodbias, rgba);
-      break;
-   default:
-      assert(0);
-   }
-
-#if 0 /* DEBUG */
-   {
-      int i;
-      printf("Sampled at %f, %f, %f:\n", s[0], t[0], p[0]);
-      for (i = 0; i < 4; i++) {
-         printf("Frag %d: %f %f %f %f\n", i,
-                rgba[0][i],
-                rgba[1][i],
-                rgba[2][i],
-                rgba[3][i]);
-      }
-   }
-#endif
+   samp->texture = texture;
+   samp->cache = tex_cache;
+   samp->xpot = util_unsigned_logbase2( texture->width[0] );
+   samp->ypot = util_unsigned_logbase2( texture->height[0] );
+   samp->level = CLAMP((int) sampler->min_lod, 0, (int) texture->last_level);
 }
 
 
-/**
- * Called via tgsi_sampler::get_samples() when running a fragment shader.
- * Get four filtered RGBA values from the sampler's texture.
- */
 void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE])
+sp_sampler_varient_destroy( struct sp_sampler_varient *samp )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, TRUE, lodbias, rgba);
+   FREE(samp);
 }
 
 
 /**
- * Called via tgsi_sampler::get_samples() when running a vertex shader.
- * Get four filtered RGBA values from the sampler's texture.
+ * Create a sampler varient for a given set of non-orthogonal state.
  */
-void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE])
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key )
 {
-   sp_get_samples(tgsi_sampler, s, t, p, FALSE, lodbias, rgba);
+   struct sp_sampler_varient *samp = CALLOC_STRUCT(sp_sampler_varient);
+   if (!samp)
+      return NULL;
+
+   samp->sampler = sampler;
+   samp->key = key;
+
+   /* Note that (for instance) linear_texcoord_s and
+    * nearest_texcoord_s may be active at the same time, if the
+    * sampler min_img_filter differs from its mag_img_filter.
+    */
+   if (sampler->normalized_coords) {
+      samp->linear_texcoord_s = get_linear_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_wrap( sampler->wrap_r );
+   }
+   else {
+      samp->linear_texcoord_s = get_linear_unorm_wrap( sampler->wrap_s );
+      samp->linear_texcoord_t = get_linear_unorm_wrap( sampler->wrap_t );
+      samp->linear_texcoord_p = get_linear_unorm_wrap( sampler->wrap_r );
+      
+      samp->nearest_texcoord_s = get_nearest_unorm_wrap( sampler->wrap_s );
+      samp->nearest_texcoord_t = get_nearest_unorm_wrap( sampler->wrap_t );
+      samp->nearest_texcoord_p = get_nearest_unorm_wrap( sampler->wrap_r );
+   }
+   
+   samp->compute_lambda = get_lambda_func( key );
+
+   samp->min_img_filter = get_img_filter(key, sampler->min_img_filter, sampler);
+   samp->mag_img_filter = get_img_filter(key, sampler->mag_img_filter, sampler);
+
+   switch (sampler->min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NONE:
+      if (sampler->min_img_filter == sampler->mag_img_filter) 
+         samp->mip_filter = samp->min_img_filter;         
+      else
+         samp->mip_filter = mip_filter_none;
+      break;
+
+   case PIPE_TEX_MIPFILTER_NEAREST:
+      samp->mip_filter = mip_filter_nearest;
+      break;
+
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      if (key.bits.is_pot &&
+          sampler->min_img_filter == sampler->mag_img_filter &&
+          sampler->normalized_coords &&
+          sampler->wrap_s == PIPE_TEX_WRAP_REPEAT &&
+          sampler->wrap_t == PIPE_TEX_WRAP_REPEAT &&
+          sampler->min_img_filter == PIPE_TEX_FILTER_LINEAR)
+      {
+         samp->mip_filter = mip_filter_linear_2d_linear_repeat_POT;
+      }
+      else 
+      {
+         samp->mip_filter = mip_filter_linear;
+      }
+      break;
+   }
+
+   if (sampler->compare_mode != FALSE) {
+      samp->compare = sample_compare;
+   }
+   else {
+      /* Skip compare operation by promoting the mip_filter function
+       * pointer:
+       */
+      samp->compare = samp->mip_filter;
+   }
+   
+   if (key.bits.target == PIPE_TEXTURE_CUBE) {
+      samp->base.get_samples = sample_cube;
+   }
+   else {
+      samp->faces[0] = 0;
+      samp->faces[1] = 0;
+      samp->faces[2] = 0;
+      samp->faces[3] = 0;
+
+      /* Skip cube face determination by promoting the compare
+       * function pointer:
+       */
+      samp->base.get_samples = samp->compare;
+   }
+
+   return samp;
 }
diff --git a/src/gallium/drivers/softpipe/sp_tex_sample.h b/src/gallium/drivers/softpipe/sp_tex_sample.h
index 40d8eb2c2a8..b0797711d37 100644
--- a/src/gallium/drivers/softpipe/sp_tex_sample.h
+++ b/src/gallium/drivers/softpipe/sp_tex_sample.h
@@ -31,43 +31,122 @@
 
 #include "tgsi/tgsi_exec.h"
 
+struct sp_sampler_varient;
+
+typedef void (*wrap_nearest_func)(const float s[4],
+                                  unsigned size,
+                                  int icoord[4]);
+
+typedef void (*wrap_linear_func)(const float s[4], 
+                                 unsigned size,
+                                 int icoord0[4],
+                                 int icoord1[4],
+                                 float w[4]);
+
+typedef float (*compute_lambda_func)(const struct sp_sampler_varient *sampler,
+                                     const float s[QUAD_SIZE],
+                                     const float t[QUAD_SIZE],
+                                     const float p[QUAD_SIZE],
+                                     float lodbias);
+
+typedef void (*filter_func)(struct tgsi_sampler *tgsi_sampler,
+                            const float s[QUAD_SIZE],
+                            const float t[QUAD_SIZE],
+                            const float p[QUAD_SIZE],
+                            float lodbias,
+                            float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+union sp_sampler_key {
+   struct {
+      unsigned target:3;
+      unsigned is_pot:1;
+      unsigned processor:2;
+      unsigned unit:4;
+      unsigned pad:22;
+   } bits;
+   unsigned value;
+};
 
 /**
  * Subclass of tgsi_sampler
  */
-struct sp_shader_sampler
+struct sp_sampler_varient
 {
    struct tgsi_sampler base;  /**< base class */
 
-   uint unit;
-   struct softpipe_context *sp;
-   struct softpipe_tile_cache *cache;
+   union sp_sampler_key key;
+
+   /* The owner of this struct:
+    */
+   const struct pipe_sampler_state *sampler;
+
+
+   /* Currently bound texture:
+    */
+   const struct pipe_texture *texture;
+   struct softpipe_tex_tile_cache *cache;
+
+   unsigned processor;
+
+   /* For sp_get_samples_2d_linear_POT:
+    */
+   unsigned xpot;
+   unsigned ypot;
+   unsigned level;
+
+   unsigned faces[4];
+   
+   wrap_nearest_func nearest_texcoord_s;
+   wrap_nearest_func nearest_texcoord_t;
+   wrap_nearest_func nearest_texcoord_p;
+
+   wrap_linear_func linear_texcoord_s;
+   wrap_linear_func linear_texcoord_t;
+   wrap_linear_func linear_texcoord_p;
+
+   filter_func min_img_filter;
+   filter_func mag_img_filter;
+
+   compute_lambda_func compute_lambda;
+
+   filter_func mip_filter;
+   filter_func compare;
+   
+   /* Linked list:
+    */
+   struct sp_sampler_varient *next;
 };
 
+struct sp_sampler;
 
+/* Create a sampler varient for a given set of non-orthogonal state.
+ */
+struct sp_sampler_varient *
+sp_create_sampler_varient( const struct pipe_sampler_state *sampler,
+                           const union sp_sampler_key key );
 
-static INLINE const struct sp_shader_sampler *
-sp_shader_sampler(const struct tgsi_sampler *sampler)
-{
-   return (const struct sp_shader_sampler *) sampler;
-}
+void sp_sampler_varient_bind_texture( struct sp_sampler_varient *varient,
+                                      struct softpipe_tex_tile_cache *tex_cache,
+                                      const struct pipe_texture *tex );
 
+void sp_sampler_varient_destroy( struct sp_sampler_varient * );
 
-extern void
-sp_get_samples_fragment(struct tgsi_sampler *tgsi_sampler,
-                        const float s[QUAD_SIZE],
-                        const float t[QUAD_SIZE],
-                        const float p[QUAD_SIZE],
-                        float lodbias,
-                        float rgba[NUM_CHANNELS][QUAD_SIZE]);
+
+
+static INLINE struct sp_sampler_varient *
+sp_sampler_varient(const struct tgsi_sampler *sampler)
+{
+   return (struct sp_sampler_varient *) sampler;
+}
 
 extern void
-sp_get_samples_vertex(struct tgsi_sampler *tgsi_sampler,
-                      const float s[QUAD_SIZE],
-                      const float t[QUAD_SIZE],
-                      const float p[QUAD_SIZE],
-                      float lodbias,
-                      float rgba[NUM_CHANNELS][QUAD_SIZE]);
+sp_get_samples(struct tgsi_sampler *tgsi_sampler,
+               const float s[QUAD_SIZE],
+               const float t[QUAD_SIZE],
+               const float p[QUAD_SIZE],
+               float lodbias,
+               float rgba[NUM_CHANNELS][QUAD_SIZE]);
 
 
 #endif /* SP_TEX_SAMPLE_H */
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.c b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
new file mode 100644
index 00000000000..407a22a9f4b
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.c
@@ -0,0 +1,273 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Texture tile caching.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+#include "pipe/p_inlines.h"
+#include "util/u_memory.h"
+#include "util/u_tile.h"
+#include "sp_context.h"
+#include "sp_surface.h"
+#include "sp_texture.h"
+#include "sp_tex_tile_cache.h"
+
+   
+
+struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen )
+{
+   struct softpipe_tex_tile_cache *tc;
+   uint pos;
+
+   tc = CALLOC_STRUCT( softpipe_tex_tile_cache );
+   if (tc) {
+      tc->screen = screen;
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->last_tile = &tc->entries[0]; /* any tile */
+   }
+   return tc;
+}
+
+
+void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   struct pipe_screen *screen;
+   uint pos;
+
+   for (pos = 0; pos < NUM_ENTRIES; pos++) {
+      /*assert(tc->entries[pos].x < 0);*/
+   }
+   if (tc->transfer) {
+      screen = tc->transfer->texture->screen;
+      screen->tex_transfer_destroy(tc->transfer);
+   }
+   if (tc->tex_trans) {
+      screen = tc->tex_trans->texture->screen;
+      screen->tex_transfer_destroy(tc->tex_trans);
+   }
+
+   FREE( tc );
+}
+
+
+
+
+void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans && !tc->tex_trans_map)
+      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
+}
+
+
+void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc)
+{
+   if (tc->tex_trans_map) {
+      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+      tc->tex_trans_map = NULL;
+   }
+}
+
+/**
+ * Invalidate all cached tiles for the cached texture.
+ * Should be called when the texture is modified.
+ */
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc)
+{
+   unsigned i;
+
+   assert(tc);
+   assert(tc->texture);
+
+   for (i = 0; i < NUM_ENTRIES; i++) {
+      tc->entries[i].addr.bits.invalid = 1;
+   }
+}
+
+/**
+ * Specify the texture to cache.
+ */
+void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture)
+{
+   uint i;
+
+   assert(!tc->transfer);
+
+   if (tc->texture != texture) {
+      pipe_texture_reference(&tc->texture, texture);
+
+      if (tc->tex_trans) {
+         struct pipe_screen *screen = tc->tex_trans->texture->screen;
+         
+         if (tc->tex_trans_map) {
+            screen->transfer_unmap(screen, tc->tex_trans);
+            tc->tex_trans_map = NULL;
+         }
+
+         screen->tex_transfer_destroy(tc->tex_trans);
+         tc->tex_trans = NULL;
+      }
+
+      /* mark all entries as invalid/empty */
+      /* XXX we should try to avoid this when the teximage hasn't changed */
+      for (i = 0; i < NUM_ENTRIES; i++) {
+         tc->entries[i].addr.bits.invalid = 1;
+      }
+
+      tc->tex_face = -1; /* any invalid value here */
+   }
+}
+
+
+
+
+/**
+ * Flush the texture tile cache: nothing is written back; if a texture is
+ * bound, every cached entry is marked invalid and tex_face is reset to -1.
+ */
+void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc)
+{
+   int pos;
+
+   if (tc->texture) {
+      /* caching a texture, mark all entries as empty */
+      for (pos = 0; pos < NUM_ENTRIES; pos++) {
+         tc->entries[pos].addr.bits.invalid = 1;
+      }
+      tc->tex_face = -1;
+   }
+
+}
+
+
+/**
+ * Given the texture face, level, zslice, x and y values, compute
+ * the cache entry position/index where we'd hope to find the
+ * cached texture tile.
+ * This is basically a direct-map cache.
+ * XXX There's probably lots of ways in which we can improve this.
+ */
+static INLINE uint
+tex_cache_pos( union tex_tile_address addr )
+{
+   uint entry = (addr.bits.x + 
+                 addr.bits.y * 9 + 
+                 addr.bits.z * 3 + 
+                 addr.bits.face + 
+                 addr.bits.level * 7);
+
+   return entry % NUM_ENTRIES;
+}
+
+/**
+ * Similar to sp_get_cached_tile() but for textures.
+ * Tiles are read-only and indexed with more params.
+ */
+const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                        union tex_tile_address addr )
+{
+   struct pipe_screen *screen = tc->screen;
+   struct softpipe_tex_cached_tile *tile;
+   
+   tile = tc->entries + tex_cache_pos( addr );
+
+   if (addr.value != tile->addr.value) {
+
+      /* cache miss.  Most misses are because we've invalidated the
+       * texture cache previously -- most commonly on binding a new
+       * texture.  Currently we effectively flush the cache on texture
+       * bind.
+       */
+#if 0
+      _debug_printf("miss at %u:  x=%d y=%d z=%d face=%d level=%d\n"
+                    "   tile %u:  x=%d y=%d z=%d face=%d level=%d\n",
+                    pos, x/TILE_SIZE, y/TILE_SIZE, z, face, level,
+                    pos, tile->addr.bits.x, tile->addr.bits.y, tile->z, tile->face, tile->level);
+#endif
+
+      /* check if we need to get a new transfer */
+      if (!tc->tex_trans ||
+          tc->tex_face != addr.bits.face ||
+          tc->tex_level != addr.bits.level ||
+          tc->tex_z != addr.bits.z) {
+         /* get new transfer (view into texture) */
+
+         if (tc->tex_trans) {
+            if (tc->tex_trans_map) {
+               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
+               tc->tex_trans_map = NULL;
+            }
+
+            screen->tex_transfer_destroy(tc->tex_trans);
+            tc->tex_trans = NULL;
+         }
+
+         tc->tex_trans = 
+            screen->get_tex_transfer(screen, tc->texture, 
+                                     addr.bits.face, 
+                                     addr.bits.level, 
+                                     addr.bits.z, 
+                                     PIPE_TRANSFER_READ, 0, 0,
+                                     tc->texture->width[addr.bits.level],
+                                     tc->texture->height[addr.bits.level]);
+
+         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
+
+         tc->tex_face = addr.bits.face;
+         tc->tex_level = addr.bits.level;
+         tc->tex_z = addr.bits.z;
+      }
+
+      /* get tile from the transfer (view into texture) */
+      pipe_get_tile_rgba(tc->tex_trans,
+                         addr.bits.x * TILE_SIZE, 
+                         addr.bits.y * TILE_SIZE,
+                         TILE_SIZE, TILE_SIZE,
+                         (float *) tile->data.color);
+      tile->addr = addr;
+   }
+
+   tc->last_tile = tile;
+   return tile;
+}
+
+
+
diff --git a/src/gallium/drivers/softpipe/sp_tex_tile_cache.h b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
new file mode 100644
index 00000000000..ac6886a3df1
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_tex_tile_cache.h
@@ -0,0 +1,155 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef SP_TEX_TILE_CACHE_H
+#define SP_TEX_TILE_CACHE_H
+
+
+#include "pipe/p_compiler.h"
+
+
+struct softpipe_context;
+struct softpipe_tex_tile_cache;
+
+
+/**
+ * Cache tile size (width and height). This needs to be a power of two.
+ */
+#define TILE_SIZE 64
+
+
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tex_tile_address {
+   struct {
+      unsigned x:6;             /* 4096 / TILE_SIZE */
+      unsigned y:6;             /* 4096 / TILE_SIZE */
+      unsigned z:12;            /* 4096 -- z not tiled */
+      unsigned face:3;
+      unsigned level:4;
+      unsigned invalid:1;
+   } bits;
+   unsigned value;
+};
+
+
+struct softpipe_tex_cached_tile
+{
+   union tex_tile_address addr;
+   union {
+      float color[TILE_SIZE][TILE_SIZE][4];
+   } data;
+};
+
+#define NUM_ENTRIES 50
+
+struct softpipe_tex_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct pipe_texture *texture;  /**< if caching a texture */
+   unsigned timestamp;
+
+   struct softpipe_tex_cached_tile entries[NUM_ENTRIES];
+
+   struct pipe_transfer *tex_trans;
+   void *tex_trans_map;
+   int tex_face, tex_level, tex_z;
+
+   struct softpipe_tex_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
+
+extern struct softpipe_tex_tile_cache *
+sp_create_tex_tile_cache( struct pipe_screen *screen );
+
+extern void
+sp_destroy_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+extern void
+sp_tex_tile_cache_map_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_unmap_transfers(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_tex_tile_cache_set_texture(struct softpipe_tex_tile_cache *tc,
+                          struct pipe_texture *texture);
+
+void
+sp_tex_tile_cache_validate_texture(struct softpipe_tex_tile_cache *tc);
+
+extern void
+sp_flush_tex_tile_cache(struct softpipe_tex_tile_cache *tc);
+
+
+
+extern const struct softpipe_tex_cached_tile *
+sp_find_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr );
+
+static INLINE union tex_tile_address
+tex_tile_address( unsigned x,
+		  unsigned y,
+		  unsigned z,
+		  unsigned face,
+		  unsigned level )
+{
+   union tex_tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+   addr.bits.z = z;
+   addr.bits.face = face;
+   addr.bits.level = level;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE const struct softpipe_tex_cached_tile *
+sp_get_cached_tile_tex(struct softpipe_tex_tile_cache *tc, 
+                         union tex_tile_address addr )
+{
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile_tex( tc, addr );
+}
+
+
+
+
+
+#endif /* SP_TEX_TILE_CACHE_H */
+
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 548e40c4a8c..2e6c43c7ef2 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -113,16 +113,20 @@ softpipe_displaytarget_layout(struct pipe_screen *screen,
 
 static struct pipe_texture *
 softpipe_texture_create(struct pipe_screen *screen,
-                        const struct pipe_texture *templat)
+                        const struct pipe_texture *template)
 {
    struct softpipe_texture *spt = CALLOC_STRUCT(softpipe_texture);
    if (!spt)
       return NULL;
 
-   spt->base = *templat;
+   spt->base = *template;
    pipe_reference_init(&spt->base.reference, 1);
    spt->base.screen = screen;
 
+   spt->pot = (util_is_power_of_two(template->width[0]) &&
+               util_is_power_of_two(template->height[0]) &&
+               util_is_power_of_two(template->depth[0]));
+
    if (spt->base.tex_usage & (PIPE_TEXTURE_USAGE_DISPLAY_TARGET |
                               PIPE_TEXTURE_USAGE_PRIMARY)) {
       if (!softpipe_displaytarget_layout(screen, spt))
@@ -218,6 +222,13 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
       if (ps->usage & PIPE_BUFFER_USAGE_GPU_READ)
          ps->usage |= PIPE_BUFFER_USAGE_CPU_READ;
 
+      if (ps->usage & (PIPE_BUFFER_USAGE_CPU_WRITE |
+                       PIPE_BUFFER_USAGE_GPU_WRITE)) {
+         /* Mark the surface as dirty.  The tile cache will look for this. */
+         spt->timestamp++;
+         softpipe_screen(screen)->timestamp++;
+      }
+
       ps->face = face;
       ps->level = level;
       ps->zslice = zslice;
@@ -318,27 +329,18 @@ softpipe_transfer_map( struct pipe_screen *screen,
 {
    ubyte *map, *xfer_map;
    struct softpipe_texture *spt;
-   unsigned flags = 0;
 
    assert(transfer->texture);
    spt = softpipe_texture(transfer->texture);
 
-   if (transfer->usage != PIPE_TRANSFER_READ) {
-      flags |= PIPE_BUFFER_USAGE_CPU_WRITE;
-   }
-
-   if (transfer->usage != PIPE_TRANSFER_WRITE) {
-      flags |= PIPE_BUFFER_USAGE_CPU_READ;
-   }
-
-   map = pipe_buffer_map(screen, spt->buffer, flags);
+   map = pipe_buffer_map(screen, spt->buffer, pipe_transfer_buffer_flags(transfer));
    if (map == NULL)
       return NULL;
 
    /* May want to different things here depending on read/write nature
     * of the map:
     */
-   if (transfer->texture && transfer->usage != PIPE_TRANSFER_READ) {
+   if (transfer->texture && (transfer->usage & PIPE_TRANSFER_WRITE)) {
       /* Do something to notify sharing contexts of a texture change.
        * In softpipe, that would mean flushing the texture cache.
        */
@@ -364,12 +366,64 @@ softpipe_transfer_unmap(struct pipe_screen *screen,
 
    pipe_buffer_unmap( screen, spt->buffer );
 
-   if (transfer->usage != PIPE_TRANSFER_READ) {
+   if (transfer->usage & PIPE_TRANSFER_WRITE) {
       /* Mark the texture as dirty to expire the tile caches. */
-      spt->modified = TRUE;
+      spt->timestamp++;
    }
 }
 
+static struct pipe_video_surface*
+softpipe_video_surface_create(struct pipe_screen *screen,
+                              enum pipe_video_chroma_format chroma_format,
+                              unsigned width, unsigned height)
+{
+   struct softpipe_video_surface *sp_vsfc;
+   struct pipe_texture template;
+
+   assert(screen);
+   assert(width && height);
+
+   sp_vsfc = CALLOC_STRUCT(softpipe_video_surface);
+   if (!sp_vsfc)
+      return NULL;
+
+   pipe_reference_init(&sp_vsfc->base.reference, 1);
+   sp_vsfc->base.screen = screen;
+   sp_vsfc->base.chroma_format = chroma_format;
+   /*sp_vsfc->base.surface_format = PIPE_VIDEO_SURFACE_FORMAT_VUYA;*/
+   sp_vsfc->base.width = width;
+   sp_vsfc->base.height = height;
+
+   memset(&template, 0, sizeof(struct pipe_texture));
+   template.target = PIPE_TEXTURE_2D;
+   template.format = PIPE_FORMAT_X8R8G8B8_UNORM;
+   template.last_level = 0;
+   /* vl_mpeg12_mc_renderer expects this when it's initialized with pot_buffers=true */
+   template.width[0] = util_next_power_of_two(width);
+   template.height[0] = util_next_power_of_two(height);
+   template.depth[0] = 1;
+   pf_get_block(template.format, &template.block);
+   template.tex_usage = PIPE_TEXTURE_USAGE_SAMPLER | PIPE_TEXTURE_USAGE_RENDER_TARGET;
+
+   sp_vsfc->tex = screen->texture_create(screen, &template);
+   if (!sp_vsfc->tex) {
+      FREE(sp_vsfc);
+      return NULL;
+   }
+
+   return &sp_vsfc->base;
+}
+
+
+static void
+softpipe_video_surface_destroy(struct pipe_video_surface *vsfc)
+{
+   struct softpipe_video_surface *sp_vsfc = softpipe_video_surface(vsfc);
+
+   pipe_texture_reference(&sp_vsfc->tex, NULL);
+   FREE(sp_vsfc);
+}
+
 
 void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen)
@@ -385,6 +439,9 @@ softpipe_init_screen_texture_funcs(struct pipe_screen *screen)
    screen->tex_transfer_destroy = softpipe_tex_transfer_destroy;
    screen->transfer_map = softpipe_transfer_map;
    screen->transfer_unmap = softpipe_transfer_unmap;
+
+   screen->video_surface_create = softpipe_video_surface_create;
+   screen->video_surface_destroy = softpipe_video_surface_destroy;
 }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_texture.h b/src/gallium/drivers/softpipe/sp_texture.h
index b1b6130b22c..2ef64e1e7c3 100644
--- a/src/gallium/drivers/softpipe/sp_texture.h
+++ b/src/gallium/drivers/softpipe/sp_texture.h
@@ -30,6 +30,7 @@
 
 
 #include "pipe/p_state.h"
+#include "pipe/p_video_state.h"
 
 
 struct pipe_context;
@@ -48,7 +49,11 @@ struct softpipe_texture
     */
    struct pipe_buffer *buffer;
 
-   boolean modified;
+   /* True if texture images are power-of-two in all dimensions:
+    */
+   boolean pot;
+
+   unsigned timestamp;
 };
 
 struct softpipe_transfer
@@ -58,6 +63,15 @@ struct softpipe_transfer
    unsigned long offset;
 };
 
+struct softpipe_video_surface
+{
+   struct pipe_video_surface base;
+
+   /* The data is held here:
+    */
+   struct pipe_texture *tex;
+};
+
 
 /** cast wrappers */
 static INLINE struct softpipe_texture *
@@ -72,6 +86,12 @@ softpipe_transfer(struct pipe_transfer *pt)
    return (struct softpipe_transfer *) pt;
 }
 
+static INLINE struct softpipe_video_surface *
+softpipe_video_surface(struct pipe_video_surface *pvs)
+{
+   return (struct softpipe_video_surface *) pvs;
+}
+
 
 extern void
 softpipe_init_screen_texture_funcs(struct pipe_screen *screen);
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.c b/src/gallium/drivers/softpipe/sp_tile_cache.c
index 1f9b8f1f4fb..83fb4e0d151 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.c
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 /**
- * Texture tile caching.
+ * Render target tile caching.
  *
  * Author:
  *    Brian Paul
@@ -35,38 +35,8 @@
 #include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "util/u_tile.h"
-#include "sp_context.h"
-#include "sp_surface.h"
-#include "sp_texture.h"
 #include "sp_tile_cache.h"
 
-#define NUM_ENTRIES 50
-
-
-/** XXX move these */
-#define MAX_WIDTH 2048
-#define MAX_HEIGHT 2048
-
-
-struct softpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-   struct pipe_texture *texture;  /**< if caching a texture */
-   struct softpipe_cached_tile entries[NUM_ENTRIES];
-   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
-   float clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
-
-   struct pipe_transfer *tex_trans;
-   void *tex_trans_map;
-   int tex_face, tex_level, tex_z;
-
-   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
-};
 
 
 /**
@@ -76,7 +46,7 @@ struct softpipe_tile_cache
  * a LRU replacement policy.
  */
 #define CACHE_POS(x, y) \
-   (((x) / TILE_SIZE + ((y) / TILE_SIZE) * 5) % NUM_ENTRIES)
+   (((x) + (y) * 5) % NUM_ENTRIES)
 
 
 
@@ -84,12 +54,10 @@ struct softpipe_tile_cache
  * Is the tile at (x,y) in cleared state?
  */
 static INLINE uint
-is_clear_flag_set(const uint *bitvec, int x, int y)
+is_clear_flag_set(const uint *bitvec, union tile_address addr)
 {
    int pos, bit;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bit = bitvec[pos / 32] & (1 << (pos & 31));
    return bit;
@@ -100,12 +68,10 @@ is_clear_flag_set(const uint *bitvec, int x, int y)
  * Mark the tile at (x,y) as not cleared.
  */
 static INLINE void
-clear_clear_flag(uint *bitvec, int x, int y)
+clear_clear_flag(uint *bitvec, union tile_address addr)
 {
    int pos;
-   x /= TILE_SIZE;
-   y /= TILE_SIZE;
-   pos = y * (MAX_WIDTH / TILE_SIZE) + x;
+   pos = addr.bits.y * (MAX_WIDTH / TILE_SIZE) + addr.bits.x;
    assert(pos / 32 < (MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32);
    bitvec[pos / 32] &= ~(1 << (pos & 31));
 }
@@ -116,14 +82,25 @@ sp_create_tile_cache( struct pipe_screen *screen )
 {
    struct softpipe_tile_cache *tc;
    uint pos;
+   int maxLevels, maxTexSize;
+
+   /* sanity checking: make sure MAX_WIDTH/HEIGHT >= largest texture image */
+   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
+   maxTexSize = 1 << (maxLevels - 1);
+   assert(MAX_WIDTH >= maxTexSize);
 
    tc = CALLOC_STRUCT( softpipe_tile_cache );
    if (tc) {
       tc->screen = screen;
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x =
-         tc->entries[pos].y = -1;
+         tc->entries[pos].addr.bits.invalid = 1;
       }
+      tc->last_tile = &tc->entries[0]; /* any tile */
+
+#if TILE_CLEAR_OPTIMIZATION
+      /* set flags to indicate all the tiles are cleared */
+      memset(tc->clear_flags, 255, sizeof(tc->clear_flags));
+#endif
    }
    return tc;
 }
@@ -142,10 +119,6 @@ sp_destroy_tile_cache(struct softpipe_tile_cache *tc)
       screen = tc->transfer->texture->screen;
       screen->tex_transfer_destroy(tc->transfer);
    }
-   if (tc->tex_trans) {
-      screen = tc->tex_trans->texture->screen;
-      screen->tex_transfer_destroy(tc->tex_trans);
-   }
 
    FREE( tc );
 }
@@ -158,8 +131,6 @@ void
 sp_tile_cache_set_surface(struct softpipe_tile_cache *tc,
                           struct pipe_surface *ps)
 {
-   assert(!tc->texture);
-
    if (tc->transfer) {
       struct pipe_screen *screen = tc->transfer->texture->screen;
 
@@ -211,9 +182,6 @@ sp_tile_cache_map_transfers(struct softpipe_tile_cache *tc)
 {
    if (tc->transfer && !tc->transfer_map)
       tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
-
-   if (tc->tex_trans && !tc->tex_trans_map)
-      tc->tex_trans_map = tc->screen->transfer_map(tc->screen, tc->tex_trans);
 }
 
 
@@ -224,47 +192,6 @@ sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc)
       tc->screen->transfer_unmap(tc->screen, tc->transfer);
       tc->transfer_map = NULL;
    }
-
-   if (tc->tex_trans_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-      tc->tex_trans_map = NULL;
-   }
-}
-
-
-/**
- * Specify the texture to cache.
- */
-void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture)
-{
-   uint i;
-
-   assert(!tc->transfer);
-
-   pipe_texture_reference(&tc->texture, texture);
-
-   if (tc->tex_trans) {
-      struct pipe_screen *screen = tc->tex_trans->texture->screen;
-
-      if (tc->tex_trans_map) {
-         screen->transfer_unmap(screen, tc->tex_trans);
-         tc->tex_trans_map = NULL;
-      }
-
-      screen->tex_transfer_destroy(tc->tex_trans);
-      tc->tex_trans = NULL;
-   }
-
-   /* mark as entries as invalid/empty */
-   /* XXX we should try to avoid this when the teximage hasn't changed */
-   for (i = 0; i < NUM_ENTRIES; i++) {
-      tc->entries[i].x = -1;
-   }
-
-   tc->tex_face = -1; /* any invalid value here */
 }
 
 
@@ -308,7 +235,7 @@ clear_tile(struct softpipe_cached_tile *tile,
 
    switch (pf_get_size(format)) {
    case 1:
-      memset(tile->data.any, 0, TILE_SIZE * TILE_SIZE);
+      memset(tile->data.any, clear_value, TILE_SIZE * TILE_SIZE);
       break;
    case 2:
       if (clear_value == 0) {
@@ -344,8 +271,7 @@ clear_tile(struct softpipe_cached_tile *tile,
  * Actually clear the tiles which were flagged as being in a clear state.
  */
 static void
-sp_tile_cache_flush_clear(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc)
+sp_tile_cache_flush_clear(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    const uint w = tc->transfer->width;
@@ -359,13 +285,15 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
    /* push the tile to all positions marked as clear */
    for (y = 0; y < h; y += TILE_SIZE) {
       for (x = 0; x < w; x += TILE_SIZE) {
-         if (is_clear_flag_set(tc->clear_flags, x, y)) {
+         union tile_address addr = tile_address(x, y);
+
+         if (is_clear_flag_set(tc->clear_flags, addr)) {
             pipe_put_tile_raw(pt,
                               x, y, TILE_SIZE, TILE_SIZE,
                               tc->tile.data.color32, 0/*STRIDE*/);
 
             /* do this? */
-            clear_clear_flag(tc->clear_flags, x, y);
+            clear_clear_flag(tc->clear_flags, addr);
 
             numCleared++;
          }
@@ -382,8 +310,7 @@ sp_tile_cache_flush_clear(struct pipe_context *pipe,
  * any tiles "flagged" as cleared will be "really" cleared.
  */
 void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc)
+sp_flush_tile_cache(struct softpipe_tile_cache *tc)
 {
    struct pipe_transfer *pt = tc->transfer;
    int inuse = 0, pos;
@@ -392,33 +319,30 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
       /* caching a drawing transfer */
       for (pos = 0; pos < NUM_ENTRIES; pos++) {
          struct softpipe_cached_tile *tile = tc->entries + pos;
-         if (tile->x >= 0) {
+         if (!tile->addr.bits.invalid) {
             if (tc->depth_stencil) {
                pipe_put_tile_raw(pt,
-                                 tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                 tile->addr.bits.x * TILE_SIZE, 
+                                 tile->addr.bits.y * TILE_SIZE, 
+                                 TILE_SIZE, TILE_SIZE,
                                  tile->data.depth32, 0/*STRIDE*/);
             }
             else {
                pipe_put_tile_rgba(pt,
-                                  tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                                  tile->addr.bits.x * TILE_SIZE, 
+                                  tile->addr.bits.y * TILE_SIZE, 
+                                  TILE_SIZE, TILE_SIZE,
                                   (float *) tile->data.color);
             }
-            tile->x = tile->y = -1;  /* mark as empty */
+            tile->addr.bits.invalid = 1;  /* mark as empty */
             inuse++;
          }
       }
 
 #if TILE_CLEAR_OPTIMIZATION
-      sp_tile_cache_flush_clear(&softpipe->pipe, tc);
+      sp_tile_cache_flush_clear(tc);
 #endif
    }
-   else if (tc->texture) {
-      /* caching a texture, mark all entries as empty */
-      for (pos = 0; pos < NUM_ENTRIES; pos++) {
-         tc->entries[pos].x = -1;
-      }
-      tc->tex_face = -1;
-   }
 
 #if 0
    debug_printf("flushed tiles in use: %d\n", inuse);
@@ -431,40 +355,39 @@ sp_flush_tile_cache(struct softpipe_context *softpipe,
  * \param x, y  position of tile, in pixels
  */
 struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y)
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr )
 {
    struct pipe_transfer *pt = tc->transfer;
-
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-
+   
    /* cache pos/entry: */
-   const int pos = CACHE_POS(x, y);
+   const int pos = CACHE_POS(addr.bits.x,
+                             addr.bits.y);
    struct softpipe_cached_tile *tile = tc->entries + pos;
 
-   if (tile_x != tile->x ||
-       tile_y != tile->y) {
+   if (addr.value != tile->addr.value) {
 
-      if (tile->x != -1) {
+      if (tile->addr.bits.invalid == 0) {
          /* put dirty tile back in framebuffer */
          if (tc->depth_stencil) {
             pipe_put_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE,
+                              tile->addr.bits.y * TILE_SIZE,
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_put_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE,
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
 
-      tile->x = tile_x;
-      tile->y = tile_y;
+      tile->addr = addr;
 
-      if (is_clear_flag_set(tc->clear_flags, x, y)) {
+      if (is_clear_flag_set(tc->clear_flags, addr)) {
          /* don't get tile from framebuffer, just clear it */
          if (tc->depth_stencil) {
             clear_tile(tile, pt->format, tc->clear_val);
@@ -472,125 +395,33 @@ sp_get_cached_tile(struct softpipe_context *softpipe,
          else {
             clear_tile_rgba(tile, pt->format, tc->clear_color);
          }
-         clear_clear_flag(tc->clear_flags, x, y);
+         clear_clear_flag(tc->clear_flags, addr);
       }
       else {
          /* get new tile data from transfer */
          if (tc->depth_stencil) {
             pipe_get_tile_raw(pt,
-                              tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                              tile->addr.bits.x * TILE_SIZE, 
+                              tile->addr.bits.y * TILE_SIZE, 
+                              TILE_SIZE, TILE_SIZE,
                               tile->data.depth32, 0/*STRIDE*/);
          }
          else {
             pipe_get_tile_rgba(pt,
-                               tile->x, tile->y, TILE_SIZE, TILE_SIZE,
+                               tile->addr.bits.x * TILE_SIZE, 
+                               tile->addr.bits.y * TILE_SIZE,
+                               TILE_SIZE, TILE_SIZE,
                                (float *) tile->data.color);
          }
       }
    }
 
+   tc->last_tile = tile;
    return tile;
 }
 
 
-/**
- * Given the texture face, level, zslice, x and y values, compute
- * the cache entry position/index where we'd hope to find the
- * cached texture tile.
- * This is basically a direct-map cache.
- * XXX There's probably lots of ways in which we can improve this.
- */
-static INLINE uint
-tex_cache_pos(int x, int y, int z, int face, int level)
-{
-   uint entry = x + y * 9 + z * 3 + face + level * 7;
-   return entry % NUM_ENTRIES;
-}
-
-
-/**
- * Similar to sp_get_cached_tile() but for textures.
- * Tiles are read-only and indexed with more params.
- */
-const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *sp,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level)
-{
-   struct pipe_screen *screen = sp->pipe.screen;
-   /* tile pos in framebuffer: */
-   const int tile_x = x & ~(TILE_SIZE - 1);
-   const int tile_y = y & ~(TILE_SIZE - 1);
-   /* cache pos/entry: */
-   const uint pos = tex_cache_pos(x / TILE_SIZE, y / TILE_SIZE, z,
-                                  face, level);
-   struct softpipe_cached_tile *tile = tc->entries + pos;
 
-   if (tc->texture) {
-      struct softpipe_texture *spt = softpipe_texture(tc->texture);
-      if (spt->modified) {
-         /* texture was modified, invalidate all cached tiles */
-         uint p;
-         for (p = 0; p < NUM_ENTRIES; p++) {
-            tile = tc->entries + p;
-            tile->x = -1;
-         }
-         spt->modified = FALSE;
-      }
-   }
-
-   if (tile_x != tile->x ||
-       tile_y != tile->y ||
-       z != tile->z ||
-       face != tile->face ||
-       level != tile->level) {
-      /* cache miss */
-
-#if 0
-      printf("miss at %u  x=%d y=%d z=%d face=%d level=%d\n", pos,
-             x/TILE_SIZE, y/TILE_SIZE, z, face, level);
-#endif
-      /* check if we need to get a new transfer */
-      if (!tc->tex_trans ||
-          tc->tex_face != face ||
-          tc->tex_level != level ||
-          tc->tex_z != z) {
-         /* get new transfer (view into texture) */
-
-         if (tc->tex_trans) {
-            if (tc->tex_trans_map) {
-               tc->screen->transfer_unmap(tc->screen, tc->tex_trans);
-               tc->tex_trans_map = NULL;
-            }
-
-            screen->tex_transfer_destroy(tc->tex_trans);
-            tc->tex_trans = NULL;
-         }
-
-         tc->tex_trans = screen->get_tex_transfer(screen, tc->texture, face, level, z, 
-                                                  PIPE_TRANSFER_READ, 0, 0,
-                                                  tc->texture->width[level],
-                                                  tc->texture->height[level]);
-         tc->tex_trans_map = screen->transfer_map(screen, tc->tex_trans);
-
-         tc->tex_face = face;
-         tc->tex_level = level;
-         tc->tex_z = z;
-      }
-
-      /* get tile from the transfer (view into texture) */
-      pipe_get_tile_rgba(tc->tex_trans,
-                         tile_x, tile_y, TILE_SIZE, TILE_SIZE,
-                         (float *) tile->data.color);
-      tile->x = tile_x;
-      tile->y = tile_y;
-      tile->z = z;
-      tile->face = face;
-      tile->level = level;
-   }
-
-   return tile;
-}
 
 
 /**
@@ -621,6 +452,6 @@ sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
 
    for (pos = 0; pos < NUM_ENTRIES; pos++) {
       struct softpipe_cached_tile *tile = tc->entries + pos;
-      tile->x = tile->y = -1;
+      tile->addr.bits.invalid = 1;
    }
 }
diff --git a/src/gallium/drivers/softpipe/sp_tile_cache.h b/src/gallium/drivers/softpipe/sp_tile_cache.h
index 8f247d0e580..a12092702a6 100644
--- a/src/gallium/drivers/softpipe/sp_tile_cache.h
+++ b/src/gallium/drivers/softpipe/sp_tile_cache.h
@@ -34,7 +34,6 @@
 #include "pipe/p_compiler.h"
 
 
-struct softpipe_context;
 struct softpipe_tile_cache;
 
 
@@ -44,11 +43,23 @@ struct softpipe_tile_cache;
 #define TILE_SIZE 64
 
 
+/* If we need to support > 4096, just expand this to be a 64 bit
+ * union, or consider tiling in Z as well.
+ */
+union tile_address {
+   struct {
+      unsigned x:6;             /* tile column: 4096 / TILE_SIZE = 64 -> 6 bits */
+      unsigned y:6;             /* tile row: 4096 / TILE_SIZE = 64 -> 6 bits */
+      unsigned invalid:1;
+      unsigned pad:19;
+   } bits;
+   unsigned value;
+};
+
 
 struct softpipe_cached_tile
 {
-   int x, y;           /**< pos of tile in window coords */
-   int z, face, level; /**< Extra texture indexes */
+   union tile_address addr;
    union {
       float color[TILE_SIZE][TILE_SIZE][4];
       uint color32[TILE_SIZE][TILE_SIZE];
@@ -59,6 +70,32 @@ struct softpipe_cached_tile
    } data;
 };
 
+#define NUM_ENTRIES 50
+
+
+/** XXX move these */
+#define MAX_WIDTH 4096
+#define MAX_HEIGHT 4096
+
+
+struct softpipe_tile_cache
+{
+   struct pipe_screen *screen;
+   struct pipe_surface *surface;  /**< the surface we're caching */
+   struct pipe_transfer *transfer;
+   void *transfer_map;
+
+   struct softpipe_cached_tile entries[NUM_ENTRIES];
+   uint clear_flags[(MAX_WIDTH / TILE_SIZE) * (MAX_HEIGHT / TILE_SIZE) / 32];
+   float clear_color[4];  /**< for color bufs */
+   uint clear_val;        /**< for z+stencil, or packed color clear value */
+   boolean depth_stencil; /**< Is the surface a depth/stencil format? */
+
+   struct softpipe_cached_tile tile;  /**< scratch tile for clears */
+
+   struct softpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
+};
+
 
 extern struct softpipe_tile_cache *
 sp_create_tile_cache( struct pipe_screen *screen );
@@ -80,26 +117,45 @@ extern void
 sp_tile_cache_unmap_transfers(struct softpipe_tile_cache *tc);
 
 extern void
-sp_tile_cache_set_texture(struct pipe_context *pipe,
-                          struct softpipe_tile_cache *tc,
-                          struct pipe_texture *texture);
-
-extern void
-sp_flush_tile_cache(struct softpipe_context *softpipe,
-                    struct softpipe_tile_cache *tc);
+sp_flush_tile_cache(struct softpipe_tile_cache *tc);
 
 extern void
 sp_tile_cache_clear(struct softpipe_tile_cache *tc, const float *rgba,
                     uint clearValue);
 
 extern struct softpipe_cached_tile *
-sp_get_cached_tile(struct softpipe_context *softpipe,
-                   struct softpipe_tile_cache *tc, int x, int y);
+sp_find_cached_tile(struct softpipe_tile_cache *tc, 
+                    union tile_address addr );
+
+
+static INLINE union tile_address
+tile_address( unsigned x,
+              unsigned y )
+{
+   union tile_address addr;
+
+   addr.value = 0;
+   addr.bits.x = x / TILE_SIZE;
+   addr.bits.y = y / TILE_SIZE;
+      
+   return addr;
+}
+
+/* Quickly retrieve tile if it matches last lookup.
+ */
+static INLINE struct softpipe_cached_tile *
+sp_get_cached_tile(struct softpipe_tile_cache *tc, 
+                   int x, int y )
+{
+   union tile_address addr = tile_address( x, y );
+
+   if (tc->last_tile->addr.value == addr.value)
+      return tc->last_tile;
+
+   return sp_find_cached_tile( tc, addr );
+}
+
 
-extern const struct softpipe_cached_tile *
-sp_get_cached_tile_tex(struct softpipe_context *softpipe,
-                       struct softpipe_tile_cache *tc, int x, int y, int z,
-                       int face, int level);
 
 
 #endif /* SP_TILE_CACHE_H */
diff --git a/src/gallium/drivers/softpipe/sp_video_context.c b/src/gallium/drivers/softpipe/sp_video_context.c
new file mode 100644
index 00000000000..cae2d3efc58
--- /dev/null
+++ b/src/gallium/drivers/softpipe/sp_video_context.c
@@ -0,0 +1,304 @@
+/**************************************************************************
+ * 
+ * Copyright 2009 Younes Manton.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "sp_video_context.h"
+#include <pipe/p_inlines.h>
+#include <util/u_memory.h>
+#include "softpipe/sp_winsys.h"
+#include "softpipe/sp_texture.h"
+
+static void
+sp_mpeg12_destroy(struct pipe_video_context *vpipe)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+	
+   /* Unbind shaders before teardown; softpipe_delete_fs_state() apparently asserts otherwise */
+   ctx->pipe->bind_vs_state(ctx->pipe, NULL);
+   ctx->pipe->bind_fs_state(ctx->pipe, NULL);
+
+   ctx->pipe->delete_blend_state(ctx->pipe, ctx->blend);
+   ctx->pipe->delete_rasterizer_state(ctx->pipe, ctx->rast);
+   ctx->pipe->delete_depth_stencil_alpha_state(ctx->pipe, ctx->dsa);
+
+   pipe_video_surface_reference(&ctx->decode_target, NULL);
+   vl_compositor_cleanup(&ctx->compositor);
+   vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+   ctx->pipe->destroy(ctx->pipe);
+
+   FREE(ctx);
+}
+
+static void
+sp_mpeg12_decode_macroblocks(struct pipe_video_context *vpipe,
+                             struct pipe_video_surface *past,
+                             struct pipe_video_surface *future,
+                             unsigned num_macroblocks,
+                             struct pipe_macroblock *macroblocks,
+                             struct pipe_fence_handle **fence)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+   struct pipe_mpeg12_macroblock *mpeg12_macroblocks = (struct pipe_mpeg12_macroblock*)macroblocks;
+
+   assert(vpipe);
+   assert(num_macroblocks);
+   assert(macroblocks);
+   assert(macroblocks->codec == PIPE_VIDEO_CODEC_MPEG12);
+   assert(ctx->decode_target);
+
+   vl_mpeg12_mc_renderer_render_macroblocks(&ctx->mc_renderer,
+                                            softpipe_video_surface(ctx->decode_target)->tex,
+                                            past ? softpipe_video_surface(past)->tex : NULL,
+                                            future ? softpipe_video_surface(future)->tex : NULL,
+                                            num_macroblocks, mpeg12_macroblocks, fence);
+}
+
+static void
+sp_mpeg12_clear_surface(struct pipe_video_context *vpipe,
+                        unsigned x, unsigned y,
+                        unsigned width, unsigned height,
+                        unsigned value,
+                        struct pipe_surface *surface)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+   assert(surface);
+
+   ctx->pipe->surface_fill(ctx->pipe, surface, x, y, width, height, value);
+}
+
+static void
+sp_mpeg12_render_picture(struct pipe_video_context     *vpipe,
+                         /*struct pipe_surface         *background,
+                         struct pipe_video_rect        *background_area,*/
+                         struct pipe_video_surface     *src_surface,
+                         enum pipe_mpeg12_picture_type picture_type,
+                         /*unsigned                    num_past_surfaces,
+                         struct pipe_video_surface     *past_surfaces,
+                         unsigned                      num_future_surfaces,
+                         struct pipe_video_surface     *future_surfaces,*/
+                         struct pipe_video_rect        *src_area,
+                         struct pipe_surface           *dst_surface,
+                         struct pipe_video_rect        *dst_area,
+                         /*unsigned                      num_layers,
+                         struct pipe_surface           *layers,
+                         struct pipe_video_rect        *layer_src_areas,
+                         struct pipe_video_rect        *layer_dst_areas*/
+                         struct pipe_fence_handle      **fence)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+	
+   assert(vpipe);
+   assert(src_surface);
+   assert(src_area);
+   assert(dst_surface);
+   assert(dst_area);
+	
+   vl_compositor_render(&ctx->compositor, softpipe_video_surface(src_surface)->tex,
+                        picture_type, src_area, dst_surface->texture, dst_area, fence);
+}
+
+static void
+sp_mpeg12_set_decode_target(struct pipe_video_context *vpipe,
+                            struct pipe_video_surface *dt)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+   assert(dt);
+
+   pipe_video_surface_reference(&ctx->decode_target, dt);
+}
+
+static void sp_mpeg12_set_csc_matrix(struct pipe_video_context *vpipe, const float *mat)
+{
+   struct sp_mpeg12_context *ctx = (struct sp_mpeg12_context*)vpipe;
+
+   assert(vpipe);
+
+   vl_compositor_set_csc_matrix(&ctx->compositor, mat);
+}
+
+static bool
+init_pipe_state(struct sp_mpeg12_context *ctx)
+{
+   struct pipe_rasterizer_state rast = {0}; /* zeroed: not every member is assigned below */
+   struct pipe_blend_state blend = {0};     /* zeroed for the same reason */
+   struct pipe_depth_stencil_alpha_state dsa = {0};
+   unsigned i;
+
+   assert(ctx);
+   /* Create and bind the fixed rasterizer, blend and depth/stencil/alpha state. */
+   rast.flatshade = 1;
+   rast.flatshade_first = 0;
+   rast.light_twoside = 0;
+   rast.front_winding = PIPE_WINDING_CCW;
+   rast.cull_mode = PIPE_WINDING_CW;
+   rast.fill_cw = PIPE_POLYGON_MODE_FILL;
+   rast.fill_ccw = PIPE_POLYGON_MODE_FILL;
+   rast.offset_cw = 0;
+   rast.offset_ccw = 0;
+   rast.scissor = 0;
+   rast.poly_smooth = 0;
+   rast.poly_stipple_enable = 0;
+   rast.point_sprite = 0;
+   rast.point_size_per_vertex = 0;
+   rast.multisample = 0;
+   rast.line_smooth = 0;
+   rast.line_stipple_enable = 0;
+   rast.line_stipple_factor = 0;
+   rast.line_stipple_pattern = 0;
+   rast.line_last_pixel = 0;
+   rast.bypass_vs_clip_and_viewport = 0;
+   rast.line_width = 1;
+   rast.point_smooth = 0;
+   rast.point_size = 1;
+   rast.offset_units = 1;
+   rast.offset_scale = 1;
+   /* rast.sprite_coord_mode[] is not assigned; it stays zero from the initializer. */
+   ctx->rast = ctx->pipe->create_rasterizer_state(ctx->pipe, &rast);
+   ctx->pipe->bind_rasterizer_state(ctx->pipe, ctx->rast);
+
+   blend.blend_enable = 0;
+   blend.rgb_func = PIPE_BLEND_ADD;
+   blend.rgb_src_factor = PIPE_BLENDFACTOR_ONE;
+   blend.rgb_dst_factor = PIPE_BLENDFACTOR_ONE;
+   blend.alpha_func = PIPE_BLEND_ADD;
+   blend.alpha_src_factor = PIPE_BLENDFACTOR_ONE;
+   blend.alpha_dst_factor = PIPE_BLENDFACTOR_ONE;
+   blend.logicop_enable = 0;
+   blend.logicop_func = PIPE_LOGICOP_CLEAR;
+   /* Needed to allow color writes to FB, even if blending disabled */
+   blend.colormask = PIPE_MASK_RGBA;
+   blend.dither = 0;
+   ctx->blend = ctx->pipe->create_blend_state(ctx->pipe, &blend);
+   ctx->pipe->bind_blend_state(ctx->pipe, ctx->blend);
+
+   dsa.depth.enabled = 0;
+   dsa.depth.writemask = 0;
+   dsa.depth.func = PIPE_FUNC_ALWAYS;
+   for (i = 0; i < 2; ++i) { /* same disabled setup for both stencil[] entries */
+      dsa.stencil[i].enabled = 0;
+      dsa.stencil[i].func = PIPE_FUNC_ALWAYS;
+      dsa.stencil[i].fail_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].zpass_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].zfail_op = PIPE_STENCIL_OP_KEEP;
+      dsa.stencil[i].ref_value = 0;
+      dsa.stencil[i].valuemask = 0;
+      dsa.stencil[i].writemask = 0;
+   }
+   dsa.alpha.enabled = 0;
+   dsa.alpha.func = PIPE_FUNC_ALWAYS;
+   dsa.alpha.ref_value = 0;
+   ctx->dsa = ctx->pipe->create_depth_stencil_alpha_state(ctx->pipe, &dsa);
+   ctx->pipe->bind_depth_stencil_alpha_state(ctx->pipe, ctx->dsa);
+
+   return true; /* NOTE(review): create_*_state results are not checked; always reports success */
+}
+
+static struct pipe_video_context *
+sp_mpeg12_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                 enum pipe_video_chroma_format chroma_format,
+                 unsigned width, unsigned height)
+{
+   struct sp_mpeg12_context *ctx;
+   /* Purpose: allocate the context, fill in its base, then create its helpers. */
+   assert(u_reduce_video_profile(profile) == PIPE_VIDEO_CODEC_MPEG12);
+
+   ctx = CALLOC_STRUCT(sp_mpeg12_context);
+
+   if (!ctx)
+      return NULL;
+
+   ctx->base.profile = profile;
+   ctx->base.chroma_format = chroma_format;
+   ctx->base.width = width;
+   ctx->base.height = height;
+   /* Entry points that callers reach through the base pipe_video_context. */
+   ctx->base.screen = screen;
+   ctx->base.destroy = sp_mpeg12_destroy;
+   ctx->base.decode_macroblocks = sp_mpeg12_decode_macroblocks;
+   ctx->base.clear_surface = sp_mpeg12_clear_surface;
+   ctx->base.render_picture = sp_mpeg12_render_picture;
+   ctx->base.set_decode_target = sp_mpeg12_set_decode_target;
+   ctx->base.set_csc_matrix = sp_mpeg12_set_csc_matrix;
+   /* From here on, each failure path tears down what was already created. */
+   ctx->pipe = softpipe_create(screen);
+   if (!ctx->pipe) {
+      FREE(ctx);
+      return NULL;
+   }
+
+   /* TODO: Use slice buffering for softpipe when implemented, no advantage to buffering an entire picture */
+   if (!vl_mpeg12_mc_renderer_init(&ctx->mc_renderer, ctx->pipe,
+                                   width, height, chroma_format,
+                                   VL_MPEG12_MC_RENDERER_BUFFER_PICTURE,
+                                   /* TODO: Use XFER_NONE when implemented */
+                                   VL_MPEG12_MC_RENDERER_EMPTY_BLOCK_XFER_ONE,
+                                   true)) {
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+	
+   if (!vl_compositor_init(&ctx->compositor, ctx->pipe)) {
+      vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+	
+   if (!init_pipe_state(ctx)) {
+      vl_compositor_cleanup(&ctx->compositor);
+      vl_mpeg12_mc_renderer_cleanup(&ctx->mc_renderer);
+      ctx->pipe->destroy(ctx->pipe);
+      FREE(ctx);
+      return NULL;
+   }
+
+   return &ctx->base; /* success: hand back the embedded base; every failure above returns NULL */
+}
+
+struct pipe_video_context *
+sp_video_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height)
+{
+   assert(screen);
+   assert(width && height);
+   /* Purpose: pick the context constructor that matches the profile's codec. */
+   switch (u_reduce_video_profile(profile)) {
+      case PIPE_VIDEO_CODEC_MPEG12:
+         return sp_mpeg12_create(screen, profile,
+                                 chroma_format,
+                                 width, height);
+      default:
+         return NULL; /* no other codec is handled here */
+   }
+}
diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_video_context.h
index dfa7ff3b1d1..ccbd1ffe4c8 100644
--- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c
+++ b/src/gallium/drivers/softpipe/sp_video_context.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2009 Younes Manton.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,61 +25,33 @@
  * 
  **************************************************************************/
 
+#ifndef SP_VIDEO_CONTEXT_H
+#define SP_VIDEO_CONTEXT_H
 
-/**
- * \brief  Quad occlusion counter stage
- * \author  Brian Paul
- */
+#include <pipe/p_video_context.h>
+#include <vl/vl_mpeg12_mc_renderer.h>
+#include <vl/vl_compositor.h>
 
+struct pipe_screen;
+struct pipe_context;
+struct pipe_video_surface;
 
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "sp_context.h"
-#include "sp_quad.h"
-#include "sp_surface.h"
-#include "sp_quad_pipe.h"
-
-static unsigned count_bits( unsigned val )
-{
-   unsigned i;
-
-   for (i = 0; val ; val >>= 1)
-      i += (val & 1);
-
-   return i;
-}
-
-static void
-occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad)
-{
-   struct softpipe_context *softpipe = qs->softpipe;
-
-   softpipe->occlusion_count += count_bits(quad->inout.mask);
-
-   qs->next->run(qs->next, quad);
-}
-
-
-static void occlusion_begin(struct quad_stage *qs)
+struct sp_mpeg12_context /* state behind the pipe_video_context returned by sp_video_create() */
 {
-   qs->next->begin(qs->next);
-}
-
-
-static void occlusion_destroy(struct quad_stage *qs)
-{
-   FREE( qs );
-}
-
-
-struct quad_stage *sp_quad_occlusion_stage( struct softpipe_context *softpipe )
-{
-   struct quad_stage *stage = CALLOC_STRUCT(quad_stage);
-
-   stage->softpipe = softpipe;
-   stage->begin = occlusion_begin;
-   stage->run = occlusion_count_quad;
-   stage->destroy = occlusion_destroy;
-
-   return stage;
-}
+   struct pipe_video_context base; /* keep first: sp_mpeg12_* callbacks cast base pointers to this struct */
+   struct pipe_context *pipe; /* obtained from softpipe_create() in sp_mpeg12_create() */
+   struct pipe_video_surface *decode_target; /* updated by sp_mpeg12_set_decode_target() */
+   struct vl_mpeg12_mc_renderer mc_renderer; /* set up by vl_mpeg12_mc_renderer_init() */
+   struct vl_compositor compositor; /* set up by vl_compositor_init() */
+
+   void *rast; /* result of create_rasterizer_state() in init_pipe_state() */
+   void *dsa; /* result of create_depth_stencil_alpha_state() in init_pipe_state() */
+   void *blend; /* result of create_blend_state() in init_pipe_state() */
+};
+
+struct pipe_video_context *
+sp_video_create(struct pipe_screen *screen, enum pipe_video_profile profile,
+                enum pipe_video_chroma_format chroma_format,
+                unsigned width, unsigned height); /* NULL if the codec is unhandled or setup fails */
+
+#endif /* SP_VIDEO_CONTEXT_H */
diff --git a/src/gallium/drivers/trace/tr_context.c b/src/gallium/drivers/trace/tr_context.c
index ae0af4d0557..bf470b46ae1 100644
--- a/src/gallium/drivers/trace/tr_context.c
+++ b/src/gallium/drivers/trace/tr_context.c
@@ -125,11 +125,11 @@ trace_context_draw_block(struct trace_context *tr_ctx, int flag)
    } else if ((tr_ctx->draw_rule.blocker & flag) &&
               (tr_ctx->draw_blocker & 4)) {
       boolean block = FALSE;
-      debug_printf("%s (%lu %lu) (%lu %lu) (%lu %u) (%lu %u)\n", __FUNCTION__,
-					tr_ctx->draw_rule.fs, tr_ctx->curr.fs,
-					tr_ctx->draw_rule.vs, tr_ctx->curr.vs,
-					tr_ctx->draw_rule.surf, 0,
-					tr_ctx->draw_rule.tex, 0);
+      debug_printf("%s (%p %p) (%p %p) (%p %u) (%p %u)\n", __FUNCTION__,
+                   (void *) tr_ctx->draw_rule.fs, (void *) tr_ctx->curr.fs,
+                   (void *) tr_ctx->draw_rule.vs, (void *) tr_ctx->curr.vs,
+                   (void *) tr_ctx->draw_rule.surf, 0,
+                   (void *) tr_ctx->draw_rule.tex, 0);
       if (tr_ctx->draw_rule.fs &&
           tr_ctx->draw_rule.fs == tr_ctx->curr.fs)
          block = TRUE;
diff --git a/src/gallium/drivers/trace/tr_rbug.c b/src/gallium/drivers/trace/tr_rbug.c
index e85ac15edca..81e0a6f3b00 100644
--- a/src/gallium/drivers/trace/tr_rbug.c
+++ b/src/gallium/drivers/trace/tr_rbug.c
@@ -44,7 +44,7 @@
 
 #if defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 #  define sleep Sleep
-#elif defined(PIPE_OS_LINUX)
+#elif defined(PIPE_OS_LINUX) || defined(PIPE_OS_BSD)
 void usleep(int);
 #  define sleep usleep
 #else
diff --git a/src/gallium/drivers/trace/tr_screen.c b/src/gallium/drivers/trace/tr_screen.c
index 26f1c04594f..ab605c7fc87 100644
--- a/src/gallium/drivers/trace/tr_screen.c
+++ b/src/gallium/drivers/trace/tr_screen.c
@@ -403,7 +403,7 @@ trace_screen_transfer_map(struct pipe_screen *_screen,
 
    map = screen->transfer_map(screen, transfer);
    if(map) {
-      if(transfer->usage != PIPE_TRANSFER_READ) {
+      if(transfer->usage & PIPE_TRANSFER_WRITE) {
          assert(!tr_trans->map);
          tr_trans->map = map;
       }