136 files changed, 4737 insertions, 2650 deletions
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
index e8467b2ae3c..80d7200ca63 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c
@@ -63,7 +63,8 @@ struct vcache_frontend {
    unsigned opt;
 };
 
-static void vcache_flush( struct vcache_frontend *vcache )
+static INLINE void 
+vcache_flush( struct vcache_frontend *vcache )
 {
    if (vcache->middle_prim != vcache->output_prim) {
       vcache->middle_prim = vcache->output_prim;
@@ -86,7 +87,8 @@ static void vcache_flush( struct vcache_frontend *vcache )
    vcache->draw_count = 0;
 }
 
-static void vcache_check_flush( struct vcache_frontend *vcache )
+static INLINE void 
+vcache_check_flush( struct vcache_frontend *vcache )
 {
    if ( vcache->draw_count + 6 >= DRAW_MAX ||
         vcache->fetch_count + 4 >= FETCH_MAX )
@@ -96,9 +98,10 @@ static void vcache_check_flush( struct vcache_frontend *vcache )
 }
 
 
-static INLINE void vcache_elt( struct vcache_frontend *vcache,
-                               unsigned felt,
-                               ushort flags )
+static INLINE void 
+vcache_elt( struct vcache_frontend *vcache,
+            unsigned felt,
+            ushort flags )
 {
    unsigned idx = felt % CACHE_MAX;
 
@@ -115,10 +118,11 @@ static INLINE void vcache_elt( struct vcache_frontend *vcache,
 
 
                    
-static void vcache_triangle( struct vcache_frontend *vcache,
-                             unsigned i0,
-                             unsigned i1,
-                             unsigned i2 )
+static INLINE void 
+vcache_triangle( struct vcache_frontend *vcache,
+                 unsigned i0,
+                 unsigned i1,
+                 unsigned i2 )
 {
    vcache_elt(vcache, i0, 0);
    vcache_elt(vcache, i1, 0);
@@ -127,11 +131,12 @@ static void vcache_triangle( struct vcache_frontend *vcache,
 }
 
 			  
-static void vcache_triangle_flags( struct vcache_frontend *vcache,
-                                   ushort flags,
-                                   unsigned i0,
-                                   unsigned i1,
-                                   unsigned i2 )
+static INLINE void 
+vcache_triangle_flags( struct vcache_frontend *vcache,
+                       ushort flags,
+                       unsigned i0,
+                       unsigned i1,
+                       unsigned i2 )
 {
    vcache_elt(vcache, i0, flags);
    vcache_elt(vcache, i1, 0);
@@ -139,9 +144,10 @@ static void vcache_triangle_flags( struct vcache_frontend *vcache,
    vcache_check_flush(vcache);
 }
 
-static void vcache_line( struct vcache_frontend *vcache,
-                         unsigned i0,
-                         unsigned i1 )
+static INLINE void 
+vcache_line( struct vcache_frontend *vcache,
+             unsigned i0,
+             unsigned i1 )
 {
    vcache_elt(vcache, i0, 0);
    vcache_elt(vcache, i1, 0);
@@ -149,10 +155,11 @@ static void vcache_line( struct vcache_frontend *vcache,
 }
 
 
-static void vcache_line_flags( struct vcache_frontend *vcache,
-                               ushort flags,
-                               unsigned i0,
-                               unsigned i1 )
+static INLINE void 
+vcache_line_flags( struct vcache_frontend *vcache,
+                   ushort flags,
+                   unsigned i0,
+                   unsigned i1 )
 {
    vcache_elt(vcache, i0, flags);
    vcache_elt(vcache, i1, 0);
@@ -160,28 +167,31 @@ static void vcache_line_flags( struct vcache_frontend *vcache,
 }
 
 
-static void vcache_point( struct vcache_frontend *vcache,
-                          unsigned i0 )
+static INLINE void 
+vcache_point( struct vcache_frontend *vcache,
+              unsigned i0 )
 {
    vcache_elt(vcache, i0, 0);
    vcache_check_flush(vcache);
 }
 
-static void vcache_quad( struct vcache_frontend *vcache,
-                         unsigned i0,
-                         unsigned i1,
-                         unsigned i2,
-                         unsigned i3 )
+static INLINE void 
+vcache_quad( struct vcache_frontend *vcache,
+             unsigned i0,
+             unsigned i1,
+             unsigned i2,
+             unsigned i3 )
 {
    vcache_triangle( vcache, i0, i1, i3 );
    vcache_triangle( vcache, i1, i2, i3 );
 }
 
-static void vcache_ef_quad( struct vcache_frontend *vcache,
-                            unsigned i0,
-                            unsigned i1,
-                            unsigned i2,
-                            unsigned i3 )
+static INLINE void 
+vcache_ef_quad( struct vcache_frontend *vcache,
+                unsigned i0,
+                unsigned i1,
+                unsigned i2,
+                unsigned i3 )
 {
    vcache_triangle_flags( vcache,
                           ( DRAW_PIPE_RESET_STIPPLE |
@@ -213,10 +223,11 @@ static void vcache_ef_quad( struct vcache_frontend *vcache,
 #define FUNC vcache_run
 #include "draw_pt_vcache_tmp.h"
 
-static void rebase_uint_elts( const unsigned *src,
-                              unsigned count,
-                              int delta,
-                              ushort *dest )
+static INLINE void 
+rebase_uint_elts( const unsigned *src,
+                  unsigned count,
+                  int delta,
+                  ushort *dest )
 {
    unsigned i;
 
@@ -224,9 +235,10 @@ static void rebase_uint_elts( const unsigned *src,
       dest[i] = (ushort)(src[i] + delta);
 }
 
-static void rebase_ushort_elts( const ushort *src,
-                                unsigned count,
-                                int delta,
+static INLINE void 
+rebase_ushort_elts( const ushort *src,
+                    unsigned count,
+                    int delta,
                                 ushort *dest )
 {
    unsigned i;
@@ -235,10 +247,11 @@ static void rebase_ushort_elts( const ushort *src,
       dest[i] = (ushort)(src[i] + delta);
 }
 
-static void rebase_ubyte_elts( const ubyte *src,
-                               unsigned count,
-                               int delta,
-                               ushort *dest )
+static INLINE void 
+rebase_ubyte_elts( const ubyte *src,
+                   unsigned count,
+                   int delta,
+                   ushort *dest )
 {
    unsigned i;
 
@@ -248,9 +261,10 @@ static void rebase_ubyte_elts( const ubyte *src,
 
 
 
-static void translate_uint_elts( const unsigned *src,
-                                 unsigned count,
-                                 ushort *dest )
+static INLINE void 
+translate_uint_elts( const unsigned *src,
+                     unsigned count,
+                     ushort *dest )
 {
    unsigned i;
 
@@ -258,9 +272,10 @@ static void translate_uint_elts( const unsigned *src,
       dest[i] = (ushort)(src[i]);
 }
 
-static void translate_ushort_elts( const ushort *src,
-                                   unsigned count,
-                                   ushort *dest )
+static INLINE void 
+translate_ushort_elts( const ushort *src,
+                       unsigned count,
+                       ushort *dest )
 {
    unsigned i;
 
@@ -268,9 +283,10 @@ static void translate_ushort_elts( const ushort *src,
       dest[i] = (ushort)(src[i]);
 }
 
-static void translate_ubyte_elts( const ubyte *src,
-                                  unsigned count,
-                                  ushort *dest )
+static INLINE void 
+translate_ubyte_elts( const ubyte *src,
+                      unsigned count,
+                      ushort *dest )
 {
    unsigned i;
 
@@ -282,7 +298,8 @@ static void translate_ubyte_elts( const ubyte *src,
 
 
 #if 0
-static enum pipe_format format_from_get_elt( pt_elt_func get_elt )
+static INLINE enum pipe_format 
+format_from_get_elt( pt_elt_func get_elt )
 {
    switch (draw->pt.user.eltSize) {
    case 1: return PIPE_FORMAT_R8_UNORM;
@@ -293,10 +310,11 @@ static enum pipe_format format_from_get_elt( pt_elt_func get_elt )
 }
 #endif
 
-static void vcache_check_run( struct draw_pt_front_end *frontend, 
-                              pt_elt_func get_elt,
-                              const void *elts,
-                              unsigned draw_count )
+static INLINE void 
+vcache_check_run( struct draw_pt_front_end *frontend, 
+                  pt_elt_func get_elt,
+                  const void *elts,
+                  unsigned draw_count )
 {
    struct vcache_frontend *vcache = (struct vcache_frontend *)frontend; 
    struct draw_context *draw = vcache->draw;
@@ -416,10 +434,11 @@ static void vcache_check_run( struct draw_pt_front_end *frontend,
 
 
 
-static void vcache_prepare( struct draw_pt_front_end *frontend,
-                            unsigned prim,
-                            struct draw_pt_middle_end *middle,
-			    unsigned opt )
+static void 
+vcache_prepare( struct draw_pt_front_end *frontend,
+                unsigned prim,
+                struct draw_pt_middle_end *middle,
+                unsigned opt )
 {
    struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
 
@@ -448,14 +467,16 @@ static void vcache_prepare( struct draw_pt_front_end *frontend,
 
 
 
-static void vcache_finish( struct draw_pt_front_end *frontend )
+static void 
+vcache_finish( struct draw_pt_front_end *frontend )
 {
    struct vcache_frontend *vcache = (struct vcache_frontend *)frontend;
    vcache->middle->finish( vcache->middle );
    vcache->middle = NULL;
 }
 
-static void vcache_destroy( struct draw_pt_front_end *frontend )
+static void 
+vcache_destroy( struct draw_pt_front_end *frontend )
 {
    FREE(frontend);
 }
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index 760fcb389fc..a556477a767 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -32,6 +32,7 @@
 #include "util/u_memory.h"
 #include "util/u_math.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_debug.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi/tgsi_exec.h"
@@ -119,21 +120,26 @@ static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
 
    switch (file) {
    case TGSI_FILE_INPUT:
+      assert(idx < MAX_INPUTS);
       return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
 
    case TGSI_FILE_OUTPUT:
       return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
 
    case TGSI_FILE_TEMPORARY:
+      assert(idx < MAX_TEMPS);
       return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
 
    case AOS_FILE_INTERNAL:
+      assert(idx < MAX_INTERNALS);
       return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
 
    case TGSI_FILE_IMMEDIATE: 
+      assert(idx < MAX_IMMEDIATES);  /* just a sanity check */
       return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
 
    case TGSI_FILE_CONSTANT: 
+      assert(idx < MAX_CONSTANTS);  /* just a sanity check */
       return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
 
    default:
@@ -2108,6 +2114,11 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
                          start,
                          count,
                          output_buffer );
+
+   /* Sanity spot checks to make sure we didn't trash our constants */
+   assert(machine->internal[IMM_ONES][0] == 1.0f);
+   assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
+   assert(machine->internal[IMM_NEGS][0] == -1.0f);
 }
 
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 64e021ff6b7..7fe6f79db0d 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -51,11 +51,11 @@ struct x86_function;
 #define W    3
 
 #define MAX_INPUTS     PIPE_MAX_ATTRIBS
-#define MAX_OUTPUTS    PIPE_MAX_ATTRIBS
-#define MAX_TEMPS      PIPE_MAX_ATTRIBS /* say */
-#define MAX_CONSTANTS  PIPE_MAX_ATTRIBS /* say */
-#define MAX_IMMEDIATES PIPE_MAX_ATTRIBS /* say */
-#define MAX_INTERNALS  8
+#define MAX_OUTPUTS    PIPE_MAX_SHADER_OUTPUTS
+#define MAX_TEMPS      TGSI_EXEC_NUM_TEMPS
+#define MAX_CONSTANTS  1024  /** only used for sanity checking */
+#define MAX_IMMEDIATES 1024  /** only used for sanity checking */
+#define MAX_INTERNALS  8     /** see IMM_x values below */
 
 #define AOS_FILE_INTERNAL TGSI_FILE_COUNT
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index ab3c5b94a50..26297c74f82 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -55,9 +55,13 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
 				 struct x86_reg src_ptr )
 {
    sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
+   /* data = z ? ? ? */
    sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
+   /* data = z ? 0 1 */
    sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) );
+   /* data = ? 0 z 1 */
    sse_movlps(cp->func, data, src_ptr);
+   /* data = x y z 1 */
 }
 
 static void emit_load_R32G32( struct aos_compilation *cp, 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
index 410d336fefc..17b27810520 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -69,7 +69,7 @@
 
 struct fenced_buffer_list
 {
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    struct pipe_winsys *winsys;
    
@@ -240,7 +240,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    struct fenced_buffer *fenced_buf = fenced_buffer(buf);   
    struct fenced_buffer_list *fenced_list = fenced_buf->list;
 
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
    assert(fenced_buf->base.base.refcount == 0);
    if (fenced_buf->fence) {
       struct pipe_winsys *winsys = fenced_list->winsys;
@@ -263,7 +263,7 @@ fenced_buffer_destroy(struct pb_buffer *buf)
    else {
       _fenced_buffer_destroy(fenced_buf);
    }
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
 }
 
 
@@ -396,7 +396,7 @@ buffer_fence(struct pb_buffer *buf,
       return;
    }
    
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
    if (fenced_buf->fence)
       _fenced_buffer_remove(fenced_list, fenced_buf);
    if (fence) {
@@ -404,7 +404,7 @@ buffer_fence(struct pb_buffer *buf,
       fenced_buf->flags |= flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE;
       _fenced_buffer_add(fenced_buf);
    }
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
 }
 
 
@@ -423,7 +423,7 @@ fenced_buffer_list_create(struct pipe_winsys *winsys)
 
    fenced_list->numDelayed = 0;
    
-   _glthread_INIT_MUTEX(fenced_list->mutex);
+   pipe_mutex_init(fenced_list->mutex);
 
    return fenced_list;
 }
@@ -433,28 +433,28 @@ void
 fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
                               int wait)
 {
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
    _fenced_buffer_list_check_free(fenced_list, wait);
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
 }
 
 
 void
 fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
 {
-   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_lock(fenced_list->mutex);
 
    /* Wait on outstanding fences */
    while (fenced_list->numDelayed) {
-      _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+      pipe_mutex_unlock(fenced_list->mutex);
 #if defined(PIPE_OS_LINUX)
       sched_yield();
 #endif
       _fenced_buffer_list_check_free(fenced_list, 1);
-      _glthread_LOCK_MUTEX(fenced_list->mutex);
+      pipe_mutex_lock(fenced_list->mutex);
    }
 
-   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   pipe_mutex_unlock(fenced_list->mutex);
    
    FREE(fenced_list);
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index b914c2d0fea..1ec422fb19e 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -79,7 +79,7 @@ struct pb_cache_manager
    struct pb_manager *provider;
    unsigned usecs;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    struct list_head delayed;
    size_t numDelayed;
@@ -153,7 +153,7 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf)
    struct pb_cache_buffer *buf = pb_cache_buffer(_buf);   
    struct pb_cache_manager *mgr = buf->mgr;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
    assert(buf->base.base.refcount == 0);
    
    _pb_cache_buffer_list_check_free(mgr);
@@ -162,7 +162,7 @@ pb_cache_buffer_destroy(struct pb_buffer *_buf)
    util_time_add(&buf->start, mgr->usecs, &buf->end);
    LIST_ADDTAIL(&buf->head, &mgr->delayed);
    ++mgr->numDelayed;
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 }
 
 
@@ -235,7 +235,7 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
    struct list_head *curr, *next;
    struct util_time now;
    
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
 
    buf = NULL;
    curr = mgr->delayed.next;
@@ -249,27 +249,35 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
 	 buf = curr_buf;
       else if(util_time_timeout(&curr_buf->start, &curr_buf->end, &now))
 	 _pb_cache_buffer_destroy(curr_buf);
+      else
+         /* This buffer (and all hereafter) are still hot in cache */
+         break;
       curr = next; 
       next = curr->next;
    }
 
    /* keep searching in the hot buffers */
-   while(!buf && curr != &mgr->delayed) {
-      curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
-      if(pb_cache_is_buffer_compat(curr_buf, size, desc))
-	 buf = curr_buf;
-      curr = next; 
-      next = curr->next;
+   if(!buf) {
+      while(curr != &mgr->delayed) {
+         curr_buf = LIST_ENTRY(struct pb_cache_buffer, curr, head);
+         if(pb_cache_is_buffer_compat(curr_buf, size, desc)) {
+            buf = curr_buf;
+            break;
+         }
+         /* no need to check the timeout here */
+         curr = next;
+         next = curr->next;
+      }
    }
    
    if(buf) {
       LIST_DEL(&buf->head);
-      _glthread_UNLOCK_MUTEX(mgr->mutex);
+      pipe_mutex_unlock(mgr->mutex);
       ++buf->base.base.refcount;
       return &buf->base;
    }
    
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 
    buf = CALLOC_STRUCT(pb_cache_buffer);
    if(!buf)
@@ -305,7 +313,7 @@ pb_cache_flush(struct pb_manager *_mgr)
    struct list_head *curr, *next;
    struct pb_cache_buffer *buf;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
    curr = mgr->delayed.next;
    next = curr->next;
    while(curr != &mgr->delayed) {
@@ -314,7 +322,7 @@ pb_cache_flush(struct pb_manager *_mgr)
       curr = next; 
       next = curr->next;
    }
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 }
 
 
@@ -345,7 +353,7 @@ pb_cache_manager_create(struct pb_manager *provider,
    mgr->usecs = usecs;
    LIST_INITHEAD(&mgr->delayed);
    mgr->numDelayed = 0;
-   _glthread_INIT_MUTEX(mgr->mutex);
+   pipe_mutex_init(mgr->mutex);
       
    return &mgr->base;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 4cb8c3bb557..5f1ed3e5a8a 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -129,7 +129,7 @@ check_random_pattern(const uint8_t *dst, size_t size,
    for(i = 0; i < size; ++i) {
       if(*dst++ != random_pattern[i % sizeof(random_pattern)]) {
          *min_ofs = MIN2(*min_ofs, i);
-         *max_ofs = MIN2(*max_ofs, i);
+         *max_ofs = MAX2(*max_ofs, i);
 	 result = FALSE;
       }
    }
@@ -138,12 +138,30 @@ check_random_pattern(const uint8_t *dst, size_t size,
 
 
 static void
-pb_debug_buffer_destroy(struct pb_buffer *_buf)
+pb_debug_buffer_fill(struct pb_debug_buffer *buf)
 {
-   struct pb_debug_buffer *buf = pb_debug_buffer(_buf);  
    uint8_t *map;
    
-   assert(!buf->base.base.refcount);
+   map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+   assert(map);
+   if(map) {
+      fill_random_pattern(map, buf->underflow_size);
+      fill_random_pattern(map + buf->underflow_size + buf->base.base.size, 
+                          buf->overflow_size);
+      pb_unmap(buf->buffer);
+   }
+}
+
+
+/**
+ * Check for under/over flows.
+ * 
+ * Should be called with the buffer unmaped.
+ */
+static void
+pb_debug_buffer_check(struct pb_debug_buffer *buf)
+{
+   uint8_t *map;
    
    map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_READ);
    assert(map);
@@ -154,24 +172,45 @@ pb_debug_buffer_destroy(struct pb_buffer *_buf)
       underflow = !check_random_pattern(map, buf->underflow_size, 
                                         &min_ofs, &max_ofs);
       if(underflow) {
-	 debug_printf("buffer underflow (%u of %u bytes) detected\n",
-	              buf->underflow_size - min_ofs,
-	              buf->underflow_size);
+         debug_printf("buffer underflow (offset -%u%s to -%u bytes) detected\n",
+                      buf->underflow_size - min_ofs,
+                      min_ofs == 0 ? "+" : "",
+                      buf->underflow_size - max_ofs);
       }
       
       overflow = !check_random_pattern(map + buf->underflow_size + buf->base.base.size, 
                                        buf->overflow_size, 
                                        &min_ofs, &max_ofs);
       if(overflow) {
-         debug_printf("buffer overflow (%u of %u bytes) detected\n",
+         debug_printf("buffer overflow (size %u plus offset %u to %u%s bytes) detected\n",
+                      buf->base.base.size,
+                      min_ofs,
                       max_ofs,
-                      buf->overflow_size);
+                      max_ofs == buf->overflow_size - 1 ? "+" : "");
       }
       
       debug_assert(!underflow && !overflow);
-      
+
+      /* re-fill if not aborted */
+      if(underflow)
+         fill_random_pattern(map, buf->underflow_size);
+      if(overflow)
+         fill_random_pattern(map + buf->underflow_size + buf->base.base.size, 
+                             buf->overflow_size);
+
       pb_unmap(buf->buffer);
    }
+}
+
+
+static void
+pb_debug_buffer_destroy(struct pb_buffer *_buf)
+{
+   struct pb_debug_buffer *buf = pb_debug_buffer(_buf);  
+   
+   assert(!buf->base.base.refcount);
+   
+   pb_debug_buffer_check(buf);
 
    pb_reference(&buf->buffer, NULL);
    FREE(buf);
@@ -183,9 +222,14 @@ pb_debug_buffer_map(struct pb_buffer *_buf,
                     unsigned flags)
 {
    struct pb_debug_buffer *buf = pb_debug_buffer(_buf);
-   void *map = pb_map(buf->buffer, flags);
+   void *map;
+   
+   pb_debug_buffer_check(buf);
+
+   map = pb_map(buf->buffer, flags);
    if(!map)
       return NULL;
+   
    return (uint8_t *)map + buf->underflow_size;
 }
 
@@ -195,6 +239,8 @@ pb_debug_buffer_unmap(struct pb_buffer *_buf)
 {
    struct pb_debug_buffer *buf = pb_debug_buffer(_buf);   
    pb_unmap(buf->buffer);
+   
+   pb_debug_buffer_check(buf);
 }
 
 
@@ -227,7 +273,6 @@ pb_debug_manager_create_buffer(struct pb_manager *_mgr,
    struct pb_debug_buffer *buf;
    struct pb_desc real_desc;
    size_t real_size;
-   uint8_t *map;
    
    buf = CALLOC_STRUCT(pb_debug_buffer);
    if(!buf)
@@ -262,13 +307,7 @@ pb_debug_manager_create_buffer(struct pb_manager *_mgr,
    buf->underflow_size = mgr->band_size;
    buf->overflow_size = buf->buffer->base.size - buf->underflow_size - size;
    
-   map = pb_map(buf->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
-   assert(map);
-   if(map) {
-      fill_random_pattern(map, buf->underflow_size);
-      fill_random_pattern(map + buf->underflow_size + size, buf->overflow_size);
-      pb_unmap(buf->buffer);
-   }
+   pb_debug_buffer_fill(buf);
    
    return &buf->base;
 }
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index b40eb6cc90c..e8c7f8e1f82 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -53,7 +53,7 @@ struct mm_pb_manager
 {
    struct pb_manager base;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    size_t size;
    struct mem_block *heap;
@@ -99,10 +99,10 @@ mm_buffer_destroy(struct pb_buffer *buf)
    
    assert(buf->base.refcount == 0);
    
-   _glthread_LOCK_MUTEX(mm->mutex);
+   pipe_mutex_lock(mm->mutex);
    mmFreeMem(mm_buf->block);
    FREE(buf);
-   _glthread_UNLOCK_MUTEX(mm->mutex);
+   pipe_mutex_unlock(mm->mutex);
 }
 
 
@@ -158,11 +158,11 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    if(desc->alignment % (1 << mm->align2))
       return NULL;
    
-   _glthread_LOCK_MUTEX(mm->mutex);
+   pipe_mutex_lock(mm->mutex);
 
    mm_buf = CALLOC_STRUCT(mm_buffer);
    if (!mm_buf) {
-      _glthread_UNLOCK_MUTEX(mm->mutex);
+      pipe_mutex_unlock(mm->mutex);
       return NULL;
    }
 
@@ -185,7 +185,7 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
       mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
       if(!mm_buf->block) {
          FREE(mm_buf);
-         _glthread_UNLOCK_MUTEX(mm->mutex);
+         pipe_mutex_unlock(mm->mutex);
          return NULL;
       }
    }
@@ -194,7 +194,7 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
    assert(0 <= (unsigned)mm_buf->block->ofs && (unsigned)mm_buf->block->ofs < mm->size);
    assert(size <= (unsigned)mm_buf->block->size && (unsigned)mm_buf->block->ofs + (unsigned)mm_buf->block->size <= mm->size);
    
-   _glthread_UNLOCK_MUTEX(mm->mutex);
+   pipe_mutex_unlock(mm->mutex);
    return SUPER(mm_buf);
 }
 
@@ -204,14 +204,14 @@ mm_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct mm_pb_manager *mm = mm_pb_manager(mgr);
    
-   _glthread_LOCK_MUTEX(mm->mutex);
+   pipe_mutex_lock(mm->mutex);
 
    mmDestroy(mm->heap);
    
    pb_unmap(mm->buffer);
    pb_reference(&mm->buffer, NULL);
    
-   _glthread_UNLOCK_MUTEX(mm->mutex);
+   pipe_mutex_unlock(mm->mutex);
    
    FREE(mgr);
 }
@@ -236,7 +236,7 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    mm->size = size;
    mm->align2 = align2; /* 64-byte alignment */
 
-   _glthread_INIT_MUTEX(mm->mutex);
+   pipe_mutex_init(mm->mutex);
 
    mm->buffer = buffer; 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 93d2cc96351..3ef72c5bbb3 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -56,7 +56,7 @@ struct pool_pb_manager
 {
    struct pb_manager base;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    
    size_t bufSize;
    size_t bufAlign;
@@ -110,10 +110,10 @@ pool_buffer_destroy(struct pb_buffer *buf)
    
    assert(pool_buf->base.base.refcount == 0);
 
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
    LIST_ADD(&pool_buf->head, &pool->free);
    pool->numFree++;
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
 }
 
 
@@ -124,9 +124,9 @@ pool_buffer_map(struct pb_buffer *buf, unsigned flags)
    struct pool_pb_manager *pool = pool_buf->mgr;
    void *map;
 
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
    map = (unsigned char *) pool->map + pool_buf->start;
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
    return map;
 }
 
@@ -171,10 +171,10 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    assert(size == pool->bufSize);
    assert(pool->bufAlign % desc->alignment == 0);
    
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
 
    if (pool->numFree == 0) {
-      _glthread_UNLOCK_MUTEX(pool->mutex);
+      pipe_mutex_unlock(pool->mutex);
       debug_printf("warning: out of fixed size buffer objects\n");
       return NULL;
    }
@@ -182,7 +182,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    item = pool->free.next;
 
    if (item == &pool->free) {
-      _glthread_UNLOCK_MUTEX(pool->mutex);
+      pipe_mutex_unlock(pool->mutex);
       debug_printf("error: fixed size buffer pool corruption\n");
       return NULL;
    }
@@ -190,7 +190,7 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
    LIST_DEL(item);
    --pool->numFree;
 
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
    
    pool_buf = LIST_ENTRY(struct pool_buffer, item, head);
    assert(pool_buf->base.base.refcount == 0);
@@ -206,14 +206,14 @@ static void
 pool_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct pool_pb_manager *pool = pool_pb_manager(mgr);
-   _glthread_LOCK_MUTEX(pool->mutex);
+   pipe_mutex_lock(pool->mutex);
 
    FREE(pool->bufs);
    
    pb_unmap(pool->buffer);
    pb_reference(&pool->buffer, NULL);
    
-   _glthread_UNLOCK_MUTEX(pool->mutex);
+   pipe_mutex_unlock(pool->mutex);
    
    FREE(mgr);
 }
@@ -246,7 +246,7 @@ pool_bufmgr_create(struct pb_manager *provider,
    pool->bufSize = bufSize;
    pool->bufAlign = desc->alignment; 
    
-   _glthread_INIT_MUTEX(pool->mutex);
+   pipe_mutex_init(pool->mutex);
 
    pool->buffer = provider->create_buffer(provider, numBufs*bufSize, desc); 
    if (!pool->buffer)
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index af307e265a2..8698c4cff62 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -30,6 +30,8 @@
  * @file
  * S-lab pool implementation.
  * 
+ * @sa http://en.wikipedia.org/wiki/Slab_allocation
+ * 
  * @author Thomas Hellstrom <thomas-at-tungstengraphics-dot-com>
  * @author Jose Fonseca <[email protected]>
  */
@@ -49,46 +51,96 @@
 
 struct pb_slab;
 
+
+/**
+ * Buffer in a slab.
+ * 
+ * Sub-allocation of a contiguous buffer.
+ */
 struct pb_slab_buffer
 {
    struct pb_buffer base;
    
    struct pb_slab *slab;
+   
    struct list_head head;
+   
    unsigned mapCount;
+   
+   /** Offset relative to the start of the slab buffer. */
    size_t start;
-   _glthread_Cond event;
+   
+   /** Use when validating, to signal that all mappings are finished */
+   /* TODO: Actually validation does not reach this stage yet */
+   pipe_condvar event;
 };
 
+
+/**
+ * Slab -- a contiguous piece of memory. 
+ */
 struct pb_slab
 {
    struct list_head head;
    struct list_head freeBuffers;
    size_t numBuffers;
    size_t numFree;
+   
    struct pb_slab_buffer *buffers;
    struct pb_slab_manager *mgr;
    
+   /** Buffer from the provider */
    struct pb_buffer *bo;
+   
    void *virtual;   
 };
 
+
+/**
+ * It adds/removes slabs as needed in order to meet the allocation/destruction 
+ * of individual buffers.
+ */
 struct pb_slab_manager 
 {
    struct pb_manager base;
    
+   /** From where we get our buffers */
    struct pb_manager *provider;
+   
+   /** Size of the buffers we hand on downstream */
    size_t bufSize;
+   
+   /** Size of the buffers we request upstream */
    size_t slabSize;
+   
+   /** 
+    * Alignment, usage to be used to allocate the slab buffers.
+    * 
+    * We can only provide buffers which are consistent (in alignment, usage) 
+    * with this description.   
+    */
    struct pb_desc desc;
 
+   /** 
+    * Partial slabs
+    * 
+    * Full slabs are not stored in any list. Empty slabs are destroyed 
+    * immediatly.
+    */
    struct list_head slabs;
-   struct list_head freeSlabs;
    
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
 };
 
+
 /**
+ * Wrapper around several slabs, therefore capable of handling buffers of 
+ * multiple sizes. 
+ * 
+ * This buffer manager just dispatches buffer allocations to the appropriate slab
+ * manager, according to the requested buffer size, or by passes the slab 
+ * managers altogether for even greater sizes.
+ * 
  * The data of this structure remains constant after
  * initialization and thus needs no mutex protection.
  */
@@ -97,12 +149,17 @@ struct pb_slab_range_manager
    struct pb_manager base;
 
    struct pb_manager *provider;
+   
    size_t minBufSize;
    size_t maxBufSize;
+   
+   /** @sa pb_slab_manager::desc */ 
    struct pb_desc desc;
    
    unsigned numBuckets;
    size_t *bucketSizes;
+   
+   /** Array of pb_slab_manager, one for each bucket size */
    struct pb_manager **buckets;
 };
 
@@ -143,7 +200,7 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
    struct pb_slab_manager *mgr = slab->mgr;
    struct list_head *list = &buf->head;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
    
    assert(buf->base.base.refcount == 0);
    
@@ -156,30 +213,16 @@ pb_slab_buffer_destroy(struct pb_buffer *_buf)
    if (slab->head.next == &slab->head)
       LIST_ADDTAIL(&slab->head, &mgr->slabs);
 
+   /* If the slab becomes totally empty, free it */
    if (slab->numFree == slab->numBuffers) {
       list = &slab->head;
-      LIST_DEL(list);
-      LIST_ADDTAIL(list, &mgr->freeSlabs);
+      LIST_DELINIT(list);
+      pb_reference(&slab->bo, NULL);
+      FREE(slab->buffers);
+      FREE(slab);
    }
 
-   if (mgr->slabs.next == &mgr->slabs || slab->numFree
-	 != slab->numBuffers) {
-
-      struct list_head *next;
-
-      for (list = mgr->freeSlabs.next, next = list->next; list
-	    != &mgr->freeSlabs; list = next, next = list->next) {
-
-	 slab = LIST_ENTRY(struct pb_slab, list, head);
-
-	 LIST_DELINIT(list);
-	 pb_reference(&slab->bo, NULL);
-	 FREE(slab->buffers);
-	 FREE(slab);
-      }
-   }
-   
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
 }
 
 
@@ -201,7 +244,7 @@ pb_slab_buffer_unmap(struct pb_buffer *_buf)
 
    --buf->mapCount;
    if (buf->mapCount == 0) 
-       _glthread_COND_BROADCAST(buf->event);
+       pipe_condvar_broadcast(buf->event);
 }
 
 
@@ -225,6 +268,11 @@ pb_slab_buffer_vtbl = {
 };
 
 
+/**
+ * Create a new slab.
+ * 
+ * Called when we ran out of free slabs.
+ */
 static enum pipe_error
 pb_slab_create(struct pb_slab_manager *mgr)
 {
@@ -238,17 +286,14 @@ pb_slab_create(struct pb_slab_manager *mgr)
    if (!slab)
       return PIPE_ERROR_OUT_OF_MEMORY;
 
-   /*
-    * FIXME: We should perhaps allow some variation in slabsize in order
-    * to efficiently reuse slabs.
-    */
-
    slab->bo = mgr->provider->create_buffer(mgr->provider, mgr->slabSize, &mgr->desc);
    if(!slab->bo) {
       ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out_err0;
    }
 
+   /* Note down the slab virtual address. All mappings are accessed directly 
+    * through this address so it is required that the buffer is pinned. */
    slab->virtual = pb_map(slab->bo, 
                           PIPE_BUFFER_USAGE_CPU_READ |
                           PIPE_BUFFER_USAGE_CPU_WRITE);
@@ -256,7 +301,6 @@ pb_slab_create(struct pb_slab_manager *mgr)
       ret = PIPE_ERROR_OUT_OF_MEMORY;
       goto out_err1;
    }
-
    pb_unmap(slab->bo);
 
    numBuffers = slab->bo->base.size / mgr->bufSize;
@@ -283,12 +327,13 @@ pb_slab_create(struct pb_slab_manager *mgr)
       buf->slab = slab;
       buf->start = i* mgr->bufSize;
       buf->mapCount = 0;
-      _glthread_INIT_COND(buf->event);
+      pipe_condvar_init(buf->event);
       LIST_ADDTAIL(&buf->head, &slab->freeBuffers);
       slab->numFree++;
       buf++;
    }
 
+   /* Add this slab to the list of partial slabs */
    LIST_ADDTAIL(&slab->head, &mgr->slabs);
 
    return PIPE_OK;
@@ -328,23 +373,29 @@ pb_slab_manager_create_buffer(struct pb_manager *_mgr,
    if(!pb_check_usage(desc->usage, mgr->desc.usage))
       return NULL;
 
-   _glthread_LOCK_MUTEX(mgr->mutex);
+   pipe_mutex_lock(mgr->mutex);
+   
+   /* Create a new slab, if we run out of partial slabs */
    if (mgr->slabs.next == &mgr->slabs) {
       (void) pb_slab_create(mgr);
       if (mgr->slabs.next == &mgr->slabs) {
-	 _glthread_UNLOCK_MUTEX(mgr->mutex);
+	 pipe_mutex_unlock(mgr->mutex);
 	 return NULL;
       }
    }
+   
+   /* Allocate the buffer from a partial (or just created) slab */
    list = mgr->slabs.next;
    slab = LIST_ENTRY(struct pb_slab, list, head);
+   
+   /* If totally full remove from the partial slab list */
    if (--slab->numFree == 0)
       LIST_DELINIT(list);
 
    list = slab->freeBuffers.next;
    LIST_DELINIT(list);
 
-   _glthread_UNLOCK_MUTEX(mgr->mutex);
+   pipe_mutex_unlock(mgr->mutex);
    buf = LIST_ENTRY(struct pb_slab_buffer, list, head);
    
    ++buf->base.base.refcount;
@@ -386,9 +437,8 @@ pb_slab_manager_create(struct pb_manager *provider,
    mgr->desc = *desc;
 
    LIST_INITHEAD(&mgr->slabs);
-   LIST_INITHEAD(&mgr->freeSlabs);
    
-   _glthread_INIT_MUTEX(mgr->mutex);
+   pipe_mutex_init(mgr->mutex);
 
    return &mgr->base;
 }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_execmem.c b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
index dfa5c35ab6a..19087589a87 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_execmem.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_execmem.c
@@ -47,11 +47,12 @@
 
 #include <unistd.h>
 #include <sys/mman.h>
+#include "pipe/p_thread.h"
 #include "util/u_mm.h"
 
 #define EXEC_HEAP_SIZE (10*1024*1024)
 
-_glthread_DECLARE_STATIC_MUTEX(exec_mutex);
+pipe_static_mutex(exec_mutex);
 
 static struct mem_block *exec_heap = NULL;
 static unsigned char *exec_mem = NULL;
@@ -76,7 +77,7 @@ rtasm_exec_malloc(size_t size)
    struct mem_block *block = NULL;
    void *addr = NULL;
 
-   _glthread_LOCK_MUTEX(exec_mutex);
+   pipe_mutex_lock(exec_mutex);
 
    init_heap();
 
@@ -90,7 +91,7 @@ rtasm_exec_malloc(size_t size)
    else 
       debug_printf("rtasm_exec_malloc failed\n");
    
-   _glthread_UNLOCK_MUTEX(exec_mutex);
+   pipe_mutex_unlock(exec_mutex);
    
    return addr;
 }
@@ -99,7 +100,7 @@ rtasm_exec_malloc(size_t size)
 void 
 rtasm_exec_free(void *addr)
 {
-   _glthread_LOCK_MUTEX(exec_mutex);
+   pipe_mutex_lock(exec_mutex);
 
    if (exec_heap) {
       struct mem_block *block = mmFindBlock(exec_heap, (unsigned char *)addr - exec_mem);
@@ -108,7 +109,7 @@ rtasm_exec_free(void *addr)
 	 mmFreeMem(block);
    }
 
-   _glthread_UNLOCK_MUTEX(exec_mutex);
+   pipe_mutex_unlock(exec_mutex);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 285ddc0e3f3..61010e43339 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -151,8 +151,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -165,8 +165,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rB = rB;
     inst.inst.rA = rA;
     inst.inst.rC = rC;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -178,8 +178,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i7 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -192,8 +192,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i8 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -206,8 +206,8 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.i10 = imm;
     inst.inst.rA = rA;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -218,8 +218,8 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i16 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -230,8 +230,8 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.op = op;
     inst.inst.i18 = imm;
     inst.inst.rT = rT;
-    *p->csr = inst.bits;
-    p->csr++;
+    p->store[p->num_inst++] = inst.bits;
+    assert(p->num_inst <= p->max_inst);
 }
 
 
@@ -300,13 +300,16 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
-/*
+/**
+ * Initialize an spe_function.
+ * \param code_size  size of instruction buffer to allocate, in bytes.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
     p->store = align_malloc(code_size, 16);
-    p->csr = p->store;
-    
+    p->num_inst = 0;
+    p->max_inst = code_size / SPE_INST_SIZE;
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
     p->regs[0] = ~7;
@@ -316,21 +319,26 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
 
 void spe_release_func(struct spe_function *p)
 {
+    assert(p->num_inst <= p->max_inst);
     if (p->store != NULL) {
         align_free(p->store);
     }
     p->store = NULL;
-    p->csr = NULL;
 }
 
 
+/**
+ * Alloate a SPE register.
+ * \return register index or -1 if none left.
+ */
 int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
-   for (i = 0; i < 128; i++) {
+   for (i = 0; i < SPE_NUM_REGS; i++) {
       const uint64_t mask = (1ULL << (i % 64));
       const unsigned idx = i / 64;
 
+      assert(idx < 2);
       if ((p->regs[idx] & mask) != 0) {
          p->regs[idx] &= ~mask;
          return i;
@@ -341,11 +349,15 @@ int spe_allocate_available_register(struct spe_function *p)
 }
 
 
+/**
+ * Mark the given SPE register as "allocated".
+ */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) != 0);
 
    p->regs[idx] &= ~(1ULL << bit);
@@ -353,57 +365,75 @@ int spe_allocate_register(struct spe_function *p, int reg)
 }
 
 
+/**
+ * Mark the given SPE register as "unallocated".
+ */
 void spe_release_register(struct spe_function *p, int reg)
 {
    const unsigned idx = reg / 64;
    const unsigned bit = reg % 64;
 
+   assert(idx < 2);
+
+   assert(reg < SPE_NUM_REGS);
    assert((p->regs[idx] & (1ULL << bit)) == 0);
 
    p->regs[idx] |= (1ULL << bit);
 }
 
 
+/**
+ * For branch instructions:
+ * \param d  if 1, disable interupts if branch is taken
+ * \param e  if 1, enable interupts if branch is taken
+ * If d and e are both zero, don't change interupt status (right?)
+ */
 
-
+/** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Interupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
     emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
 }
 
-void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d,
-		int e)
+/** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
+void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
 }
 
+/** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
     emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
@@ -432,4 +462,54 @@ EMIT_R   (spe_mfspr, 0x00c);
 EMIT_R   (spe_mtspr, 0x10c);
 #endif
 
+
+/**
+ ** Helper / "macro" instructions.
+ ** Use somewhat verbose names as a reminder that these aren't native
+ ** SPE instructions.
+ **/
+
+
+void
+spe_load_float(struct spe_function *p, unsigned rT, float x)
+{
+   union {
+      float f;
+      unsigned u;
+   } bits;
+   bits.f = x;
+   spe_ilhu(p, rT, bits.u >> 16);
+   spe_iohl(p, rT, bits.u & 0xffff);
+}
+
+
+void
+spe_load_int(struct spe_function *p, unsigned rT, int i)
+{
+   spe_ilhu(p, rT, i >> 16);
+   spe_iohl(p, rT, i & 0xffff);
+}
+
+
+void
+spe_complement(struct spe_function *p, unsigned rT)
+{
+   spe_nor(p, rT, rT, rT);
+}
+
+
+void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA)
+{
+   spe_ori(p, rT, rA, 0);
+}
+
+
+void
+spe_zero(struct spe_function *p, unsigned rT)
+{
+   spe_xor(p, rT, rT, rT);
+}
+
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 1cacc717b15..dee8c55c4a9 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -25,6 +25,7 @@
 /**
  * \file
  * Real-time assembly generation interface for Cell B.E. SPEs.
+ * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <[email protected]>
  */
@@ -32,13 +33,24 @@
 #ifndef RTASM_PPC_SPE_H
 #define RTASM_PPC_SPE_H
 
-struct spe_function {
-    /**
-     *
-     */
-    uint32_t *store;
-    uint32_t *csr;
-    const char *fn;
+/** 4 bytes per instruction */
+#define SPE_INST_SIZE 4
+
+/** number of general-purpose SIMD registers */
+#define SPE_NUM_REGS  128
+
+/** Return Address register */
+#define SPE_REG_RA  0
+
+/** Stack Pointer register */
+#define SPE_REG_SP  1
+
+
+struct spe_function
+{
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;
+   uint max_inst;
 
     /**
      * Mask of used / unused registers
@@ -50,7 +62,7 @@ struct spe_function {
      * spe_allocate_register, spe_allocate_available_register,
      * spe_release_register
      */
-    uint64_t regs[2];
+    uint64_t regs[SPE_NUM_REGS / 64];
 };
 
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
@@ -119,7 +131,8 @@ EMIT_RI16(spe_ilhu,  0x082);
 EMIT_RI16(spe_il,    0x081);
 EMIT_RI18(spe_ila,   0x021);
 EMIT_RI16(spe_iohl,  0x0c1);
-EMIT_RI16(spe_fsmbi, 0x0c5);
+EMIT_RI16(spe_fsmbi, 0x065);
+
 
 
 /* Integer and logical instructions
@@ -271,6 +284,27 @@ extern void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA,
     int d, int e);
 
 
+/** Load/splat immediate float into rT. */
+extern void
+spe_load_float(struct spe_function *p, unsigned rT, float x);
+
+/** Load/splat immediate int into rT. */
+extern void
+spe_load_int(struct spe_function *p, unsigned rT, int i);
+
+/** Complement/invert all bits in rT. */
+extern void
+spe_complement(struct spe_function *p, unsigned rT);
+
+/** rT = rA. */
+extern void
+spe_move(struct spe_function *p, unsigned rT, unsigned rA);
+
+/** rT = {0,0,0,0}. */
+extern void
+spe_zero(struct spe_function *p, unsigned rT);
+
+
 /* Floating-point instructions
  */
 EMIT_RR  (spe_fa,         0x2c4);
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index f4ca282dd93..6d4c081e04e 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -27,7 +27,7 @@
 
 #include "pipe/p_compiler.h"
 #include "pipe/p_debug.h"
-#include "pipe/p_pointer.h"
+#include "util/u_pointer.h"
 
 #include "rtasm_execmem.h"
 #include "rtasm_x86sse.h"
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index a168c949280..afc8ffa553c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -37,30 +37,40 @@ struct dump_ctx
 
    uint instno;
    
-   struct util_strbuf *sbuf;
+   void (*printf)(struct dump_ctx *ctx, const char *format, ...);
 };
 
+static void 
+dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...)
+{
+   va_list ap;
+   (void)ctx;
+   va_start(ap, format);
+   debug_vprintf(format, ap);
+   va_end(ap);
+}
+
 static void
 dump_enum(
-   struct util_strbuf *sbuf,
+   struct dump_ctx *ctx,
    uint e,
    const char **enums,
    uint enum_count )
 {
    if (e >= enum_count)
-      util_strbuf_printf( sbuf, "%u", e );
+      ctx->printf( ctx, "%u", e );
    else
-      util_strbuf_printf( sbuf, "%s", enums[e] );
+      ctx->printf( ctx, "%s", enums[e] );
 }
 
-#define EOL()           util_strbuf_printf( sbuf, "\n" )
-#define TXT(S)          util_strbuf_printf( sbuf, "%s", S )
-#define CHR(C)          util_strbuf_printf( sbuf, "%c", C )
-#define UIX(I)          util_strbuf_printf( sbuf, "0x%x", I )
-#define UID(I)          util_strbuf_printf( sbuf, "%u", I )
-#define SID(I)          util_strbuf_printf( sbuf, "%d", I )
-#define FLT(F)          util_strbuf_printf( sbuf, "%10.4f", F )
-#define ENM(E,ENUMS)    dump_enum( sbuf, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
+#define EOL()           ctx->printf( ctx, "\n" )
+#define TXT(S)          ctx->printf( ctx, "%s", S )
+#define CHR(C)          ctx->printf( ctx, "%c", C )
+#define UIX(I)          ctx->printf( ctx, "0x%x", I )
+#define UID(I)          ctx->printf( ctx, "%u", I )
+#define SID(I)          ctx->printf( ctx, "%d", I )
+#define FLT(F)          ctx->printf( ctx, "%10.4f", F )
+#define ENM(E,ENUMS)    dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
 
 static const char *processor_type_names[] =
 {
@@ -148,7 +158,7 @@ static const char *modulate_names[TGSI_MODULATE_COUNT] =
 
 static void
 _dump_register(
-   struct util_strbuf *sbuf,
+   struct dump_ctx *ctx,
    uint file,
    int first,
    int last )
@@ -165,7 +175,7 @@ _dump_register(
 
 static void
 _dump_register_ind(
-   struct util_strbuf *sbuf,
+   struct dump_ctx *ctx,
    uint file,
    int index,
    uint ind_file,
@@ -187,7 +197,7 @@ _dump_register_ind(
 
 static void
 _dump_writemask(
-   struct util_strbuf *sbuf,
+   struct dump_ctx *ctx,
    uint writemask )
 {
    if (writemask != TGSI_WRITEMASK_XYZW) {
@@ -208,18 +218,17 @@ iter_declaration(
    struct tgsi_iterate_context *iter,
    struct tgsi_full_declaration *decl )
 {
-   struct dump_ctx *ctx = (struct dump_ctx *) iter;
-   struct util_strbuf *sbuf = ctx->sbuf;
+   struct dump_ctx *ctx = (struct dump_ctx *)iter;
 
    TXT( "DCL " );
 
    _dump_register(
-      sbuf,
+      ctx,
       decl->Declaration.File,
       decl->DeclarationRange.First,
       decl->DeclarationRange.Last );
    _dump_writemask(
-      sbuf,
+      ctx,
       decl->Declaration.UsageMask );
 
    if (decl->Declaration.Semantic) {
@@ -245,17 +254,11 @@ void
 tgsi_dump_declaration(
    const struct tgsi_full_declaration *decl )
 {
-   static char str[1024];
-   struct util_strbuf sbuf;
    struct dump_ctx ctx;
 
-   util_strbuf_init(&sbuf, str, sizeof(str));
-   
-   ctx.sbuf = &sbuf;
+   ctx.printf = dump_ctx_printf;
 
    iter_declaration( &ctx.iter, (struct tgsi_full_declaration *)decl );
-   
-   debug_printf("%s", str);
 }
 
 static boolean
@@ -264,7 +267,6 @@ iter_immediate(
    struct tgsi_full_immediate *imm )
 {
    struct dump_ctx *ctx = (struct dump_ctx *) iter;
-   struct util_strbuf *sbuf = ctx->sbuf;
 
    uint i;
 
@@ -295,17 +297,11 @@ void
 tgsi_dump_immediate(
    const struct tgsi_full_immediate *imm )
 {
-   static char str[1024];
-   struct util_strbuf sbuf;
    struct dump_ctx ctx;
 
-   util_strbuf_init(&sbuf, str, sizeof(str));
-   
-   ctx.sbuf = &sbuf;
+   ctx.printf = dump_ctx_printf;
 
    iter_immediate( &ctx.iter, (struct tgsi_full_immediate *)imm );
-   
-   debug_printf("%s", str);
 }
 
 static boolean
@@ -314,7 +310,6 @@ iter_instruction(
    struct tgsi_full_instruction *inst )
 {
    struct dump_ctx *ctx = (struct dump_ctx *) iter;
-   struct util_strbuf *sbuf = ctx->sbuf;
    uint instno = ctx->instno++;
    
    uint i;
@@ -345,12 +340,12 @@ iter_instruction(
       CHR( ' ' );
 
       _dump_register(
-         sbuf,
+         ctx,
          dst->DstRegister.File,
          dst->DstRegister.Index,
          dst->DstRegister.Index );
       ENM( dst->DstRegisterExtModulate.Modulate, modulate_names );
-      _dump_writemask( sbuf, dst->DstRegister.WriteMask );
+      _dump_writemask( ctx, dst->DstRegister.WriteMask );
 
       first_reg = FALSE;
    }
@@ -377,7 +372,7 @@ iter_instruction(
 
       if (src->SrcRegister.Indirect) {
          _dump_register_ind(
-            sbuf,
+            ctx,
             src->SrcRegister.File,
             src->SrcRegister.Index,
             src->SrcRegisterInd.File,
@@ -385,7 +380,7 @@ iter_instruction(
       }
       else {
          _dump_register(
-            sbuf,
+            ctx,
             src->SrcRegister.File,
             src->SrcRegister.Index,
             src->SrcRegister.Index );
@@ -460,18 +455,12 @@ tgsi_dump_instruction(
    const struct tgsi_full_instruction *inst,
    uint instno )
 {
-   static char str[1024];
-   struct util_strbuf sbuf;
    struct dump_ctx ctx;
 
-   util_strbuf_init(&sbuf, str, sizeof(str));
-   
    ctx.instno = instno;
-   ctx.sbuf = &sbuf;
+   ctx.printf = dump_ctx_printf;
 
    iter_instruction( &ctx.iter, (struct tgsi_full_instruction *)inst );
-   
-   debug_printf("%s", str);
 }
 
 static boolean
@@ -479,7 +468,6 @@ prolog(
    struct tgsi_iterate_context *iter )
 {
    struct dump_ctx *ctx = (struct dump_ctx *) iter;
-   struct util_strbuf *sbuf = ctx->sbuf;
    ENM( iter->processor.Processor, processor_type_names );
    UID( iter->version.MajorVersion );
    CHR( '.' );
@@ -489,17 +477,12 @@ prolog(
 }
 
 void
-tgsi_dump_str(
+tgsi_dump(
    const struct tgsi_token *tokens,
-   uint flags,
-   char *str,
-   size_t size)
+   uint flags )
 {
-   struct util_strbuf sbuf;
    struct dump_ctx ctx;
 
-   util_strbuf_init(&sbuf, str, size);
-   
    ctx.iter.prolog = prolog;
    ctx.iter.iterate_instruction = iter_instruction;
    ctx.iter.iterate_declaration = iter_declaration;
@@ -507,34 +490,57 @@ tgsi_dump_str(
    ctx.iter.epilog = NULL;
 
    ctx.instno = 0;
-   ctx.sbuf = &sbuf;
+   ctx.printf = dump_ctx_printf;
 
    tgsi_iterate_shader( tokens, &ctx.iter );
 }
 
+struct str_dump_ctx
+{
+   struct dump_ctx base;
+   char *str;
+   char *ptr;
+   size_t left;
+};
+
+static void
+str_dump_ctx_printf(struct dump_ctx *ctx, const char *format, ...)
+{
+   struct str_dump_ctx *sctx = (struct str_dump_ctx *)ctx;
+   
+   if(sctx->left > 1) {
+      size_t written;
+      va_list ap;
+      va_start(ap, format);
+      written = util_vsnprintf(sctx->ptr, sctx->left, format, ap);
+      va_end(ap);
+      sctx->ptr += written;
+      sctx->left -= written;
+   }
+}
+
 void
-tgsi_dump(
+tgsi_dump_str(
    const struct tgsi_token *tokens,
-   uint flags )
+   uint flags,
+   char *str,
+   size_t size)
 {
-   static char str[4096];
-   uint len;
-   char *p = str;
-
-   tgsi_dump_str(tokens, flags, str, sizeof(str));
-
-   /* Workaround output buffer size limitations.
-    */
-   len = strlen( str );
-   while (len > 256) {
-      char piggy_bank;
-
-      piggy_bank = p[256];
-      p[256] = '\0';
-      debug_printf( "%s", p );
-      p[256] = piggy_bank;
-      p += 256;
-      len -= 256;
-   }
-   debug_printf( "%s", p );
+   struct str_dump_ctx ctx;
+
+   ctx.base.iter.prolog = prolog;
+   ctx.base.iter.iterate_instruction = iter_instruction;
+   ctx.base.iter.iterate_declaration = iter_declaration;
+   ctx.base.iter.iterate_immediate = iter_immediate;
+   ctx.base.iter.epilog = NULL;
+
+   ctx.base.instno = 0;
+   ctx.base.printf = &str_dump_ctx_printf;
+
+   ctx.str = str;
+   ctx.str[0] = 0;
+   ctx.ptr = str;
+   ctx.left = size;
+
+   tgsi_iterate_shader( tokens, &ctx.base.iter );
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index fb573fe1f0c..df002939c6b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -957,6 +957,7 @@ fetch_src_file_channel(
    case TGSI_EXTSWIZZLE_W:
       switch( file ) {
       case TGSI_FILE_CONSTANT:
+         assert(mach->Consts);
          chan->f[0] = mach->Consts[index->i[0]][swizzle];
          chan->f[1] = mach->Consts[index->i[1]][swizzle];
          chan->f[2] = mach->Consts[index->i[2]][swizzle];
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index 0b5bdd6ba1c..c6590272969 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -322,7 +322,7 @@ epilog(
    /* Print totals, if any.
     */
    if (ctx->errors || ctx->warnings)
-      debug_printf( "\n%u errors, %u warnings", ctx->errors, ctx->warnings );
+      debug_printf( "%u errors, %u warnings\n", ctx->errors, ctx->warnings );
 
    return TRUE;
 }
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 626724ad4e4..4681b29f52b 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -601,12 +601,10 @@ static void PIPE_CDECL
 cos4f(
    float *store )
 {
-   const unsigned X = 0;
-
-   store[X + 0] = cosf( store[X + 0] );
-   store[X + 1] = cosf( store[X + 1] );
-   store[X + 2] = cosf( store[X + 2] );
-   store[X + 3] = cosf( store[X + 3] );
+   store[0] = cosf( store[0] );
+   store[1] = cosf( store[1] );
+   store[2] = cosf( store[2] );
+   store[3] = cosf( store[3] );
 }
 
 static void
@@ -624,18 +622,16 @@ static void PIPE_CDECL
 ex24f(
    float *store )
 {
-   const unsigned X = 0;
-
 #if FAST_MATH
-   store[X + 0] = util_fast_exp2( store[X + 0] );
-   store[X + 1] = util_fast_exp2( store[X + 1] );
-   store[X + 2] = util_fast_exp2( store[X + 2] );
-   store[X + 3] = util_fast_exp2( store[X + 3] );
+   store[0] = util_fast_exp2( store[0] );
+   store[1] = util_fast_exp2( store[1] );
+   store[2] = util_fast_exp2( store[2] );
+   store[3] = util_fast_exp2( store[3] );
 #else
-   store[X + 0] = powf( 2.0f, store[X + 0] );
-   store[X + 1] = powf( 2.0f, store[X + 1] );
-   store[X + 2] = powf( 2.0f, store[X + 2] );
-   store[X + 3] = powf( 2.0f, store[X + 3] );
+   store[0] = powf( 2.0f, store[0] );
+   store[1] = powf( 2.0f, store[1] );
+   store[2] = powf( 2.0f, store[2] );
+   store[3] = powf( 2.0f, store[3] );
 #endif
 }
 
@@ -665,12 +661,10 @@ static void PIPE_CDECL
 flr4f(
    float *store )
 {
-   const unsigned X = 0;
-
-   store[X + 0] = floorf( store[X + 0] );
-   store[X + 1] = floorf( store[X + 1] );
-   store[X + 2] = floorf( store[X + 2] );
-   store[X + 3] = floorf( store[X + 3] );
+   store[0] = floorf( store[0] );
+   store[1] = floorf( store[1] );
+   store[2] = floorf( store[2] );
+   store[3] = floorf( store[3] );
 }
 
 static void
@@ -688,12 +682,10 @@ static void PIPE_CDECL
 frc4f(
    float *store )
 {
-   const unsigned X = 0;
-
-   store[X + 0] -= floorf( store[X + 0] );
-   store[X + 1] -= floorf( store[X + 1] );
-   store[X + 2] -= floorf( store[X + 2] );
-   store[X + 3] -= floorf( store[X + 3] );
+   store[0] -= floorf( store[0] );
+   store[1] -= floorf( store[1] );
+   store[2] -= floorf( store[2] );
+   store[3] -= floorf( store[3] );
 }
 
 static void
@@ -711,12 +703,10 @@ static void PIPE_CDECL
 lg24f(
    float *store )
 {
-   const unsigned X = 0;
-
-   store[X + 0] = util_fast_log2( store[X + 0] );
-   store[X + 1] = util_fast_log2( store[X + 1] );
-   store[X + 2] = util_fast_log2( store[X + 2] );
-   store[X + 3] = util_fast_log2( store[X + 3] );
+   store[0] = util_fast_log2( store[0] );
+   store[1] = util_fast_log2( store[1] );
+   store[2] = util_fast_log2( store[2] );
+   store[3] = util_fast_log2( store[3] );
 }
 
 static void
@@ -770,18 +760,16 @@ static void PIPE_CDECL
 pow4f(
    float *store )
 {
-   const unsigned X = 0;
-
 #if FAST_MATH
-   store[X + 0] = util_fast_pow( store[X + 0], store[X + 4] );
-   store[X + 1] = util_fast_pow( store[X + 1], store[X + 5] );
-   store[X + 2] = util_fast_pow( store[X + 2], store[X + 6] );
-   store[X + 3] = util_fast_pow( store[X + 3], store[X + 7] );
+   store[0] = util_fast_pow( store[0], store[4] );
+   store[1] = util_fast_pow( store[1], store[5] );
+   store[2] = util_fast_pow( store[2], store[6] );
+   store[3] = util_fast_pow( store[3], store[7] );
 #else
-   store[X + 0] = powf( store[X + 0], store[X + 4] );
-   store[X + 1] = powf( store[X + 1], store[X + 5] );
-   store[X + 2] = powf( store[X + 2], store[X + 6] );
-   store[X + 3] = powf( store[X + 3], store[X + 7] );
+   store[0] = powf( store[0], store[4] );
+   store[1] = powf( store[1], store[5] );
+   store[2] = powf( store[2], store[6] );
+   store[3] = powf( store[3], store[7] );
 #endif
 }
 
@@ -877,12 +865,10 @@ static void PIPE_CDECL
 sin4f(
    float *store )
 {
-   const unsigned X = 0;
-
-   store[X + 0] = sinf( store[X + 0] );
-   store[X + 1] = sinf( store[X + 1] );
-   store[X + 2] = sinf( store[X + 2] );
-   store[X + 3] = sinf( store[X + 3] );
+   store[0] = sinf( store[0] );
+   store[1] = sinf( store[1] );
+   store[2] = sinf( store[2] );
+   store[3] = sinf( store[3] );
 }
 
 static void
@@ -2416,3 +2402,4 @@ tgsi_emit_sse2(
 }
 
 #endif /* PIPE_ARCH_X86 */
+
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 4d336f47ea3..8d39b64c6c1 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -415,6 +415,12 @@ static fetch_func get_fetch_func( enum pipe_format format )
 
 static emit_func get_emit_func( enum pipe_format format )
 {
+   /* silence warnings */
+   (void) emit_R32G32B32A32_FIXED;
+   (void) emit_R32G32B32_FIXED;
+   (void) emit_R32G32_FIXED;
+   (void) emit_R32_FIXED;
+
    switch (format) {
    case PIPE_FORMAT_R64_FLOAT:
       return &emit_R64_FLOAT;
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index 6e5fd26c057..d3951e4e7d7 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -15,8 +15,11 @@ C_SOURCES = \
 	u_rect.c \
 	u_simple_shaders.c \
 	u_snprintf.c \
+	u_stream_stdc.c \
+	u_stream_wd.c \
 	u_tile.c \
-	u_time.c
+	u_time.c \
+	u_timed_winsys.c
 
 include ../../Makefile.template
 
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index ce3fad7068e..e65c17b1cc8 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -16,6 +16,8 @@ util = env.ConvenienceLibrary(
 		'u_rect.c',
 		'u_simple_shaders.c',
 		'u_snprintf.c',
+        'u_stream_stdc.c',
+        'u_stream_wd.c',
 		'u_tile.c',
 		'u_time.c',
 	])
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index 7d1dba5a247..b6cff281e6d 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -55,7 +55,11 @@
 #include "pipe/p_format.h" 
 #include "pipe/p_state.h" 
 #include "pipe/p_inlines.h" 
+#include "util/u_memory.h" 
 #include "util/u_string.h" 
+#include "util/u_stream.h" 
+#include "util/u_math.h" 
+#include "util/u_tile.h" 
 
 
 #ifdef PIPE_SUBSYSTEM_WINDOWS_DISPLAY
@@ -76,7 +80,7 @@ void _debug_vprintf(const char *format, va_list ap)
    /* EngDebugPrint does not handle float point arguments, so we need to use
     * our own vsnprintf implementation. It is also very slow, so buffer until
     * we find a newline. */
-   static char buf[512 + 1] = {'\0'};
+   static char buf[512] = {'\0'};
    size_t len = strlen(buf);
    int ret = util_vsnprintf(buf + len, sizeof(buf) - len, format, ap);
    if(ret > (int)(sizeof(buf) - len - 1) || util_strchr(buf + len, '\n')) {
@@ -584,4 +588,109 @@ error2:
 error1:
    ;
 }
+
+
+#pragma pack(push,2)
+struct bmp_file_header {
+   uint16_t bfType;
+   uint32_t bfSize;
+   uint16_t bfReserved1;
+   uint16_t bfReserved2;
+   uint32_t bfOffBits;
+};
+#pragma pack(pop)
+
+struct bmp_info_header {
+   uint32_t biSize;
+   int32_t biWidth;
+   int32_t biHeight;
+   uint16_t biPlanes;
+   uint16_t biBitCount;
+   uint32_t biCompression;
+   uint32_t biSizeImage;
+   int32_t biXPelsPerMeter;
+   int32_t biYPelsPerMeter;
+   uint32_t biClrUsed;
+   uint32_t biClrImportant;
+};
+
+struct bmp_rgb_quad {
+   uint8_t rgbBlue;
+   uint8_t rgbGreen;
+   uint8_t rgbRed;
+   uint8_t rgbAlpha;
+};
+
+void 
+debug_dump_surface_bmp(const char *filename,
+                       struct pipe_surface *surface)
+{
+   struct util_stream *stream;
+   unsigned surface_usage;
+   struct bmp_file_header bmfh;
+   struct bmp_info_header bmih;
+   float *rgba;
+   unsigned x, y;
+
+   if (!surface)
+      goto error1;
+
+   rgba = MALLOC(surface->width*4*sizeof(float));
+   if(!rgba)
+      goto error1;
+   
+   bmfh.bfType = 0x4d42;
+   bmfh.bfSize = 14 + 40 + surface->height*surface->width*4;
+   bmfh.bfReserved1 = 0;
+   bmfh.bfReserved2 = 0;
+   bmfh.bfOffBits = 14 + 40;
+   
+   bmih.biSize = 40;
+   bmih.biWidth = surface->width;
+   bmih.biHeight = surface->height;
+   bmih.biPlanes = 1;
+   bmih.biBitCount = 32;
+   bmih.biCompression = 0;
+   bmih.biSizeImage = surface->height*surface->width*4;
+   bmih.biXPelsPerMeter = 0;
+   bmih.biYPelsPerMeter = 0;
+   bmih.biClrUsed = 0;
+   bmih.biClrImportant = 0;
+   
+   stream = util_stream_create(filename, bmfh.bfSize);
+   if(!stream)
+      goto error2;
+   
+   util_stream_write(stream, &bmfh, 14);
+   util_stream_write(stream, &bmih, 40);
+   
+   /* XXX: force mappable surface */
+   surface_usage = surface->usage;
+   surface->usage |= PIPE_BUFFER_USAGE_CPU_READ;
+
+   y = surface->height;
+   while(y--) {
+      pipe_get_tile_rgba(surface,
+                         0, y, surface->width, 1,
+                         rgba);
+      for(x = 0; x < surface->width; ++x)
+      {
+         struct bmp_rgb_quad pixel;
+         pixel.rgbRed   = float_to_ubyte(rgba[x*4 + 0]);
+         pixel.rgbGreen = float_to_ubyte(rgba[x*4 + 1]);
+         pixel.rgbBlue  = float_to_ubyte(rgba[x*4 + 2]);
+         pixel.rgbAlpha = float_to_ubyte(rgba[x*4 + 3]);
+         util_stream_write(stream, &pixel, 4);
+      }  
+   }
+   
+   surface->usage = surface_usage;
+
+   util_stream_close(stream);
+error2:
+   FREE(rgba);
+error1:
+   ;
+}
+
 #endif
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 05399f9885e..9adf72944e6 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -138,10 +138,10 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    /* fragment shader */
    ctx->fs = util_make_fragment_tex_shader(pipe, &ctx->frag_shader);
 
-   ctx->vbuf = pipe->winsys->buffer_create(pipe->winsys,
-                                           32,
-                                           PIPE_BUFFER_USAGE_VERTEX,
-                                           sizeof(ctx->vertices));
+   ctx->vbuf = pipe_buffer_create(pipe->screen,
+                                  32,
+                                  PIPE_BUFFER_USAGE_VERTEX,
+                                  sizeof(ctx->vertices));
    if (!ctx->vbuf) {
       FREE(ctx);
       ctx->pipe->delete_fs_state(ctx->pipe, ctx->fs);
@@ -174,7 +174,7 @@ util_destroy_blit(struct blit_state *ctx)
    FREE((void*) ctx->vert_shader.tokens);
    FREE((void*) ctx->frag_shader.tokens);
 
-   pipe->winsys->buffer_destroy(pipe->winsys, ctx->vbuf);
+   pipe_buffer_reference(pipe->screen, &ctx->vbuf, NULL);
 
    FREE(ctx);
 }
@@ -214,12 +214,12 @@ setup_vertex_data(struct blit_state *ctx,
    ctx->vertices[3][1][0] = 0.0f;
    ctx->vertices[3][1][1] = 1.0f;
 
-   buf = ctx->pipe->winsys->buffer_map(ctx->pipe->winsys, ctx->vbuf,
-                                       PIPE_BUFFER_USAGE_CPU_WRITE);
+   buf = pipe_buffer_map(ctx->pipe->screen, ctx->vbuf,
+                         PIPE_BUFFER_USAGE_CPU_WRITE);
 
    memcpy(buf, ctx->vertices, sizeof(ctx->vertices));
 
-   ctx->pipe->winsys->buffer_unmap(ctx->pipe->winsys, ctx->vbuf);
+   pipe_buffer_unmap(ctx->pipe->screen, ctx->vbuf);
 }
 
 
@@ -259,12 +259,12 @@ setup_vertex_data_tex(struct blit_state *ctx,
    ctx->vertices[3][1][0] = s0;
    ctx->vertices[3][1][1] = t1;
 
-   buf = ctx->pipe->winsys->buffer_map(ctx->pipe->winsys, ctx->vbuf,
-                                       PIPE_BUFFER_USAGE_CPU_WRITE);
+   buf = pipe_buffer_map(ctx->pipe->screen, ctx->vbuf,
+                         PIPE_BUFFER_USAGE_CPU_WRITE);
 
    memcpy(buf, ctx->vertices, sizeof(ctx->vertices));
 
-   ctx->pipe->winsys->buffer_unmap(ctx->pipe->winsys, ctx->vbuf);
+   pipe_buffer_unmap(ctx->pipe->screen, ctx->vbuf);
 }
 /**
  * Copy pixel block from src surface to dst surface.
diff --git a/src/gallium/auxiliary/util/u_draw_quad.c b/src/gallium/auxiliary/util/u_draw_quad.c
index bf143815d8b..8ecae71b641 100644
--- a/src/gallium/auxiliary/util/u_draw_quad.c
+++ b/src/gallium/auxiliary/util/u_draw_quad.c
@@ -86,11 +86,11 @@ util_draw_texquad(struct pipe_context *pipe,
    vertexBytes = 4 * (4 * numAttribs * sizeof(float));
 
    /* XXX create one-time */
-   vbuf = pipe->winsys->buffer_create(pipe->winsys, 32,
-                                      PIPE_BUFFER_USAGE_VERTEX, vertexBytes);
+   vbuf = pipe_buffer_create(pipe->screen, 32,
+                             PIPE_BUFFER_USAGE_VERTEX, vertexBytes);
    if (vbuf) {
-      float *v = (float *) pipe->winsys->buffer_map(pipe->winsys, vbuf,
-                                             PIPE_BUFFER_USAGE_CPU_WRITE);
+      float *v = (float *) pipe_buffer_map(pipe->screen, vbuf,
+                                           PIPE_BUFFER_USAGE_CPU_WRITE);
       if (v) {
          /*
           * Load vertex buffer
@@ -123,10 +123,10 @@ util_draw_texquad(struct pipe_context *pipe,
          v[28] = 0.0;
          v[29] = 1.0;
 
-         pipe->winsys->buffer_unmap(pipe->winsys, vbuf);
+         pipe_buffer_unmap(pipe->screen, vbuf);
          util_draw_vertex_buffer(pipe, vbuf, PIPE_PRIM_TRIANGLE_FAN, 4, 2);
       }
 
-      pipe_buffer_reference(pipe->winsys, &vbuf, NULL);
+      pipe_buffer_reference(pipe->screen, &vbuf, NULL);
    }
 }
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index c1e2c19f877..b19a649bbcd 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -580,7 +580,6 @@ make_1d_mipmap(struct gen_mipmap_state *ctx,
 {
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
-   struct pipe_winsys *winsys = pipe->winsys;
    const uint zslice = 0;
    uint dstLevel;
 
@@ -595,19 +594,19 @@ make_1d_mipmap(struct gen_mipmap_state *ctx,
       dstSurf = screen->get_tex_surface(screen, pt, face, dstLevel, zslice,
                                         PIPE_BUFFER_USAGE_CPU_WRITE);
 
-      srcMap = ((ubyte *) winsys->buffer_map(winsys, srcSurf->buffer,
-                                            PIPE_BUFFER_USAGE_CPU_READ)
+      srcMap = ((ubyte *) pipe_buffer_map(screen, srcSurf->buffer,
+                                          PIPE_BUFFER_USAGE_CPU_READ)
                 + srcSurf->offset);
-      dstMap = ((ubyte *) winsys->buffer_map(winsys, dstSurf->buffer,
-                                            PIPE_BUFFER_USAGE_CPU_WRITE)
+      dstMap = ((ubyte *) pipe_buffer_map(screen, dstSurf->buffer,
+                                          PIPE_BUFFER_USAGE_CPU_WRITE)
                 + dstSurf->offset);
 
       reduce_1d(pt->format,
                 srcSurf->width, srcMap,
                 dstSurf->width, dstMap);
 
-      winsys->buffer_unmap(winsys, srcSurf->buffer);
-      winsys->buffer_unmap(winsys, dstSurf->buffer);
+      pipe_buffer_unmap(screen, srcSurf->buffer);
+      pipe_buffer_unmap(screen, dstSurf->buffer);
 
       pipe_surface_reference(&srcSurf, NULL);
       pipe_surface_reference(&dstSurf, NULL);
@@ -622,7 +621,6 @@ make_2d_mipmap(struct gen_mipmap_state *ctx,
 {
    struct pipe_context *pipe = ctx->pipe;
    struct pipe_screen *screen = pipe->screen;
-   struct pipe_winsys *winsys = pipe->winsys;
    const uint zslice = 0;
    uint dstLevel;
    
@@ -639,11 +637,11 @@ make_2d_mipmap(struct gen_mipmap_state *ctx,
       dstSurf = screen->get_tex_surface(screen, pt, face, dstLevel, zslice,
                                         PIPE_BUFFER_USAGE_CPU_WRITE);
 
-      srcMap = ((ubyte *) winsys->buffer_map(winsys, srcSurf->buffer,
-                                            PIPE_BUFFER_USAGE_CPU_READ)
+      srcMap = ((ubyte *) pipe_buffer_map(screen, srcSurf->buffer,
+                                          PIPE_BUFFER_USAGE_CPU_READ)
                 + srcSurf->offset);
-      dstMap = ((ubyte *) winsys->buffer_map(winsys, dstSurf->buffer,
-                                            PIPE_BUFFER_USAGE_CPU_WRITE)
+      dstMap = ((ubyte *) pipe_buffer_map(screen, dstSurf->buffer,
+                                          PIPE_BUFFER_USAGE_CPU_WRITE)
                 + dstSurf->offset);
 
       reduce_2d(pt->format,
@@ -652,8 +650,8 @@ make_2d_mipmap(struct gen_mipmap_state *ctx,
                 dstSurf->width, dstSurf->height,
                 dstSurf->stride, dstMap);
 
-      winsys->buffer_unmap(winsys, srcSurf->buffer);
-      winsys->buffer_unmap(winsys, dstSurf->buffer);
+      pipe_buffer_unmap(screen, srcSurf->buffer);
+      pipe_buffer_unmap(screen, dstSurf->buffer);
 
       pipe_surface_reference(&srcSurf, NULL);
       pipe_surface_reference(&dstSurf, NULL);
@@ -759,10 +757,10 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    /* fragment shader */
    ctx->fs = util_make_fragment_tex_shader(pipe, &ctx->frag_shader);
 
-   ctx->vbuf = pipe->winsys->buffer_create(pipe->winsys,
-                                           32,
-                                           PIPE_BUFFER_USAGE_VERTEX,
-                                           sizeof(ctx->vertices));
+   ctx->vbuf = pipe_buffer_create(pipe->screen,
+                                  32,
+                                  PIPE_BUFFER_USAGE_VERTEX,
+                                  sizeof(ctx->vertices));
    if (!ctx->vbuf) {
       FREE(ctx);
       return NULL;
@@ -805,12 +803,12 @@ set_vertex_data(struct gen_mipmap_state *ctx, float width, float height)
    ctx->vertices[3][1][0] = 0.0f;
    ctx->vertices[3][1][1] = 1.0f;
 
-   buf = ctx->pipe->winsys->buffer_map(ctx->pipe->winsys, ctx->vbuf,
-                                       PIPE_BUFFER_USAGE_CPU_WRITE);
+   buf = pipe_buffer_map(ctx->pipe->screen, ctx->vbuf,
+                         PIPE_BUFFER_USAGE_CPU_WRITE);
 
    memcpy(buf, ctx->vertices, sizeof(ctx->vertices));
 
-   ctx->pipe->winsys->buffer_unmap(ctx->pipe->winsys, ctx->vbuf);
+   pipe_buffer_unmap(ctx->pipe->screen, ctx->vbuf);
 }
 
 
@@ -829,7 +827,7 @@ util_destroy_gen_mipmap(struct gen_mipmap_state *ctx)
    FREE((void*) ctx->vert_shader.tokens);
    FREE((void*) ctx->frag_shader.tokens);
 
-   pipe->winsys->buffer_destroy(pipe->winsys, ctx->vbuf);
+   pipe_buffer_reference(pipe->screen, &ctx->vbuf, NULL);
 
    FREE(ctx);
 }
diff --git a/src/gallium/auxiliary/util/u_memory.h b/src/gallium/auxiliary/util/u_memory.h
index 8fbbb4e55d8..857102719dc 100644
--- a/src/gallium/auxiliary/util/u_memory.h
+++ b/src/gallium/auxiliary/util/u_memory.h
@@ -39,7 +39,12 @@
 #include "pipe/p_debug.h"
 
 
- /* Define ENOMEM for WINCE */ 
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+/* Define ENOMEM for WINCE */ 
 #if (_WIN32_WCE < 600)
 #ifndef ENOMEM
 #define ENOMEM 12
@@ -47,7 +52,6 @@
 #endif
 
 
-
 #if defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) && defined(DEBUG) 
 
 /* memory debugging */
@@ -220,4 +224,9 @@ mem_dup(const void *src, uint size)
 
 
 
+#ifdef __cplusplus
+}
+#endif
+
+
 #endif /* U_MEMORY_H */
diff --git a/src/gallium/auxiliary/util/u_pack_color.h b/src/gallium/auxiliary/util/u_pack_color.h
index 39e4ae9d071..e0e8aa8e9fe 100644
--- a/src/gallium/auxiliary/util/u_pack_color.h
+++ b/src/gallium/auxiliary/util/u_pack_color.h
@@ -142,6 +142,161 @@ util_pack_color_ub(ubyte r, ubyte g, ubyte b, ubyte a,
  
 
 /**
+ * Unpack RGBA from a packed pixel, returning values as ubytes in [0,255].
+ */
+static INLINE void
+util_unpack_color_ub(enum pipe_format format, const void *src,
+                     ubyte *r, ubyte *g, ubyte *b, ubyte *a)
+{
+   switch (format) {
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      {
+         uint p = ((const uint *) src)[0];
+         *r = (ubyte) ((p >> 24) & 0xff);
+         *g = (ubyte) ((p >> 16) & 0xff);
+         *b = (ubyte) ((p >>  8) & 0xff);
+         *a = (ubyte) ((p >>  0) & 0xff);
+      }
+      return;
+   case PIPE_FORMAT_R8G8B8X8_UNORM:
+      {
+         uint p = ((const uint *) src)[0];
+         *r = (ubyte) ((p >> 24) & 0xff);
+         *g = (ubyte) ((p >> 16) & 0xff);
+         *b = (ubyte) ((p >>  8) & 0xff);
+         *a = (ubyte) 0xff;
+      }
+      return;
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      {
+         uint p = ((const uint *) src)[0];
+         *r = (ubyte) ((p >> 16) & 0xff);
+         *g = (ubyte) ((p >>  8) & 0xff);
+         *b = (ubyte) ((p >>  0) & 0xff);
+         *a = (ubyte) ((p >> 24) & 0xff);
+      }
+      return;
+   case PIPE_FORMAT_X8R8G8B8_UNORM:
+      {
+         uint p = ((const uint *) src)[0];
+         *r = (ubyte) ((p >> 16) & 0xff);
+         *g = (ubyte) ((p >>  8) & 0xff);
+         *b = (ubyte) ((p >>  0) & 0xff);
+         *a = (ubyte) 0xff;
+      }
+      return;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      {
+         uint p = ((const uint *) src)[0];
+         *r = (ubyte) ((p >>  8) & 0xff);
+         *g = (ubyte) ((p >> 16) & 0xff);
+         *b = (ubyte) ((p >> 24) & 0xff);
+         *a = (ubyte) ((p >>  0) & 0xff);
+      }
+      return;
+   case PIPE_FORMAT_B8G8R8X8_UNORM:
+      {
+         uint p = ((const uint *) src)[0];
+         *r = (ubyte) ((p >>  8) & 0xff);
+         *g = (ubyte) ((p >> 16) & 0xff);
+         *b = (ubyte) ((p >> 24) & 0xff);
+         *a = (ubyte) 0xff;
+      }
+      return;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      {
+         ushort p = ((const ushort *) src)[0];
+         *r = (ubyte) (((p >> 8) & 0xf8) | ((p >> 13) & 0x7));
+         *g = (ubyte) (((p >> 3) & 0xfc) | ((p >>  9) & 0x3));
+         *b = (ubyte) (((p << 3) & 0xf8) | ((p >>  2) & 0x7));
+         *a = (ubyte) 0xff;
+      }
+      return;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      {
+         ushort p = ((const ushort *) src)[0];
+         *r = (ubyte) (((p >>  7) & 0xf8) | ((p >> 12) & 0x7));
+         *g = (ubyte) (((p >>  2) & 0xf8) | ((p >>  7) & 0x7));
+         *b = (ubyte) (((p <<  3) & 0xf8) | ((p >>  2) & 0x7));
+         *a = (ubyte) (0xff * (p >> 15));
+      }
+      return;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      {
+         ushort p = ((const ushort *) src)[0];
+         *r = (ubyte) (((p >> 4) & 0xf0) | ((p >>  8) & 0xf));
+         *g = (ubyte) (((p >> 0) & 0xf0) | ((p >>  4) & 0xf));
+         *b = (ubyte) (((p << 4) & 0xf0) | ((p >>  0) & 0xf));
+         *a = (ubyte) (((p >> 8) & 0xf0) | ((p >> 12) & 0xf));
+      }
+      return;
+   case PIPE_FORMAT_A8_UNORM:
+      {
+         ubyte p = ((const ubyte *) src)[0];
+         *r = *g = *b = (ubyte) 0xff;
+         *a = p;
+      }
+      return;
+   case PIPE_FORMAT_L8_UNORM:
+      {
+         ubyte p = ((const ubyte *) src)[0];
+         *r = *g = *b = p;
+         *a = (ubyte) 0xff;
+      }
+      return;
+   case PIPE_FORMAT_I8_UNORM:
+      {
+         ubyte p = ((const ubyte *) src)[0];
+         *r = *g = *b = *a = p;
+      }
+      return;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      {
+         const float *p = (const float *) src;
+         *r = float_to_ubyte(p[0]);
+         *g = float_to_ubyte(p[1]);
+         *b = float_to_ubyte(p[2]);
+         *a = float_to_ubyte(p[3]);
+      }
+      return;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      {
+         const float *p = (const float *) src;
+         *r = float_to_ubyte(p[0]);
+         *g = float_to_ubyte(p[1]);
+         *b = float_to_ubyte(p[2]);
+         *a = (ubyte) 0xff;
+      }
+      return;
+
+   case PIPE_FORMAT_R32G32_FLOAT:
+      {
+         const float *p = (const float *) src;
+         *r = float_to_ubyte(p[0]);
+         *g = float_to_ubyte(p[1]);
+         *b = *a = (ubyte) 0xff;
+      }
+      return;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      {
+         const float *p = (const float *) src;
+         *r = float_to_ubyte(p[0]);
+         *g = *b = *a = (ubyte) 0xff;
+      }
+      return;
+
+   /* XXX lots more cases to add */
+   default:
+      debug_print_format("gallium: unhandled format in util_unpack_color_ub()",
+                         format);
+      assert(0);
+   }
+}
+ 
+
+
+/**
  * Note rgba outside [0,1] will be clamped for int pixel formats.
  */
 static INLINE void
diff --git a/src/gallium/auxiliary/util/u_rect.c b/src/gallium/auxiliary/util/u_rect.c
index b31ab5415fa..f5619ef791d 100644
--- a/src/gallium/auxiliary/util/u_rect.c
+++ b/src/gallium/auxiliary/util/u_rect.c
@@ -32,6 +32,8 @@
 
 #include "pipe/p_defines.h"
 #include "pipe/p_format.h"
+#include "pipe/p_context.h"
+#include "pipe/p_screen.h"
 #include "util/u_rect.h"
 
 
@@ -148,3 +150,179 @@ pipe_fill_rect(ubyte * dst,
 	 break;
    }
 }
+
+
+
+/**
+ * Fallback function for pipe->surface_copy().
+ * Note: (X,Y)=(0,0) is always the upper-left corner.
+ * if do_flip, flip the image vertically on its way from src rect to dst rect.
+ * XXX should probably put this in new u_surface.c file...
+ */
+void
+util_surface_copy(struct pipe_context *pipe,
+                  boolean do_flip,
+                  struct pipe_surface *dst,
+                  unsigned dst_x, unsigned dst_y,
+                  struct pipe_surface *src,
+                  unsigned src_x, unsigned src_y, 
+                  unsigned w, unsigned h)
+{
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_surface *new_src = NULL, *new_dst = NULL;
+   void *dst_map;
+   const void *src_map;
+
+   assert(dst->block.size == src->block.size);
+   assert(dst->block.width == src->block.width);
+   assert(dst->block.height == src->block.height);
+
+   if ((src->usage & PIPE_BUFFER_USAGE_CPU_READ) == 0) {
+      /* Need to create new src surface which is CPU readable */
+      assert(src->texture);
+      if (!src->texture)
+         return;
+      new_src = screen->get_tex_surface(screen,
+                                        src->texture,
+                                        src->face,
+                                        src->level,
+                                        src->zslice,
+                                        PIPE_BUFFER_USAGE_CPU_READ);
+      src = new_src;
+   }
+
+   if ((dst->usage & PIPE_BUFFER_USAGE_CPU_WRITE) == 0) {
+      /* Need to create new dst surface which is CPU writable */
+      assert(dst->texture);
+      if (!dst->texture)
+         return;
+      new_dst = screen->get_tex_surface(screen,
+                                        dst->texture,
+                                        dst->face,
+                                        dst->level,
+                                        dst->zslice,
+                                        PIPE_BUFFER_USAGE_CPU_WRITE);
+      dst = new_dst;
+   }
+
+   src_map = pipe->screen->surface_map(screen,
+                                       src, PIPE_BUFFER_USAGE_CPU_READ);
+   dst_map = pipe->screen->surface_map(screen,
+                                       dst, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+   assert(src_map);
+   assert(dst_map);
+
+   if (src_map && dst_map) {
+      /* If do_flip, invert src_y position and pass negative src stride */
+      pipe_copy_rect(dst_map,
+                     &dst->block,
+                     dst->stride,
+                     dst_x, dst_y,
+                     w, h,
+                     src_map,
+                     do_flip ? -(int) src->stride : src->stride,
+                     src_x, src_y);
+   }
+
+   pipe->screen->surface_unmap(pipe->screen, src);
+   pipe->screen->surface_unmap(pipe->screen, dst);
+
+   if (new_src)
+      screen->tex_surface_release(screen, &new_src);
+   if (new_dst)
+      screen->tex_surface_release(screen, &new_dst);
+}
+
+
+
+static void *
+get_pointer(struct pipe_surface *dst, void *dst_map, unsigned x, unsigned y)
+{
+   return (char *)dst_map
+      + y / dst->block.height * dst->stride
+      + x / dst->block.width * dst->block.size;
+}
+
+
+#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8))
+
+
+/**
+ * Fallback for pipe->surface_fill() function.
+ * XXX should probably put this in new u_surface.c file...
+ */
+void
+util_surface_fill(struct pipe_context *pipe,
+                  struct pipe_surface *dst,
+                  unsigned dstx, unsigned dsty,
+                  unsigned width, unsigned height, unsigned value)
+{
+   struct pipe_screen *screen = pipe->screen;
+   struct pipe_surface *new_dst = NULL;
+   void *dst_map;
+
+   if ((dst->usage & PIPE_BUFFER_USAGE_CPU_WRITE) == 0) {
+      /* Need to create new dst surface which is CPU writable */
+      assert(dst->texture);
+      if (!dst->texture)
+         return;
+      new_dst = screen->get_tex_surface(screen,
+                                        dst->texture,
+                                        dst->face,
+                                        dst->level,
+                                        dst->zslice,
+                                        PIPE_BUFFER_USAGE_CPU_WRITE);
+      dst = new_dst;
+   }
+
+   dst_map = pipe->screen->surface_map(screen,
+                                       dst, PIPE_BUFFER_USAGE_CPU_WRITE);
+
+   assert(dst_map);
+
+   if (dst_map) {
+      assert(dst->stride > 0);
+
+      switch (dst->block.size) {
+      case 1:
+      case 2:
+      case 4:
+         pipe_fill_rect(dst_map, &dst->block, dst->stride,
+                        dstx, dsty, width, height, value);
+         break;
+      case 8:
+         {
+            /* expand the 4-byte clear value to an 8-byte value */
+            ushort *row = (ushort *) get_pointer(dst, dst_map, dstx, dsty);
+            ushort val0 = UBYTE_TO_USHORT((value >>  0) & 0xff);
+            ushort val1 = UBYTE_TO_USHORT((value >>  8) & 0xff);
+            ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff);
+            ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff);
+            unsigned i, j;
+            val0 = (val0 << 8) | val0;
+            val1 = (val1 << 8) | val1;
+            val2 = (val2 << 8) | val2;
+            val3 = (val3 << 8) | val3;
+            for (i = 0; i < height; i++) {
+               for (j = 0; j < width; j++) {
+                  row[j*4+0] = val0;
+                  row[j*4+1] = val1;
+                  row[j*4+2] = val2;
+                  row[j*4+3] = val3;
+               }
+               row += dst->stride/2;
+            }
+         }
+         break;
+      default:
+         assert(0);
+         break;
+      }
+   }
+
+   pipe->screen->surface_unmap(pipe->screen, dst);
+
+   if (new_dst)
+      screen->tex_surface_release(screen, &new_dst);
+}
diff --git a/src/gallium/auxiliary/util/u_rect.h b/src/gallium/auxiliary/util/u_rect.h
index fba48088641..59e842e16d1 100644
--- a/src/gallium/auxiliary/util/u_rect.h
+++ b/src/gallium/auxiliary/util/u_rect.h
@@ -37,6 +37,9 @@
 
 #include "pipe/p_format.h"
 
+struct pipe_context;
+struct pipe_surface;
+
 
 extern void
 pipe_copy_rect(ubyte * dst, const struct pipe_format_block *block,
@@ -50,5 +53,20 @@ pipe_fill_rect(ubyte * dst, const struct pipe_format_block *block,
                unsigned width, unsigned height, uint32_t value);
 
 
+extern void
+util_surface_copy(struct pipe_context *pipe,
+                  boolean do_flip,
+                  struct pipe_surface *dst,
+                  unsigned dst_x, unsigned dst_y,
+                  struct pipe_surface *src,
+                  unsigned src_x, unsigned src_y, 
+                  unsigned w, unsigned h);
+
+extern void
+util_surface_fill(struct pipe_context *pipe,
+                  struct pipe_surface *dst,
+                  unsigned dstx, unsigned dsty,
+                  unsigned width, unsigned height, unsigned value);
+
 
 #endif /* U_RECT_H */
diff --git a/src/gallium/drivers/trace/tr_stream.h b/src/gallium/auxiliary/util/u_stream.h
index 6111174d6a8..a9d0f0121a6 100644
--- a/src/gallium/drivers/trace/tr_stream.h
+++ b/src/gallium/auxiliary/util/u_stream.h
@@ -28,32 +28,34 @@
 /**
  * @file
  * Cross-platform sequential access stream abstraction.
- * 
- * These are really general purpose file access functions, and might one day
- * be moved into the util module.  
  */
 
-#ifndef TR_STREAM_H
-#define TR_STREAM_H
+#ifndef U_STREAM_H
+#define U_STREAM_H
 
 
 #include "pipe/p_compiler.h"
 
 
-struct trace_stream;
+struct util_stream;
 
 
-struct trace_stream *
-trace_stream_create(const char *filename);
+/**
+ * Create a stream
+ * @param filename relative or absolute path (necessary for windows)  
+ * @param optional maximum file size (0 for a growable size).
+ */
+struct util_stream *
+util_stream_create(const char *filename, size_t max_size);
 
 boolean
-trace_stream_write(struct trace_stream *stream, const void *data, size_t size);
+util_stream_write(struct util_stream *stream, const void *data, size_t size);
 
 void
-trace_stream_flush(struct trace_stream *stream);
+util_stream_flush(struct util_stream *stream);
 
 void
-trace_stream_close(struct trace_stream *stream);
+util_stream_close(struct util_stream *stream);
 
 
-#endif /* TR_STREAM_H */
+#endif /* U_STREAM_H */
diff --git a/src/gallium/drivers/trace/tr_stream_stdc.c b/src/gallium/auxiliary/util/u_stream_stdc.c
index 4c19ec0b243..ca80bef0f3d 100644
--- a/src/gallium/drivers/trace/tr_stream_stdc.c
+++ b/src/gallium/auxiliary/util/u_stream_stdc.c
@@ -32,27 +32,29 @@
 
 #include "pipe/p_config.h"
 
-#if defined(PIPE_OS_LINUX)
+#if defined(PIPE_OS_LINUX) || defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 
 #include <stdio.h>
 
 #include "util/u_memory.h"
 
-#include "tr_stream.h"
+#include "u_stream.h"
 
 
-struct trace_stream 
+struct util_stream 
 {
    FILE *file;
 };
 
 
-struct trace_stream *
-trace_stream_create(const char *filename)
+struct util_stream *
+util_stream_create(const char *filename, size_t max_size)
 {
-   struct trace_stream *stream;
+   struct util_stream *stream;
    
-   stream = CALLOC_STRUCT(trace_stream);
+   (void)max_size;
+   
+   stream = CALLOC_STRUCT(util_stream);
    if(!stream)
       goto error1;
    
@@ -70,7 +72,7 @@ error1:
 
 
 boolean
-trace_stream_write(struct trace_stream *stream, const void *data, size_t size)
+util_stream_write(struct util_stream *stream, const void *data, size_t size)
 {
    if(!stream)
       return FALSE;
@@ -80,7 +82,7 @@ trace_stream_write(struct trace_stream *stream, const void *data, size_t size)
 
 
 void
-trace_stream_flush(struct trace_stream *stream) 
+util_stream_flush(struct util_stream *stream) 
 {
    if(!stream)
       return;
@@ -90,7 +92,7 @@ trace_stream_flush(struct trace_stream *stream)
 
 
 void
-trace_stream_close(struct trace_stream *stream) 
+util_stream_close(struct util_stream *stream) 
 {
    if(!stream)
       return;
diff --git a/src/gallium/drivers/trace/tr_stream_wd.c b/src/gallium/auxiliary/util/u_stream_wd.c
index 704eb15bd71..864489e7755 100644
--- a/src/gallium/drivers/trace/tr_stream_wd.c
+++ b/src/gallium/auxiliary/util/u_stream_wd.c
@@ -40,16 +40,18 @@
 #include "util/u_memory.h"
 #include "util/u_string.h"
 
-#include "tr_stream.h"
+#include "u_stream.h"
 
 
 #define MAP_FILE_SIZE (4*1024*1024)
 
 
-struct trace_stream 
+struct util_stream 
 {
    char filename[MAX_PATH + 1];
    WCHAR wFileName[MAX_PATH + 1];
+   boolean growable;
+   size_t map_size;
    ULONG_PTR iFile;
    char *pMap;
    size_t written;
@@ -58,17 +60,23 @@ struct trace_stream
 
 
 static INLINE boolean
-trace_stream_map(struct trace_stream *stream)
+util_stream_map(struct util_stream *stream)
 {
    ULONG BytesInUnicodeString;
    static char filename[MAX_PATH + 1];
    unsigned filename_len;
 
-   filename_len = util_snprintf(filename,
-                                sizeof(filename),
-                                "\\??\\%s.%04x",
-                                stream->filename,
-                                stream->suffix++);
+   if(stream->growable)
+      filename_len = util_snprintf(filename,
+                                   sizeof(filename),
+                                   "%s.%04x",
+                                   stream->filename,
+                                   stream->suffix++);
+   else
+      filename_len = util_snprintf(filename,
+                                   sizeof(filename),
+                                   "%s",
+                                   stream->filename);
 
    EngMultiByteToUnicodeN(
          stream->wFileName,
@@ -77,11 +85,11 @@ trace_stream_map(struct trace_stream *stream)
          filename,
          filename_len);
    
-   stream->pMap = EngMapFile(stream->wFileName, MAP_FILE_SIZE, &stream->iFile);
+   stream->pMap = EngMapFile(stream->wFileName, stream->map_size, &stream->iFile);
    if(!stream->pMap)
       return FALSE;
    
-   memset(stream->pMap, 0, MAP_FILE_SIZE);
+   memset(stream->pMap, 0, stream->map_size);
    stream->written = 0;
    
    return TRUE;
@@ -89,10 +97,10 @@ trace_stream_map(struct trace_stream *stream)
 
 
 static INLINE void
-trace_stream_unmap(struct trace_stream *stream)
+util_stream_unmap(struct util_stream *stream)
 {
    EngUnmapFile(stream->iFile);
-   if(stream->written < MAP_FILE_SIZE) {
+   if(stream->written < stream->map_size) {
       /* Truncate file size */
       stream->pMap = EngMapFile(stream->wFileName, stream->written, &stream->iFile);
       if(stream->pMap)
@@ -103,18 +111,51 @@ trace_stream_unmap(struct trace_stream *stream)
 }
 
 
-struct trace_stream *
-trace_stream_create(const char *filename)
+static INLINE void
+util_stream_full_qualified_filename(char *dst, size_t size, const char *src)
+{
+   boolean need_drive, need_root;
+   
+   if((('A' <= src[0] && src[0] <= 'Z') || ('a' <= src[0] && src[0] <= 'z')) && src[1] == ':') {
+      need_drive = FALSE;
+      need_root = src[2] == '\\' ? FALSE : TRUE;
+   }
+   else {
+      need_drive = TRUE;
+      need_root = src[0] == '\\' ? FALSE : TRUE;
+   }
+   
+   util_snprintf(dst, size, 
+                 "\\??\\%s%s%s",
+                 need_drive ? "C:" : "",
+                 need_root ? "\\" : "",
+                 src);
+}
+
+
+struct util_stream *
+util_stream_create(const char *filename, size_t max_size)
 {
-   struct trace_stream *stream;
+   struct util_stream *stream;
    
-   stream = CALLOC_STRUCT(trace_stream);
+   stream = CALLOC_STRUCT(util_stream);
    if(!stream)
       goto error1;
    
-   strncpy(stream->filename, filename, sizeof(stream->filename));
+   util_stream_full_qualified_filename(stream->filename,
+                                       sizeof(stream->filename),
+                                       filename);
+   
+   if(max_size) {
+      stream->growable = FALSE;
+      stream->map_size = max_size;
+   }
+   else {
+      stream->growable = TRUE;
+      stream->map_size = MAP_FILE_SIZE;
+   }
    
-   if(!trace_stream_map(stream))
+   if(!util_stream_map(stream))
       goto error2;
    
    return stream;
@@ -127,16 +168,16 @@ error1:
 
 
 static INLINE void
-trace_stream_copy(struct trace_stream *stream, const char *data, size_t size)
+util_stream_copy(struct util_stream *stream, const char *data, size_t size)
 {
-   assert(stream->written + size <= MAP_FILE_SIZE);
+   assert(stream->written + size <= stream->map_size);
    memcpy(stream->pMap + stream->written, data, size);
    stream->written += size;
 }
 
 
 boolean
-trace_stream_write(struct trace_stream *stream, const void *data, size_t size)
+util_stream_write(struct util_stream *stream, const void *data, size_t size)
 {
    if(!stream)
       return FALSE;
@@ -144,37 +185,37 @@ trace_stream_write(struct trace_stream *stream, const void *data, size_t size)
    if(!stream->pMap)
       return FALSE;
    
-   while(stream->written + size > MAP_FILE_SIZE) {
-      size_t step = MAP_FILE_SIZE - stream->written;
-      trace_stream_copy(stream, data, step);
+   while(stream->written + size > stream->map_size) {
+      size_t step = stream->map_size - stream->written;
+      util_stream_copy(stream, data, step);
       data = (const char *)data + step;
       size -= step;
       
-      trace_stream_unmap(stream);
-      if(!trace_stream_map(stream))
+      util_stream_unmap(stream);
+      if(!stream->growable || !util_stream_map(stream))
          return FALSE;
    }
 
-   trace_stream_copy(stream, data, size);
+   util_stream_copy(stream, data, size);
    
    return TRUE;
 }
 
 
 void
-trace_stream_flush(struct trace_stream *stream) 
+util_stream_flush(struct util_stream *stream) 
 {
    (void)stream;
 }
 
 
 void
-trace_stream_close(struct trace_stream *stream) 
+util_stream_close(struct util_stream *stream) 
 {
    if(!stream)
       return;
    
-   trace_stream_unmap(stream);
+   util_stream_unmap(stream);
 
    FREE(stream);
 }
diff --git a/src/gallium/auxiliary/util/u_time.c b/src/gallium/auxiliary/util/u_time.c
index 49dce752895..bf7d1d1c8d5 100644
--- a/src/gallium/auxiliary/util/u_time.c
+++ b/src/gallium/auxiliary/util/u_time.c
@@ -136,6 +136,26 @@ util_time_diff(const struct util_time *t1,
 }
 
 
+
+uint64_t
+util_time_micros( void )
+{
+   struct util_time t1;
+   
+   util_time_get(&t1);
+   
+#if defined(PIPE_OS_LINUX)
+   return t1.tv.tv_usec + t1.tv.tv_sec*1000000LL;
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_DISPLAY) || defined(PIPE_SUBSYSTEM_WINDOWS_USER) || defined(PIPE_SUBSYSTEM_WINDOWS_CE)
+   util_time_get_frequency();
+   return t1.counter*INT64_C(1000000)/frequency;
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_MINIPORT)
+   return t1.counter/10;
+#endif
+}
+
+
+
 /**
  * Compare two time values.
  * 
diff --git a/src/gallium/auxiliary/util/u_time.h b/src/gallium/auxiliary/util/u_time.h
index f9963ce0e27..35d97d16c73 100644
--- a/src/gallium/auxiliary/util/u_time.h
+++ b/src/gallium/auxiliary/util/u_time.h
@@ -74,6 +74,9 @@ util_time_add(const struct util_time *t1,
               int64_t usecs,
               struct util_time *t2);
 
+uint64_t
+util_time_micros( void );
+
 int64_t
 util_time_diff(const struct util_time *t1, 
                const struct util_time *t2);
diff --git a/src/gallium/auxiliary/util/u_timed_winsys.c b/src/gallium/auxiliary/util/u_timed_winsys.c
new file mode 100644
index 00000000000..8beb3b4c885
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_timed_winsys.c
@@ -0,0 +1,346 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+/*
+ * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com>
+ */
+
+#include "pipe/p_winsys.h"
+#include "u_timed_winsys.h"
+#include "util/u_memory.h"
+#include "util/u_time.h"
+
+
+struct timed_winsys {
+   struct pipe_winsys base;
+   struct pipe_winsys *backend;
+   uint64_t last_dump;
+   struct {
+      const char *name_key;
+      double total;
+      unsigned calls;
+   } funcs[13];
+};
+
+
+static struct timed_winsys *timed_winsys( struct pipe_winsys *winsys )
+{
+   return (struct timed_winsys *)winsys;
+}
+
+
+static uint64_t time_start( void )
+{
+   return util_time_micros();
+}
+
+
+static void time_display( struct pipe_winsys *winsys )
+{
+   struct timed_winsys *tws = timed_winsys(winsys);
+   unsigned i;
+   double overall = 0;
+
+   for (i = 0; i < Elements(tws->funcs); i++) {
+      if (tws->funcs[i].name_key) {
+         debug_printf("*** %-25s %5.3fms (%d calls, avg %.3fms)\n", 
+                      tws->funcs[i].name_key,
+                      tws->funcs[i].total,
+                      tws->funcs[i].calls,
+                      tws->funcs[i].total / tws->funcs[i].calls);
+         overall += tws->funcs[i].total;
+         tws->funcs[i].calls = 0;
+         tws->funcs[i].total = 0;
+      }
+   }
+
+   debug_printf("*** %-25s %5.3fms\n", 
+                "OVERALL WINSYS",
+                overall);
+}
+
+static void time_finish( struct pipe_winsys *winsys,
+                         long long startval, 
+                         unsigned idx,
+                         const char *name ) 
+{
+   struct timed_winsys *tws = timed_winsys(winsys);
+   uint64_t endval = util_time_micros();
+   double elapsed = (endval - startval)/1000.0;
+
+   if (endval - startval > 1000LL) 
+      debug_printf("*** %s %.3f\n", name, elapsed );
+
+   assert( tws->funcs[idx].name_key == name ||
+           tws->funcs[idx].name_key == NULL);
+
+   tws->funcs[idx].name_key = name;
+   tws->funcs[idx].total += elapsed;
+   tws->funcs[idx].calls++;
+
+   if (endval - tws->last_dump > 10LL * 1000LL * 1000LL) {
+      time_display( winsys );
+      tws->last_dump = endval;
+   }
+}
+
+
+/* Pipe has no concept of pools, but the psb driver passes a flag that
+ * can be mapped onto pools in the backend.
+ */
+static struct pipe_buffer *
+timed_buffer_create(struct pipe_winsys *winsys, 
+                    unsigned alignment, 
+                    unsigned usage, 
+                    unsigned size )
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   struct pipe_buffer *buf = backend->buffer_create( backend, alignment, usage, size );
+
+   time_finish(winsys, start, 0, __FUNCTION__);
+   
+   return buf;
+}
+
+
+
+
+static struct pipe_buffer *
+timed_user_buffer_create(struct pipe_winsys *winsys,
+                             void *data, 
+                             unsigned bytes) 
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   struct pipe_buffer *buf = backend->user_buffer_create( backend, data, bytes );
+
+   time_finish(winsys, start, 1, __FUNCTION__);
+   
+   return buf;
+}
+
+
+static void *
+timed_buffer_map(struct pipe_winsys *winsys,
+                     struct pipe_buffer *buf,
+                     unsigned flags)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   void *map = backend->buffer_map( backend, buf, flags );
+
+   time_finish(winsys, start, 2, __FUNCTION__);
+   
+   return map;
+}
+
+
+static void
+timed_buffer_unmap(struct pipe_winsys *winsys,
+                       struct pipe_buffer *buf)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   backend->buffer_unmap( backend, buf );
+
+   time_finish(winsys, start, 3, __FUNCTION__);
+}
+
+
+static void
+timed_buffer_destroy(struct pipe_winsys *winsys,
+                         struct pipe_buffer *buf)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   backend->buffer_destroy( backend, buf );
+
+   time_finish(winsys, start, 4, __FUNCTION__);
+}
+
+
+static void
+timed_flush_frontbuffer( struct pipe_winsys *winsys,
+                         struct pipe_surface *surf,
+                         void *context_private)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   backend->flush_frontbuffer( backend, surf, context_private );
+
+   time_finish(winsys, start, 5, __FUNCTION__);
+}
+
+
+
+
+static struct pipe_surface *
+timed_surface_alloc(struct pipe_winsys *winsys)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   struct pipe_surface *surf = backend->surface_alloc( backend );
+
+   time_finish(winsys, start, 6, __FUNCTION__);
+   
+   return surf;
+}
+
+
+
+static int
+timed_surface_alloc_storage(struct pipe_winsys *winsys,
+                              struct pipe_surface *surf,
+                              unsigned width, unsigned height,
+                              enum pipe_format format, 
+                              unsigned flags,
+                              unsigned tex_usage)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   int ret = backend->surface_alloc_storage( backend, surf, width, height, 
+                                             format, flags, tex_usage );
+
+   time_finish(winsys, start, 7, __FUNCTION__);
+   
+   return ret;
+}
+
+
+static void
+timed_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   backend->surface_release( backend, s );
+
+   time_finish(winsys, start, 8, __FUNCTION__);
+}
+
+
+
+static const char *
+timed_get_name( struct pipe_winsys *winsys )
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   const char *ret = backend->get_name( backend );
+
+   time_finish(winsys, start, 9, __FUNCTION__);
+   
+   return ret;
+}
+
+static void
+timed_fence_reference(struct pipe_winsys *winsys,
+                    struct pipe_fence_handle **ptr,
+                    struct pipe_fence_handle *fence)
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   backend->fence_reference( backend, ptr, fence );
+
+   time_finish(winsys, start, 10, __FUNCTION__);
+}
+
+
+static int
+timed_fence_signalled( struct pipe_winsys *winsys,
+                       struct pipe_fence_handle *fence,
+                       unsigned flag )
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   int ret = backend->fence_signalled( backend, fence, flag );
+
+   time_finish(winsys, start, 11, __FUNCTION__);
+   
+   return ret;
+}
+
+static int
+timed_fence_finish( struct pipe_winsys *winsys,
+                     struct pipe_fence_handle *fence,
+                     unsigned flag )
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   uint64_t start = time_start();
+
+   int ret = backend->fence_finish( backend, fence, flag );
+
+   time_finish(winsys, start, 12, __FUNCTION__);
+   
+   return ret;
+}
+
+static void
+timed_winsys_destroy( struct pipe_winsys *winsys )
+{
+   struct pipe_winsys *backend = timed_winsys(winsys)->backend;
+   backend->destroy( backend );
+   FREE(winsys);
+}
+
+
+
+struct pipe_winsys *u_timed_winsys_create( struct pipe_winsys *backend )
+{
+   struct timed_winsys *ws = CALLOC_STRUCT(timed_winsys);
+   
+   ws->base.user_buffer_create = timed_user_buffer_create;
+   ws->base.buffer_map = timed_buffer_map;
+   ws->base.buffer_unmap = timed_buffer_unmap;
+   ws->base.buffer_destroy = timed_buffer_destroy;
+   ws->base.buffer_create = timed_buffer_create;
+   ws->base.flush_frontbuffer = timed_flush_frontbuffer;
+   ws->base.get_name = timed_get_name;
+   ws->base.surface_alloc = timed_surface_alloc;
+   ws->base.surface_alloc_storage = timed_surface_alloc_storage;
+   ws->base.surface_release = timed_surface_release;
+   ws->base.fence_reference = timed_fence_reference;
+   ws->base.fence_signalled = timed_fence_signalled;
+   ws->base.fence_finish = timed_fence_finish;
+   ws->base.destroy = timed_winsys_destroy;
+   
+   ws->backend = backend;
+
+   return &ws->base;
+}
+
diff --git a/src/gallium/auxiliary/util/u_timed_winsys.h b/src/gallium/auxiliary/util/u_timed_winsys.h
new file mode 100644
index 00000000000..542365112d7
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_timed_winsys.h
@@ -0,0 +1,41 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+/*
+ * Authors: Keith Whitwell <keithw-at-tungstengraphics-dot-com>
+ */
+
+
+#ifndef U_TIMED_WINSYS_H
+#define U_TIMED_WINSYS_H
+
+
+struct pipe_winsys;
+struct pipe_winsys *u_timed_winsys_create( struct pipe_winsys *backend );
+
+
+#endif
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 6bace0bb11a..e989d8c2e5c 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -84,7 +84,7 @@
 #define CELL_CMD_BATCH                5
 #define CELL_CMD_RELEASE_VERTS        6
 #define CELL_CMD_STATE_FRAMEBUFFER   10
-#define CELL_CMD_STATE_DEPTH_STENCIL 11
+#define CELL_CMD_STATE_FRAGMENT_OPS  11
 #define CELL_CMD_STATE_SAMPLER       12
 #define CELL_CMD_STATE_TEXTURE       13
 #define CELL_CMD_STATE_VERTEX_INFO   14
@@ -92,9 +92,7 @@
 #define CELL_CMD_STATE_UNIFORMS      16
 #define CELL_CMD_STATE_VS_ARRAY_INFO 17
 #define CELL_CMD_STATE_BIND_VS       18
-#define CELL_CMD_STATE_BLEND         19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
-#define CELL_CMD_STATE_LOGICOP       21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
@@ -106,30 +104,24 @@
 #define CELL_BUFFER_STATUS_USED 20
 
 
+#define CELL_DEBUG_CHECKER  (1 << 0)
+#define CELL_DEBUG_SYNC     (1 << 1)
 
-/**
- */
-struct cell_command_depth_stencil_alpha_test {
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
-   unsigned read_depth;         /**< Flag: should depth be read? */
-   unsigned read_stencil;       /**< Flag: should stencil be read? */
-};
 
 
-/**
- * Upload code to perform framebuffer blend operation
- */
-struct cell_command_blend {
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
-   unsigned read_fb;            /**< Flag: should framebuffer be read? */
-};
+/** Max instructions for doing per-fragment operations */
+#define SPU_MAX_FRAGMENT_OPS_INSTS 64
 
 
-struct cell_command_logicop {
-   uint64_t base;               /**< Effective address of code start. */
-   unsigned size;               /**< Size in bytes of SPE code. */
+/**
+ * Command to specify per-fragment operations state and generated code.
+ */
+struct cell_command_fragment_ops
+{
+   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   struct pipe_depth_stencil_alpha_state dsa;
+   struct pipe_blend_state blend;
+   unsigned code[SPU_MAX_FRAGMENT_OPS_INSTS];
 };
 
 
@@ -169,13 +161,15 @@ struct cell_array_info
 };
 
 
-struct cell_attribute_fetch_code {
+struct cell_attribute_fetch_code
+{
    uint64_t base;
    uint size;
 };
 
 
-struct cell_buffer_range {
+struct cell_buffer_range
+{
    uint64_t base;
    unsigned size;
 };
@@ -263,6 +257,7 @@ struct cell_init_info
 {
    unsigned id;
    unsigned num_spus;
+   unsigned debug_flags;  /**< mask of CELL_DEBUG_x flags */
    struct cell_command *cmd;
 
    /** Buffers for command batches, vertex/index data */
diff --git a/src/gallium/drivers/cell/ppu/Makefile b/src/gallium/drivers/cell/ppu/Makefile
index 0389a9554cf..8699f3f8ec2 100644
--- a/src/gallium/drivers/cell/ppu/Makefile
+++ b/src/gallium/drivers/cell/ppu/Makefile
@@ -5,7 +5,7 @@
 
 
 TOP = ../../../../..
-include $(TOP)/configs/linux-cell
+include $(TOP)/configs/current
 
 
 # This is the "top-level" cell PPU driver code, will get pulled into libGL.so
@@ -25,9 +25,9 @@ SOURCES = \
 	cell_context.c \
 	cell_draw_arrays.c \
 	cell_flush.c \
+	cell_gen_fragment.c \
 	cell_state_derived.c \
 	cell_state_emit.c \
-	cell_state_per_fragment.c \
 	cell_state_shader.c \
 	cell_pipe_state.c \
 	cell_screen.c \
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index f45e5f25b64..16882c01295 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -32,6 +32,13 @@
 
 
 
+/**
+ * Search the buffer pool for an empty/free buffer and return its index.
+ * Buffers are used for storing vertex data, state and commands which
+ * will be sent to the SPUs.
+ * If no empty buffers are available, wait for one.
+ * \return buffer index in [0, CELL_NUM_BUFFERS-1]
+ */
 uint
 cell_get_empty_buffer(struct cell_context *cell)
 {
@@ -74,6 +81,11 @@ cell_get_empty_buffer(struct cell_context *cell)
 }
 
 
+/**
+ * Flush the current batch buffer to the SPUs.
+ * An empty buffer will be found and set as the new current batch buffer
+ * for subsequent commands/data.
+ */
 void
 cell_batch_flush(struct cell_context *cell)
 {
@@ -93,11 +105,11 @@ cell_batch_flush(struct cell_context *cell)
 
    /*
    printf("cell_batch_dispatch: buf %u at %p, size %u\n",
-          batch, &cell->batch_buffer[batch][0], size);
+          batch, &cell->buffer[batch][0], size);
    */
      
    /*
-    * Build "BATCH" command and sent to all SPUs.
+    * Build "BATCH" command and send to all SPUs.
     */
    cmd_word = CELL_CMD_BATCH | (batch << 8) | (size << 16);
 
@@ -120,6 +132,9 @@ cell_batch_flush(struct cell_context *cell)
 }
 
 
+/**
+ * Return the number of bytes free in the current batch buffer.
+ */
 uint
 cell_batch_free_space(const struct cell_context *cell)
 {
@@ -129,7 +144,9 @@ cell_batch_free_space(const struct cell_context *cell)
 
 
 /**
- * Append data to current batch.
+ * Append data to the current batch buffer.
+ * \param data  address of block of bytes to append
+ * \param bytes  size of block of bytes
  */
 void
 cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
@@ -165,6 +182,10 @@ cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
 }
 
 
+/**
+ * Allocate space in the current batch buffer for 'bytes' space.
+ * \return address in batch buffer to put data
+ */
 void *
 cell_batch_alloc(struct cell_context *cell, uint bytes)
 {
@@ -172,6 +193,10 @@ cell_batch_alloc(struct cell_context *cell, uint bytes)
 }
 
 
+/**
+ * Same as \sa cell_batch_alloc, but return an address at a particular
+ * alignment.
+ */
 void *
 cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
                          uint alignment)
@@ -215,3 +240,28 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
 
    return pos;
 }
+
+
+/**
+ * One-time init of batch buffers.
+ */
+void
+cell_init_batch_buffers(struct cell_context *cell)
+{
+   uint spu, buf;
+
+   /* init command, vertex/index buffer info */
+   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
+      cell->buffer_size[buf] = 0;
+
+      /* init batch buffer status values,
+       * mark 0th buffer as used, rest as free.
+       */
+      for (spu = 0; spu < cell->num_spus; spu++) {
+         if (buf == 0)
+            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
+         else
+            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE;
+      }
+   }
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h
index a6eee0a8b18..f74dd600791 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.h
+++ b/src/gallium/drivers/cell/ppu/cell_batch.h
@@ -54,5 +54,8 @@ extern void *
 cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
                          uint alignment);
 
+extern void
+cell_init_batch_buffers(struct cell_context *cell);
+
 
 #endif /* CELL_BATCH_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index a421c95c8e8..c9c0c721bbe 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -35,6 +35,7 @@
 #include <stdint.h>
 #include "pipe/p_inlines.h"
 #include "util/u_memory.h"
+#include "util/u_pack_color.h"
 #include "cell/common.h"
 #include "cell_clear.h"
 #include "cell_context.h"
@@ -44,6 +45,27 @@
 #include "cell_state.h"
 
 
+/**
+ * Convert packed pixel from one format to another.
+ */
+static unsigned
+convert_color(enum pipe_format srcFormat, unsigned srcColor,
+              enum pipe_format dstFormat)
+{
+   ubyte r, g, b, a;
+   unsigned dstColor;
+
+   util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a);
+   util_pack_color_ub(r, g, b, a, dstFormat, &dstColor);
+
+   return dstColor;
+}
+
+
+
+/**
+ * Called via pipe->clear()
+ */
 void
 cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                    unsigned clearValue)
@@ -61,13 +83,21 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
                                               PIPE_BUFFER_USAGE_GPU_WRITE);
 
    if (ps == cell->framebuffer.zsbuf) {
+      /* clear z/stencil buffer */
       surfIndex = 1;
    }
    else {
+      /* clear color buffer */
       surfIndex = 0;
+
+      if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) {
+         clearValue = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue,
+                                    ps->format);
+      }
    }
 
 
+   /* Build a CLEAR command and place it in the current batch buffer */
    {
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 9ff4e86943b..71f1a3049d1 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -43,11 +43,11 @@
 #include "draw/draw_private.h"
 
 #include "cell/common.h"
+#include "cell_batch.h"
 #include "cell_clear.h"
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
 #include "cell_flush.h"
-#include "cell_render.h"
 #include "cell_state.h"
 #include "cell_surface.h"
 #include "cell_spu.h"
@@ -85,12 +85,20 @@ cell_draw_create(struct cell_context *cell)
 }
 
 
+#ifdef DEBUG
+static const struct debug_named_value cell_debug_flags[] = {
+   {"checker", CELL_DEBUG_CHECKER},/**< modulate tile clear color by SPU ID */
+   {"sync", CELL_DEBUG_SYNC},      /**< SPUs do synchronous DMA */
+   {NULL, 0}
+};
+#endif
+
+
 struct pipe_context *
 cell_create_context(struct pipe_screen *screen,
                     struct cell_winsys *cws)
 {
    struct cell_context *cell;
-   uint spu, buf;
 
    /* some fields need to be 16-byte aligned, so align the whole object */
    cell = (struct cell_context*) align_malloc(sizeof(struct cell_context), 16);
@@ -104,15 +112,6 @@ cell_create_context(struct pipe_screen *screen,
    cell->pipe.screen = screen;
    cell->pipe.destroy = cell_destroy_context;
 
-   /* state setters */
-   cell->pipe.set_vertex_buffers = cell_set_vertex_buffers;
-   cell->pipe.set_vertex_elements = cell_set_vertex_elements;
-
-   cell->pipe.draw_arrays = cell_draw_arrays;
-   cell->pipe.draw_elements = cell_draw_elements;
-   cell->pipe.draw_range_elements = cell_draw_range_elements;
-   cell->pipe.set_edgeflags = cell_set_edgeflags;
-
    cell->pipe.clear = cell_clear_surface;
    cell->pipe.flush = cell_flush;
 
@@ -122,20 +121,28 @@ cell_create_context(struct pipe_screen *screen,
    cell->pipe.wait_query = cell_wait_query;
 #endif
 
+   cell_init_draw_functions(cell);
    cell_init_state_functions(cell);
    cell_init_shader_functions(cell);
    cell_init_surface_functions(cell);
    cell_init_texture_functions(cell);
+   cell_init_vertex_functions(cell);
 
    cell->draw = cell_draw_create(cell);
 
    cell_init_vbuf(cell);
+
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
 
    /* convert all points/lines to tris for the time being */
    draw_wide_point_threshold(cell->draw, 0.0);
    draw_wide_line_threshold(cell->draw, 0.0);
 
+   /* get env vars or read config file to get debug flags */
+   cell->debug_flags = debug_get_flags_option("CELL_DEBUG", 
+                                              cell_debug_flags, 
+                                              0 );
+
    /*
     * SPU stuff
     */
@@ -146,20 +153,7 @@ cell_create_context(struct pipe_screen *screen,
 
    cell_start_spus(cell);
 
-   /* init command, vertex/index buffer info */
-   for (buf = 0; buf < CELL_NUM_BUFFERS; buf++) {
-      cell->buffer_size[buf] = 0;
-
-      /* init batch buffer status values,
-       * mark 0th buffer as used, rest as free.
-       */
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         if (buf == 0)
-            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_USED;
-         else
-            cell->buffer_status[spu][buf][0] = CELL_BUFFER_STATUS_FREE;
-      }
-   }
+   cell_init_batch_buffers(cell);
 
    return &cell->pipe;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index f1d1ca89a97..8cec9f45b2e 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -39,8 +39,13 @@
 #include "rtasm/rtasm_ppc_spe.h"
 #include "tgsi/tgsi_scan.h"
 
+
 struct cell_vbuf_render;
 
+
+/**
+ * Cell vertex shader state, subclass of pipe_shader_state.
+ */
 struct cell_vertex_shader_state
 {
    struct pipe_shader_state shader;
@@ -49,6 +54,9 @@ struct cell_vertex_shader_state
 };
 
 
+/**
+ * Cell fragment shader state, subclass of pipe_shader_state.
+ */
 struct cell_fragment_shader_state
 {
    struct pipe_shader_state shader;
@@ -57,7 +65,11 @@ struct cell_fragment_shader_state
 };
 
 
-struct cell_blend_state {
+/**
+ * Cell blend state atom, subclass of pipe_blend_state.
+ */
+struct cell_blend_state
+{
    struct pipe_blend_state base;
 
    /**
@@ -67,17 +79,24 @@ struct cell_blend_state {
 };
 
 
-struct cell_depth_stencil_alpha_state {
-   struct pipe_depth_stencil_alpha_state   base;
+/**
+ * Cell depth/stencil/alpha state atom, subclass of
+ * pipe_depth_stencil_alpha_state.
+ */
+struct cell_depth_stencil_alpha_state
+{
+   struct pipe_depth_stencil_alpha_state base;
 
    /**
     * Generated code to perform alpha, stencil, and depth testing on the SPE
     */
    struct spe_function code;
-
 };
 
 
+/**
+ * Per-context state, subclass of pipe_context.
+ */
 struct cell_context
 {
    struct pipe_context pipe;
@@ -144,6 +163,8 @@ struct cell_context
 
    struct spe_function attrib_fetch;
    unsigned attrib_fetch_offsets[PIPE_MAX_ATTRIBS];
+
+   unsigned debug_flags;
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
index f02dffe1245..880d5353207 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.c
@@ -34,6 +34,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
 
 #include "cell_context.h"
 #include "cell_draw_arrays.h"
@@ -76,14 +77,6 @@ cell_unmap_constant_buffers(struct cell_context *sp)
 }
 
 
-boolean
-cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
-                     unsigned start, unsigned count)
-{
-   return cell_draw_elements(pipe, NULL, 0, mode, start, count);
-}
-
-
 
 /**
  * Draw vertex arrays, with optional indexing.
@@ -92,7 +85,7 @@ cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
  *
  * XXX should the element buffer be specified/bound with a separate function?
  */
-boolean
+static boolean
 cell_draw_range_elements(struct pipe_context *pipe,
                          struct pipe_buffer *indexBuffer,
                          unsigned indexSize,
@@ -116,7 +109,7 @@ cell_draw_range_elements(struct pipe_context *pipe,
     * Map vertex buffers
     */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
-      void *buf = pipe->winsys->buffer_map(pipe->winsys,
+      void *buf = pipe_buffer_map(pipe->screen,
                                            sp->vertex_buffer[i].buffer,
                                            PIPE_BUFFER_USAGE_CPU_READ);
       cell_flush_buffer_range(sp, buf, sp->vertex_buffer[i].buffer->size);
@@ -124,7 +117,7 @@ cell_draw_range_elements(struct pipe_context *pipe,
    }
    /* Map index buffer, if present */
    if (indexBuffer) {
-      void *mapped_indexes = pipe->winsys->buffer_map(pipe->winsys,
+      void *mapped_indexes = pipe_buffer_map(pipe->screen,
                                                       indexBuffer,
                                                       PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_element_buffer(draw, indexSize, mapped_indexes);
@@ -143,11 +136,11 @@ cell_draw_range_elements(struct pipe_context *pipe,
     */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
       draw_set_mapped_vertex_buffer(draw, i, NULL);
-      pipe->winsys->buffer_unmap(pipe->winsys, sp->vertex_buffer[i].buffer);
+      pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer);
    }
    if (indexBuffer) {
       draw_set_mapped_element_buffer(draw, 0, NULL);
-      pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+      pipe_buffer_unmap(pipe->screen, indexBuffer);
    }
 
    /* Note: leave drawing surfaces mapped */
@@ -157,7 +150,7 @@ cell_draw_range_elements(struct pipe_context *pipe,
 }
 
 
-boolean
+static boolean
 cell_draw_elements(struct pipe_context *pipe,
                    struct pipe_buffer *indexBuffer,
                    unsigned indexSize,
@@ -170,10 +163,29 @@ cell_draw_elements(struct pipe_context *pipe,
 }
 
 
+static boolean
+cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
+                     unsigned start, unsigned count)
+{
+   return cell_draw_elements(pipe, NULL, 0, mode, start, count);
+}
+
 
-void
+static void
 cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags)
 {
    struct cell_context *cell = cell_context(pipe);
    draw_set_edgeflags(cell->draw, edgeflags);
 }
+
+
+
+void
+cell_init_draw_functions(struct cell_context *cell)
+{
+   cell->pipe.draw_arrays = cell_draw_arrays;
+   cell->pipe.draw_elements = cell_draw_elements;
+   cell->pipe.draw_range_elements = cell_draw_range_elements;
+   cell->pipe.set_edgeflags = cell_set_edgeflags;
+}
+
diff --git a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
index cd35ec17b4e..148873aa675 100644
--- a/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
+++ b/src/gallium/drivers/cell/ppu/cell_draw_arrays.h
@@ -29,26 +29,8 @@
 #define CELL_DRAW_ARRAYS_H
 
 
-extern boolean
-cell_draw_arrays(struct pipe_context *pipe, unsigned mode,
-                 unsigned start, unsigned count);
-
-extern boolean
-cell_draw_elements(struct pipe_context *pipe,
-                   struct pipe_buffer *indexBuffer,
-                   unsigned indexSize,
-                   unsigned mode, unsigned start, unsigned count);
-
-extern boolean
-cell_draw_range_elements(struct pipe_context *pipe,
-                         struct pipe_buffer *indexBuffer,
-                         unsigned indexSize,
-                         unsigned min_index,
-                         unsigned max_index,
-                         unsigned mode, unsigned start, unsigned count);
-
 extern void
-cell_set_edgeflags(struct pipe_context *pipe, const unsigned *edgeflags);
+cell_init_draw_functions(struct cell_context *cell);
 
 
 #endif /* CELL_DRAW_ARRAYS_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index 3aaf3de6684..6596b720101 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -34,6 +34,9 @@
 #include "draw/draw_context.h"
 
 
+/**
+ * Called via pipe->flush()
+ */
 void
 cell_flush(struct pipe_context *pipe, unsigned flags,
            struct pipe_fence_handle **fence)
@@ -50,16 +53,19 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
       flags |= CELL_FLUSH_WAIT;
 
    draw_flush( cell->draw );
-   cell_flush_int(pipe, flags);
+   cell_flush_int(cell, flags);
 }
 
 
-/** internal flush */
+/**
+ * Cell internal flush function.  Send the current batch buffer to all SPUs.
+ * If flags & CELL_FLUSH_WAIT, do not return until the SPUs are idle.
+ * \param flags  bitmask of flags CELL_FLUSH_WAIT, or zero
+ */
 void
-cell_flush_int(struct pipe_context *pipe, unsigned flags)
+cell_flush_int(struct cell_context *cell, unsigned flags)
 {
    static boolean flushing = FALSE;  /* recursion catcher */
-   struct cell_context *cell = cell_context(pipe);
    uint i;
 
    ASSERT(!flushing);
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.h b/src/gallium/drivers/cell/ppu/cell_flush.h
index 8f0645c4293..509ae6239ac 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.h
+++ b/src/gallium/drivers/cell/ppu/cell_flush.h
@@ -36,7 +36,7 @@ cell_flush(struct pipe_context *pipe, unsigned flags,
            struct pipe_fence_handle **fence);
 
 extern void
-cell_flush_int(struct pipe_context *pipe, unsigned flags);
+cell_flush_int(struct cell_context *cell, unsigned flags);
 
 extern void
 cell_flush_buffer_range(struct cell_context *cell, void *ptr,
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
new file mode 100644
index 00000000000..79a82ef72b5
--- /dev/null
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -0,0 +1,862 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+
+/**
+ * Generate SPU per-fragment code (actually per-quad code).
+ * \author Brian Paul
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_ppc_spe.h"
+#include "cell_context.h"
+#include "cell_gen_fragment.h"
+
+
+
+/** Do extra optimizations? */
+#define OPTIMIZATIONS 1
+
+
+/**
+ * Generate SPE code to perform Z/depth testing.
+ *
+ * \param dsa         Gallium depth/stencil/alpha state to gen code for
+ * \param f           SPE function to append instruction onto.
+ * \param mask_reg    register containing quad/pixel "alive" mask (in/out)
+ * \param ifragZ_reg  register containing integer fragment Z values (in)
+ * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
+ * \param zmask_reg   register containing result of Z test/comparison (out)
+ */
+static void
+gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f,
+               int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
+{
+   ASSERT(dsa->depth.enabled);
+
+   switch (dsa->depth.func) {
+   case PIPE_FUNC_EQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* zmask = (ifragZ == ref) */
+      spe_ceq(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & zmask) */
+      spe_and(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* zmask = (ifragZ > ref) */
+      spe_cgt(f, zmask_reg, ifragZ_reg, ifbZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* zmask = (ref > ifragZ) */
+      spe_cgt(f, zmask_reg, ifbZ_reg, ifragZ_reg);
+      /* mask = (mask & ~zmask) */
+      spe_andc(f, mask_reg, mask_reg, zmask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = {0,0,0,0} */
+      spe_move(f, zmask_reg, mask_reg);  /* zmask = mask */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* mask unchanged */
+      spe_il(f, zmask_reg, ~0);  /* zmask = {~0,~0,~0,~0} */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+   if (dsa->depth.writemask) {
+      /*
+       * If (ztest passed) {
+       *    framebufferZ = fragmentZ;
+       * }
+       * OR,
+       * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
+       */
+      spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+   }
+}
+
+
+/**
+ * Generate SPE code to perform alpha testing.
+ *
+ * \param dsa        Gallium depth/stencil/alpha state to gen code for
+ * \param f          SPE function to append instruction onto.
+ * \param mask_reg   register containing quad/pixel "alive" mask (in/out)
+ * \param fragA_reg  register containing four fragment alpha values (in)
+ */
+static void
+gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
+               struct spe_function *f, int mask_reg, int fragA_reg)
+{
+   int ref_reg = spe_allocate_available_register(f);
+   int amask_reg = spe_allocate_available_register(f);
+
+   ASSERT(dsa->alpha.enabled);
+
+   if ((dsa->alpha.func != PIPE_FUNC_NEVER) &&
+       (dsa->alpha.func != PIPE_FUNC_ALWAYS)) {
+      /* load/splat the alpha reference float value */
+      spe_load_float(f, ref_reg, dsa->alpha.ref);
+   }
+
+   /* emit code to do the alpha comparison, updating 'mask' */
+   switch (dsa->alpha.func) {
+   case PIPE_FUNC_EQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* amask = (fragA == ref) */
+      spe_fceq(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GREATER:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & amask) */
+      spe_and(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL:
+      /* amask = (fragA > ref) */
+      spe_fcgt(f, amask_reg, fragA_reg, ref_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_GEQUAL:
+      /* amask = (ref > fragA) */
+      spe_fcgt(f, amask_reg, ref_reg, fragA_reg);
+      /* mask = (mask & ~amask) */
+      spe_andc(f, mask_reg, mask_reg, amask_reg);
+      break;
+
+   case PIPE_FUNC_NEVER:
+      spe_il(f, mask_reg, 0);  /* mask = [0,0,0,0] */
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* no-op, mask unchanged */
+      break;
+
+   default:
+      ASSERT(0);
+      break;
+   }
+
+#if OPTIMIZATIONS
+   /* if mask == {0,0,0,0} we're all done, return */
+   {
+      /* re-use amask reg here */
+      int tmp_reg = amask_reg;
+      /* tmp[0] = (mask[0] | mask[1] | mask[2] | mask[3]) */
+      spe_orx(f, tmp_reg, mask_reg);
+      /* if tmp[0] == 0 then return from function call */
+      spe_biz(f, tmp_reg, SPE_REG_RA, 0, 0);
+   }
+#endif
+
+   spe_release_register(f, ref_reg);
+   spe_release_register(f, amask_reg);
+}
+
+
+
+/**
+ * Generate SPE code to implement the given blend mode for a quad of pixels.
+ * \param f          SPE function to append instruction onto.
+ * \param fragR_reg  register with fragment red values (float) (in/out)
+ * \param fragG_reg  register with fragment green values (float) (in/out)
+ * \param fragB_reg  register with fragment blue values (float) (in/out)
+ * \param fragA_reg  register with fragment alpha values (float) (in/out)
+ * \param fbRGBA_reg register with packed framebuffer colors (integer) (in)
+ */
+static void
+gen_blend(const struct pipe_blend_state *blend,
+          struct spe_function *f,
+          enum pipe_format color_format,
+          int fragR_reg, int fragG_reg, int fragB_reg, int fragA_reg,
+          int fbRGBA_reg)
+{
+   int term1R_reg = spe_allocate_available_register(f);
+   int term1G_reg = spe_allocate_available_register(f);
+   int term1B_reg = spe_allocate_available_register(f);
+   int term1A_reg = spe_allocate_available_register(f);
+
+   int term2R_reg = spe_allocate_available_register(f);
+   int term2G_reg = spe_allocate_available_register(f);
+   int term2B_reg = spe_allocate_available_register(f);
+   int term2A_reg = spe_allocate_available_register(f);
+
+   int fbR_reg = spe_allocate_available_register(f);
+   int fbG_reg = spe_allocate_available_register(f);
+   int fbB_reg = spe_allocate_available_register(f);
+   int fbA_reg = spe_allocate_available_register(f);
+
+   int one_reg = spe_allocate_available_register(f);
+   int tmp_reg = spe_allocate_available_register(f);
+
+   ASSERT(blend->blend_enable);
+
+   /* Unpack/convert framebuffer colors from four 32-bit packed colors
+    * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
+    * Each 8-bit color component is expanded into a float in [0.0, 1.0].
+    */
+   {
+      int mask_reg = spe_allocate_available_register(f);
+
+      /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
+      spe_fsmbi(f, mask_reg, 0x1111);
+
+      /* XXX there may be more clever ways to implement the following code */
+      switch (color_format) {
+      case PIPE_FORMAT_A8R8G8B8_UNORM:
+         /* fbB = fbB & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 8 */
+         spe_roti(f, fbG_reg, fbG_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 16 */
+         spe_roti(f, fbR_reg, fbR_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbA = fbRGBA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* fbA = fbA >> 24 */
+         spe_roti(f, fbA_reg, fbA_reg, -24);
+         break;
+
+      case PIPE_FORMAT_B8G8R8A8_UNORM:
+         /* fbA = fbA & mask */
+         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbR = fbRGBA & mask */
+         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
+         /* fbR = fbR >> 8 */
+         spe_roti(f, fbR_reg, fbR_reg, -8);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbG = fbRGBA & mask */
+         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
+         /* fbG = fbG >> 16 */
+         spe_roti(f, fbG_reg, fbG_reg, -16);
+         /* mask = mask << 8 */
+         spe_roti(f, mask_reg, mask_reg, 8);
+
+         /* fbB = fbRGBA & mask */
+         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
+         /* fbB = fbB >> 24 */
+         spe_roti(f, fbB_reg, fbB_reg, -24);
+         break;
+
+      default:
+         ASSERT(0);
+      }
+
+      /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+      spe_cuflt(f, fbR_reg, fbR_reg, 8);
+      spe_cuflt(f, fbG_reg, fbG_reg, 8);
+      spe_cuflt(f, fbB_reg, fbB_reg, 8);
+      spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+      spe_release_register(f, mask_reg);
+   }
+
+
+   /*
+    * Compute Src RGB terms
+    */
+   switch (blend->rgb_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1R_reg, fragR_reg);
+      spe_move(f, term1G_reg, fragG_reg);
+      spe_move(f, term1B_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term1R_reg);
+      spe_zero(f, term1G_reg);
+      spe_zero(f, term1B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1R_reg, fragR_reg, fragR_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragG_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1R_reg, fragR_reg, fragA_reg);
+      spe_fm(f, term1G_reg, fragG_reg, fragA_reg);
+      spe_fm(f, term1B_reg, fragB_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Src Alpha term
+    */
+   switch (blend->alpha_src_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term1A_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term1A_reg, fragA_reg, fragA_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest RGB terms
+    */
+   switch (blend->rgb_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2R_reg, fbR_reg);
+      spe_move(f, term2G_reg, fbG_reg);
+      spe_move(f, term2B_reg, fbB_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2R_reg);
+      spe_zero(f, term2G_reg);
+      spe_zero(f, term2B_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_COLOR:
+      spe_fm(f, term2R_reg, fbR_reg, fragR_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragG_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragB_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2R_reg, fbR_reg, fragA_reg);
+      spe_fm(f, term2G_reg, fbG_reg, fragA_reg);
+      spe_fm(f, term2B_reg, fbB_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* term = fb * tmp */
+      spe_fm(f, term2R_reg, fbR_reg, tmp_reg);
+      spe_fm(f, term2G_reg, fbG_reg, tmp_reg);
+      spe_fm(f, term2B_reg, fbB_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Compute Dest Alpha term
+    */
+   switch (blend->alpha_dst_factor) {
+   case PIPE_BLENDFACTOR_ONE:
+      spe_move(f, term2A_reg, fbA_reg);
+      break;
+   case PIPE_BLENDFACTOR_ZERO:
+      spe_zero(f, term2A_reg);
+      break;
+   case PIPE_BLENDFACTOR_SRC_ALPHA:
+      spe_fm(f, term2A_reg, fbA_reg, fragA_reg);
+      break;
+   case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+      /* one = {1.0, 1.0, 1.0, 1.0} */
+      spe_load_float(f, one_reg, 1.0f);
+      /* tmp = one - fragA */
+      spe_fs(f, tmp_reg, one_reg, fragA_reg);
+      /* termA = fbA * tmp */
+      spe_fm(f, term2A_reg, fbA_reg, tmp_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest RGB terms
+    */
+   switch (blend->rgb_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fa(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fa(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragR_reg, term1R_reg, term2R_reg);
+      spe_fs(f, fragG_reg, term1G_reg, term2G_reg);
+      spe_fs(f, fragB_reg, term1B_reg, term2B_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   /*
+    * Combine Src/Dest A term
+    */
+   switch (blend->alpha_func) {
+   case PIPE_BLEND_ADD:
+      spe_fa(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+   case PIPE_BLEND_SUBTRACT:
+      spe_fs(f, fragA_reg, term1A_reg, term2A_reg);
+      break;
+      /* XXX more cases */
+   default:
+      ASSERT(0);
+   }
+
+   spe_release_register(f, term1R_reg);
+   spe_release_register(f, term1G_reg);
+   spe_release_register(f, term1B_reg);
+   spe_release_register(f, term1A_reg);
+
+   spe_release_register(f, term2R_reg);
+   spe_release_register(f, term2G_reg);
+   spe_release_register(f, term2B_reg);
+   spe_release_register(f, term2A_reg);
+
+   spe_release_register(f, fbR_reg);
+   spe_release_register(f, fbG_reg);
+   spe_release_register(f, fbB_reg);
+   spe_release_register(f, fbA_reg);
+
+   spe_release_register(f, one_reg);
+   spe_release_register(f, tmp_reg);
+}
+
+
+static void
+gen_logicop(const struct pipe_blend_state *blend,
+            struct spe_function *f,
+            int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do */
+   /* operate on 32-bit packed pixels, not float colors */
+}
+
+
+static void
+gen_colormask(uint colormask,
+              struct spe_function *f,
+              int fragRGBA_reg, int fbRGBA_reg)
+{
+   /* XXX to-do */
+   /* operate on 32-bit packed pixels, not float colors */
+}
+
+
+
+/**
+ * Generate code to pack a quad of float colors into a four 32-bit integers.
+ *
+ * \param f             SPE function to append instruction onto.
+ * \param color_format  the dest color packing format
+ * \param r_reg         register containing four red values (in/clobbered)
+ * \param g_reg         register containing four green values (in/clobbered)
+ * \param b_reg         register containing four blue values (in/clobbered)
+ * \param a_reg         register containing four alpha values (in/clobbered)
+ * \param rgba_reg      register to store the packed RGBA colors (out)
+ */
+static void
+gen_pack_colors(struct spe_function *f,
+                enum pipe_format color_format,
+                int r_reg, int g_reg, int b_reg, int a_reg,
+                int rgba_reg)
+{
+   /* Convert float[4] in [0.0,1.0] to int[4] in [0,~0], with clamping */
+   spe_cfltu(f, r_reg, r_reg, 32);
+   spe_cfltu(f, g_reg, g_reg, 32);
+   spe_cfltu(f, b_reg, b_reg, 32);
+   spe_cfltu(f, a_reg, a_reg, 32);
+
+   /* Shift the most significant bytes to least the significant positions.
+    * I.e.: reg = reg >> 24
+    */
+   spe_rotmi(f, r_reg, r_reg, -24);
+   spe_rotmi(f, g_reg, g_reg, -24);
+   spe_rotmi(f, b_reg, b_reg, -24);
+   spe_rotmi(f, a_reg, a_reg, -24);
+
+   /* Shift the color bytes according to the surface format */
+   if (color_format == PIPE_FORMAT_A8R8G8B8_UNORM) {
+      spe_roti(f, g_reg, g_reg, 8);   /* green <<= 8 */
+      spe_roti(f, r_reg, r_reg, 16);  /* red <<= 16 */
+      spe_roti(f, a_reg, a_reg, 24);  /* alpha <<= 24 */
+   }
+   else if (color_format == PIPE_FORMAT_B8G8R8A8_UNORM) {
+      spe_roti(f, r_reg, r_reg, 8);   /* red <<= 8 */
+      spe_roti(f, g_reg, g_reg, 16);  /* green <<= 16 */
+      spe_roti(f, b_reg, b_reg, 24);  /* blue <<= 24 */
+   }
+   else {
+      ASSERT(0);
+   }
+
+   /* Merge red, green, blue, alpha registers to make packed RGBA colors.
+    * Eg: after shifting according to color_format we might have:
+    *     R = {0x00ff0000, 0x00110000, 0x00220000, 0x00330000}
+    *     G = {0x0000ff00, 0x00004400, 0x00005500, 0x00006600}
+    *     B = {0x000000ff, 0x00000077, 0x00000088, 0x00000099}
+    *     A = {0xff000000, 0xaa000000, 0xbb000000, 0xcc000000}
+    * OR-ing all those together gives us four packed colors:
+    *  RGBA = {0xffffffff, 0xaa114477, 0xbb225588, 0xcc336699}
+    */
+   spe_or(f, rgba_reg, r_reg, g_reg);
+   spe_or(f, rgba_reg, rgba_reg, b_reg);
+   spe_or(f, rgba_reg, rgba_reg, a_reg);
+}
+
+
+
+
+/**
+ * Generate SPE code to implement the fragment operations (alpha test,
+ * depth test, stencil test, blending, colormask, and final
+ * framebuffer write) as specified by the current context state.
+ *
+ * Logically, this code will be called after running the fragment
+ * shader.  But under some circumstances we could run some of this
+ * code before the fragment shader to cull fragments/quads that are
+ * totally occluded/discarded.
+ *
+ * XXX we only support PIPE_FORMAT_Z24S8_UNORM z/stencil buffer right now.
+ *
+ * See the spu_default_fragment_ops() function to see how the per-fragment
+ * operations would be done with ordinary C code.
+ * The code we generate here though has no branches, is SIMD, etc and
+ * should be much faster.
+ *
+ * \param cell  the rendering context (in)
+ * \param f     the generated function (out)
+ */
+void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f)
+{
+   const struct pipe_depth_stencil_alpha_state *dsa =
+      &cell->depth_stencil->base;
+   const struct pipe_blend_state *blend = &cell->blend->base;
+   const enum pipe_format color_format = cell->framebuffer.cbufs[0]->format;
+
+   /* For SPE function calls: reg $3 = first param, $4 = second param, etc. */
+   const int x_reg = 3;  /* uint */
+   const int y_reg = 4;  /* uint */
+   const int color_tile_reg = 5;  /* tile_t * */
+   const int depth_tile_reg = 6;  /* tile_t * */
+   const int fragZ_reg = 7;   /* vector float */
+   const int fragR_reg = 8;   /* vector float */
+   const int fragG_reg = 9;   /* vector float */
+   const int fragB_reg = 10;  /* vector float */
+   const int fragA_reg = 11;  /* vector float */
+   const int mask_reg = 12;   /* vector uint */
+
+   /* offset of quad from start of tile
+    * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
+    */
+   int quad_offset_reg;
+
+   int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
+   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
+
+   spe_init_func(f, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+   spe_allocate_register(f, x_reg);
+   spe_allocate_register(f, y_reg);
+   spe_allocate_register(f, color_tile_reg);
+   spe_allocate_register(f, depth_tile_reg);
+   spe_allocate_register(f, fragZ_reg);
+   spe_allocate_register(f, fragR_reg);
+   spe_allocate_register(f, fragG_reg);
+   spe_allocate_register(f, fragB_reg);
+   spe_allocate_register(f, fragA_reg);
+   spe_allocate_register(f, mask_reg);
+
+   quad_offset_reg = spe_allocate_available_register(f);
+   fbRGBA_reg = spe_allocate_available_register(f);
+   fbZS_reg = spe_allocate_available_register(f);
+
+   /* compute offset of quad from start of tile, in bytes */
+   {
+      int x2_reg = spe_allocate_available_register(f);
+      int y2_reg = spe_allocate_available_register(f);
+
+      ASSERT(TILE_SIZE == 32);
+
+      spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
+      spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
+      spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
+      spe_a(f, quad_offset_reg, y2_reg, x2_reg);  /* offset = y2 + x2 */
+      spe_shli(f, quad_offset_reg, quad_offset_reg, 4);   /* offset *= 16 */
+
+      spe_release_register(f, x2_reg);
+      spe_release_register(f, y2_reg);
+   }
+
+
+   if (dsa->alpha.enabled) {
+      gen_alpha_test(dsa, f, mask_reg, fragA_reg);
+   }
+
+   if (dsa->depth.enabled || dsa->stencil[0].enabled) {
+      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+      boolean write_depth_stencil;
+
+      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
+      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+
+      /* fetch quad of depth/stencil values from tile at (x,y) */
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      if (dsa->depth.enabled) {
+         /* Extract Z bits from fbZS_reg into fbZ_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            int mask_reg = spe_allocate_available_register(f);
+            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
+            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
+            spe_release_register(f, mask_reg);
+            /* OK, fbZ_reg has four 24-bit Z values now */
+         }
+         else {
+            /* XXX handle other z/stencil formats */
+            ASSERT(0);
+         }
+
+         /* Convert fragZ values from float[4] to uint[4] */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* 24-bit Z values */
+            int scale_reg = spe_allocate_available_register(f);
+
+            /* scale_reg[0,1,2,3] = float(2^24-1) */
+            spe_load_float(f, scale_reg, (float) 0xffffff);
+
+            /* XXX these two instructions might be combined */
+            spe_fm(f, fragZ_reg, fragZ_reg, scale_reg); /* fragZ *= scale */
+            spe_cfltu(f, fragZ_reg, fragZ_reg, 0);  /* fragZ = (int) fragZ */
+
+            spe_release_register(f, scale_reg);
+         }
+         else {
+            /* XXX handle 16-bit Z format */
+            ASSERT(0);
+         }
+      }
+
+      if (dsa->stencil[0].enabled) {
+         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX extract with a shift */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+            /* XXX extract with a mask */
+            ASSERT(0);
+         }
+      }
+
+
+      if (dsa->stencil[0].enabled) {
+         /* XXX this may involve depth testing too */
+         // gen_stencil_test(dsa, f, ... );
+         ASSERT(0);
+      }
+      else if (dsa->depth.enabled) {
+         int zmask_reg = spe_allocate_available_register(f);
+         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         spe_release_register(f, zmask_reg);
+      }
+
+      /* do we need to write Z and/or Stencil back into framebuffer? */
+      write_depth_stencil = (dsa->depth.writemask |
+                             dsa->stencil[0].write_mask |
+                             dsa->stencil[1].write_mask);
+
+      if (write_depth_stencil) {
+         /* Merge latest Z and Stencil values into fbZS_reg.
+          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+          * fbS_reg has four 8-bit Z values in bits [7..0].
+          */
+         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+         }
+         else if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+                  zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+            /* XXX to do */
+            ASSERT(0);
+         }
+         else {
+            /* bad zs_format */
+            ASSERT(0);
+         }
+
+         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+      }
+
+      spe_release_register(f, fbZ_reg);
+      spe_release_register(f, fbS_reg);
+   }
+
+
+   /* Get framebuffer quad/colors.  We'll need these for blending,
+    * color masking, and to obey the quad/pixel mask.
+    * Load: fbRGBA_reg = memory[color_tile + quad_offset]
+    * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
+    * we could skip this load.
+    */
+   spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
+
+
+   if (blend->blend_enable) {
+      gen_blend(blend, f, color_format,
+                fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
+   }
+
+   /*
+    * Write fragment colors to framebuffer/tile.
+    * This involves converting the fragment colors from float[4] to the
+    * tile's specific format and obeying the quad/pixel mask.
+    */
+   {
+      int rgba_reg = spe_allocate_available_register(f);
+
+      /* Pack four float colors as four 32-bit int colors */
+      gen_pack_colors(f, color_format,
+                      fragR_reg, fragG_reg, fragB_reg, fragA_reg,
+                      rgba_reg);
+
+      if (blend->logicop_enable) {
+         gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
+      }
+
+      if (blend->colormask != 0xf) {
+         gen_colormask(blend->colormask, f, rgba_reg, fbRGBA_reg);
+      }
+
+
+      /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
+       * if (mask[i])
+       *    rgba[i] = rgba[i];
+       * else
+       *    rgba[i] = framebuffer[i];
+       */
+      spe_selb(f, rgba_reg, fbRGBA_reg, rgba_reg, mask_reg);
+
+      /* Store updated quad in tile:
+       * memory[color_tile + quad_offset] = rgba_reg;
+       */
+      spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
+
+      spe_release_register(f, rgba_reg);
+   }
+
+   printf("gen_fragment_ops nr instructions: %u\n", f->num_inst);
+
+   spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
+
+
+   spe_release_register(f, fbRGBA_reg);
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, quad_offset_reg);
+}
+
diff --git a/src/gallium/include/pipe/p_pointer.h b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
index 3a1e6be88e7..0ea0fc690c8 100644
--- a/src/gallium/include/pipe/p_pointer.h
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.h
@@ -1,6 +1,6 @@
 /**************************************************************************
  * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -25,71 +25,14 @@
  * 
  **************************************************************************/
 
-#ifndef P_POINTER_H
-#define P_POINTER_H
 
-#include "p_compiler.h"
+#ifndef CELL_GEN_FRAGMENT_H
+#define CELL_GEN_FRAGMENT_H
 
-#ifdef __cplusplus
-extern "C" {
-#endif
 
-static INLINE intptr_t
-pointer_to_intptr( const void *p )
-{
-   union {
-      const void *p;
-      intptr_t i;
-   } pi;
-   pi.p = p;
-   return pi.i;
-}
+extern void
+gen_fragment_function(struct cell_context *cell, struct spe_function *f);
 
-static INLINE void *
-intptr_to_pointer( intptr_t i )
-{
-   union {
-      void *p;
-      intptr_t i;
-   } pi;
-   pi.i = i;
-   return pi.p;
-}
 
-static INLINE uintptr_t
-pointer_to_uintptr( const void *ptr )
-{
-   union {
-      const void *p;
-      uintptr_t u;
-   } pu;
-   pu.p = ptr;
-   return pu.u;
-}
+#endif /* CELL_GEN_FRAGMENT_H */
 
-static INLINE void *
-uintptr_to_pointer( uintptr_t u )
-{
-   union {
-      void *p;
-      uintptr_t u;
-   } pu;
-   pu.u = u;
-   return pu.p;
-}
-
-/**
- * Return a pointer aligned to next multiple of N bytes.
- */
-static INLINE void *
-align_pointer( const void *unaligned, uintptr_t alignment )
-{
-   uintptr_t aligned = (pointer_to_uintptr( unaligned ) + alignment - 1) & ~(alignment - 1);
-   return uintptr_to_pointer( aligned );
-}
-
-#ifdef __cplusplus
-}
-#endif
-
-#endif /* P_POINTER_H */
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index fe5437023b9..e04cf5f274a 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -34,6 +34,7 @@
 #include "pipe/p_inlines.h"
 #include "draw/draw_context.h"
 #include "cell_context.h"
+#include "cell_flush.h"
 #include "cell_state.h"
 #include "cell_texture.h"
 #include "cell_state_per_fragment.h"
@@ -130,8 +131,9 @@ cell_delete_depth_stencil_alpha_state(struct pipe_context *pipe, void *depth)
 }
 
 
-static void cell_set_clip_state( struct pipe_context *pipe,
-			     const struct pipe_clip_state *clip )
+static void
+cell_set_clip_state(struct pipe_context *pipe,
+                    const struct pipe_clip_state *clip)
 {
    struct cell_context *cell = cell_context(pipe);
 
@@ -310,8 +312,21 @@ cell_set_framebuffer_state(struct pipe_context *pipe,
          cell->zsbuf_map = NULL;
       }
 
-      /* update my state */
-      cell->framebuffer = *fb;
+      /* Finish any pending rendering to the current surface before
+       * installing a new surface!
+       */
+      cell_flush_int(cell, CELL_FLUSH_WAIT);
+
+      /* update my state
+       * (this is also where old surfaces will finally get freed)
+       */
+      cell->framebuffer.width = fb->width;
+      cell->framebuffer.height = fb->height;
+      cell->framebuffer.num_cbufs = fb->num_cbufs;
+      for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+         pipe_surface_reference(&cell->framebuffer.cbufs[i], fb->cbufs[i]);
+      }
+      pipe_surface_reference(&cell->framebuffer.zsbuf, fb->zsbuf);
 
       /* map new surfaces */
       if (csurf)
diff --git a/src/gallium/drivers/cell/ppu/cell_spu.c b/src/gallium/drivers/cell/ppu/cell_spu.c
index 973c0b1aa12..9508227e298 100644
--- a/src/gallium/drivers/cell/ppu/cell_spu.c
+++ b/src/gallium/drivers/cell/ppu/cell_spu.c
@@ -26,6 +26,11 @@
  **************************************************************************/
 
 
+/**
+ * Utility/wrappers for communicating with the SPUs.
+ */
+
+
 #include <pthread.h>
 
 #include "cell_spu.h"
@@ -40,6 +45,9 @@ helpful headers:
 */
 
 
+/**
+ * Cell/SPU info that's not per-context.
+ */
 struct cell_global_info cell_global;
 
 
@@ -74,7 +82,11 @@ wait_mbox_message(spe_context_ptr_t ctx)
 }
 
 
-static void *cell_thread_function(void *arg)
+/**
+ * Called by pthread_create() to spawn an SPU thread.
+ */
+static void *
+cell_thread_function(void *arg)
 {
    struct cell_init_info *init = (struct cell_init_info *) arg;
    unsigned entry = SPE_DEFAULT_ENTRY;
@@ -92,7 +104,10 @@ static void *cell_thread_function(void *arg)
 
 
 /**
- * Create the SPU threads
+ * Create the SPU threads.  This is done once during driver initialization.
+ * This involves setting the the "init" message which is sent to each SPU.
+ * The init message specifies an SPU id, total number of SPUs, location
+ * and number of batch buffers, etc.
  */
 void
 cell_start_spus(struct cell_context *cell)
@@ -100,7 +115,6 @@ cell_start_spus(struct cell_context *cell)
    static boolean one_time_init = FALSE;
    uint i, j;
 
-
    if (one_time_init) {
       fprintf(stderr, "PPU: Multiple rendering contexts not yet supported "
 	      "on Cell.\n");
@@ -120,6 +134,7 @@ cell_start_spus(struct cell_context *cell)
    for (i = 0; i < cell->num_spus; i++) {
       cell_global.inits[i].id = i;
       cell_global.inits[i].num_spus = cell->num_spus;
+      cell_global.inits[i].debug_flags = cell->debug_flags;
       cell_global.inits[i].cmd = &cell_global.command[i];
       for (j = 0; j < CELL_NUM_BUFFERS; j++) {
          cell_global.inits[i].buffers[j] = cell->buffer[j];
@@ -137,14 +152,17 @@ cell_start_spus(struct cell_context *cell)
          exit(1);
       }
       
-      pthread_create(&cell_global.spe_threads[i], NULL, &cell_thread_function,
-		     &cell_global.inits[i]);
+      pthread_create(&cell_global.spe_threads[i], /* returned thread handle */
+                     NULL,                        /* pthread attribs */
+                     &cell_thread_function,       /* start routine */
+		     &cell_global.inits[i]);      /* thread argument */
    }
 }
 
 
 /**
  * Tell all the SPUs to stop/exit.
+ * This is done when the driver's exiting / cleaning up.
  */
 void
 cell_spu_exit(struct cell_context *cell)
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index 82580ea35ab..a7771a55a31 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -48,19 +48,17 @@
 #define CELL_NEW_VERTEX_INFO   0x8000
 
 
-void cell_set_vertex_elements(struct pipe_context *,
-                              unsigned count,
-                              const struct pipe_vertex_element *);
+extern void
+cell_update_derived( struct cell_context *softpipe );
 
-void cell_set_vertex_buffers(struct pipe_context *,
-                             unsigned count,
-                             const struct pipe_vertex_buffer *);
 
-void cell_update_derived( struct cell_context *softpipe );
+extern void
+cell_init_shader_functions(struct cell_context *cell);
 
 
-void
-cell_init_shader_functions(struct cell_context *cell);
+extern void
+cell_init_vertex_functions(struct cell_context *cell);
+
 
 #endif /* CELL_STATE_H */
 
diff --git a/src/gallium/drivers/cell/ppu/cell_state_derived.c b/src/gallium/drivers/cell/ppu/cell_state_derived.c
index 8ab938a02aa..efc4f78364b 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_derived.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_derived.c
@@ -35,21 +35,6 @@
 #include "cell_state_emit.h"
 
 
-static int
-find_vs_output(const struct cell_vertex_shader_state *vs,
-               uint semantic_name,
-               uint semantic_index)
-{
-   uint i;
-   for (i = 0; i < vs->info.num_outputs; i++) {
-      if (vs->info.output_semantic_name[i] == semantic_name &&
-          vs->info.output_semantic_index[i] == semantic_index)
-         return i;
-   }
-   return -1;
-}
-
-
 /**
  * Determine how to map vertex program outputs to fragment program inputs.
  * Basically, this will be used when computing the triangle interpolation
@@ -58,7 +43,6 @@ find_vs_output(const struct cell_vertex_shader_state *vs,
 static void
 calculate_vertex_layout( struct cell_context *cell )
 {
-   const struct cell_vertex_shader_state *vs = cell->vs;
    const struct cell_fragment_shader_state *fs = cell->fs;
    const enum interp_mode colorInterp
       = cell->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
@@ -82,7 +66,7 @@ calculate_vertex_layout( struct cell_context *cell )
    vinfo->num_attribs = 0;
 
    /* we always want to emit vertex pos */
-   src = find_vs_output(vs, TGSI_SEMANTIC_POSITION, 0);
+   src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_POSITION, 0);
    assert(src >= 0);
    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_POS, src);
 
@@ -98,14 +82,14 @@ calculate_vertex_layout( struct cell_context *cell )
          break;
 
       case TGSI_SEMANTIC_COLOR:
-         src = find_vs_output(vs, TGSI_SEMANTIC_COLOR, 
-                              fs->info.input_semantic_index[i]);
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_COLOR, 
+                                   fs->info.input_semantic_index[i]);
          assert(src >= 0);
          draw_emit_vertex_attr(vinfo, EMIT_4F, colorInterp, src);
          break;
 
       case TGSI_SEMANTIC_FOG:
-         src = find_vs_output(vs, TGSI_SEMANTIC_FOG, 0);
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_FOG, 0);
 #if 1
          if (src < 0) /* XXX temp hack, try demos/fogcoord.c with this */
             src = 0;
@@ -116,7 +100,7 @@ calculate_vertex_layout( struct cell_context *cell )
 
       case TGSI_SEMANTIC_GENERIC:
          /* this includes texcoords and varying vars */
-         src = find_vs_output(vs, TGSI_SEMANTIC_GENERIC,
+         src = draw_find_vs_output(cell->draw, TGSI_SEMANTIC_GENERIC,
                               fs->info.input_semantic_index[i]);
          assert(src >= 0);
          draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, src);
@@ -163,6 +147,9 @@ compute_cliprect(struct cell_context *sp)
 
 
 
+/**
+ * Update derived state, send current state to SPUs prior to rendering.
+ */
 void cell_update_derived( struct cell_context *cell )
 {
    if (cell->dirty & (CELL_NEW_RASTERIZER |
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 9d88c1cf3d2..180b89c1f66 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -27,6 +27,7 @@
 
 #include "util/u_memory.h"
 #include "cell_context.h"
+#include "cell_gen_fragment.h"
 #include "cell_state.h"
 #include "cell_state_emit.h"
 #include "cell_state_per_fragment.h"
@@ -47,27 +48,13 @@ emit_state_cmd(struct cell_context *cell, uint cmd,
 }
 
 
-
+/**
+ * For state marked as 'dirty', construct a state-update command block
+ * and insert it into the current batch buffer.
+ */
 void
 cell_emit_state(struct cell_context *cell)
 {
-   if (cell->dirty & (CELL_NEW_FRAMEBUFFER | CELL_NEW_BLEND)) {
-      struct cell_command_logicop logicop;
-
-      if (cell->logic_op.store != NULL) {
-	 spe_release_func(& cell->logic_op);
-      }
-
-      cell_generate_logic_op(& cell->logic_op,
-			     & cell->blend->base,
-			     cell->framebuffer.cbufs[0]);
-
-      logicop.base = (intptr_t) cell->logic_op.store;
-      logicop.size = 64 * 4;
-      emit_state_cmd(cell, CELL_CMD_STATE_LOGICOP, &logicop,
-		     sizeof(logicop));
-   }
-
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
@@ -80,44 +67,33 @@ cell_emit_state(struct cell_context *cell)
       fb->depth_format = zbuf ? zbuf->format : PIPE_FORMAT_NONE;
       fb->width = cell->framebuffer.width;
       fb->height = cell->framebuffer.height;
+#if 0
+      printf("EMIT color format %s\n", pf_name(fb->color_format));
+      printf("EMIT depth format %s\n", pf_name(fb->depth_format));
+#endif
    }
 
-   if (cell->dirty & CELL_NEW_BLEND) {
-      struct cell_command_blend blend;
 
-      if (cell->blend != NULL) {
-         blend.base = (intptr_t) cell->blend->code.store;
-         blend.size = (char *) cell->blend->code.csr
-             - (char *) cell->blend->code.store;
-         blend.read_fb = TRUE;
-      } else {
-         blend.base = 0;
-         blend.size = 0;
-         blend.read_fb = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_BLEND, &blend, sizeof(blend));
-   }
-
-   if (cell->dirty & CELL_NEW_DEPTH_STENCIL) {
-      struct cell_command_depth_stencil_alpha_test dsat;
-      
-
-      if (cell->depth_stencil != NULL) {
-	 dsat.base = (intptr_t) cell->depth_stencil->code.store;
-	 dsat.size = (char *) cell->depth_stencil->code.csr
-	     - (char *) cell->depth_stencil->code.store;
-	 dsat.read_depth = TRUE;
-	 dsat.read_stencil = FALSE;
-      } else {
-	 dsat.base = 0;
-	 dsat.size = 0;
-	 dsat.read_depth = FALSE;
-	 dsat.read_stencil = FALSE;
-      }
-
-      emit_state_cmd(cell, CELL_CMD_STATE_DEPTH_STENCIL, &dsat,
-		     sizeof(dsat));
+   if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
+                      CELL_NEW_DEPTH_STENCIL |
+                      CELL_NEW_BLEND)) {
+      /* XXX we don't want to always do codegen here.  We should have
+       * a hash/lookup table to cache previous results...
+       */
+      struct cell_command_fragment_ops *fops
+            = cell_batch_alloc(cell, sizeof(*fops));
+      struct spe_function spe_code;
+
+      /* generate new code */
+      gen_fragment_function(cell, &spe_code);
+      /* put the new code into the batch buffer */
+      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(&fops->code, spe_code.store,
+             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+      fops->dsa = cell->depth_stencil->base;
+      fops->blend = cell->blend->base;
+      /* free codegen buffer */
+      spe_release_func(&spe_code);
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
@@ -157,7 +133,8 @@ cell_emit_state(struct cell_context *cell)
       emit_state_cmd(cell, CELL_CMD_STATE_VERTEX_INFO,
                      &cell->vertex_info, sizeof(struct vertex_info));
    }
-   
+
+#if 0
    if (cell->dirty & CELL_NEW_VS) {
       const struct draw_context *const draw = cell->draw;
       struct cell_shader_info info;
@@ -170,7 +147,7 @@ cell_emit_state(struct cell_context *cell)
       info.immediates = (uintptr_t) draw->vs.machine.Imms;
       info.num_immediates = draw->vs.machine.ImmLimit / 4;
 
-      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS,
-		     & info, sizeof(info));
+      emit_state_cmd(cell, CELL_CMD_STATE_BIND_VS, &info, sizeof(info));
    }
+#endif
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
index 53ae3aa50e7..78cb446c14a 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_per_fragment.c
@@ -132,9 +132,9 @@ emit_alpha_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to perform Z testing.  Four Z values are tested at once.
  * \param dsa        Current depth-test state
  * \param f          Function to which code should be appended
- * \param m          Mask of allocated / free SPE registers
  * \param mask       Index of register to contain depth-pass mask
  * \param stored     Index of register containing values from depth buffer
  * \param calculated Index of register containing per-fragment depth values
@@ -198,6 +198,7 @@ emit_depth_test(struct pipe_depth_stencil_alpha_state *dsa,
 
 
 /**
+ * Generate code to apply the stencil operation (after testing).
  * \note Emits a maximum of 5 instructions.
  *
  * \warning
@@ -222,9 +223,13 @@ emit_stencil_op(struct spe_function *f,
       spe_il(f, result, ref);
       break;
    case PIPE_STENCIL_OP_INCR:
+      /* clamp = [0xff, 0xff, 0xff, 0xff] */
       spe_il(f, clamp, 0x0ff);
+      /* result[i] = in[i] + 1 */
       spe_ai(f, result, in, 1);
+      /* clamp_mask[i] = (result[i] > 0xff) */
       spe_clgti(f, clamp_mask, result, 0x0ff);
+      /* result[i] = clamp_mask[i] ? clamp[i] : result[i] */
       spe_selb(f, result, result, clamp, clamp_mask);
       break;
    case PIPE_STENCIL_OP_DECR:
@@ -259,10 +264,10 @@ emit_stencil_op(struct spe_function *f,
 
 
 /**
+ * Generate code to do stencil test.  Four pixels are tested at once.
  * \param dsa        Depth / stencil test state
  * \param face       0 for front face, 1 for back face
  * \param f          Function to append instructions to
- * \param reg_mask   Mask of allocated registers
  * \param mask       Register containing mask of fragments passing the
  *                   alpha test
  * \param depth_mask Register containing mask of fragments passing the
@@ -310,13 +315,14 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
 
    switch (dsa->stencil[face].func) {
    case PIPE_FUNC_NEVER:
-      spe_il(f, stencil_mask, 0);
+      spe_il(f, stencil_mask, 0);   /* stencil_mask[0..3] = [0,0,0,0] */
       break;
 
    case PIPE_FUNC_NOTEQUAL:
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_EQUAL:
+      /* stencil_mask[i] = (stored[i] == ref) */
       spe_ceqi(f, stencil_mask, stored, ref);
       break;
 
@@ -324,6 +330,8 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GREATER:
+      complement = TRUE;
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
       break;
 
@@ -331,8 +339,11 @@ emit_stencil_test(struct pipe_depth_stencil_alpha_state *dsa,
       complement = TRUE;
       /* FALLTHROUGH */
    case PIPE_FUNC_GEQUAL:
+      /* stencil_mask[i] = (stored[i] > ref) */
       spe_clgti(f, stencil_mask, stored, ref);
+      /* tmp[i] = (stored[i] == ref) */
       spe_ceqi(f, tmp, stored, ref);
+      /* stencil_mask[i] = stencil_mask[i] | tmp[i] */
       spe_or(f, stencil_mask, stencil_mask, tmp);
       break;
 
@@ -461,7 +472,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
     * + 25 (front stencil) + 25 (back stencil) + 4 = 63 instructions.  Round
     * up to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Allocate registers for the function's input parameters.  Cleverly (and
@@ -540,7 +551,7 @@ cell_generate_depth_stencil_test(struct cell_depth_stencil_alpha_state *cdsa)
          spe_selb(f, depth, depth, zvals, mask);
    }
 
-   spe_bi(f, 0, 0, 0);
+   spe_bi(f, 0, 0, 0);  /* return from function call */
 
 
 #if 0
@@ -956,7 +967,7 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
     * + 4 (fragment mask) + 1 (return) = 55 instlructions.  Round up to 64 to
     * make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    const int frag[4] = {
@@ -1144,9 +1155,10 @@ cell_generate_alpha_blend(struct cell_blend_state *cb)
 }
 
 
-int PC_OFFSET(const struct spe_function *f, const void *d)
+static int
+PC_OFFSET(const struct spe_function *f, const void *d)
 {
-   const intptr_t pc = (intptr_t) f->csr;
+   const intptr_t pc = (intptr_t) &f->store[f->num_inst];
    const intptr_t ea = ~0x0f & (intptr_t) d;
 
    return (ea - pc) >> 2;
@@ -1178,7 +1190,7 @@ cell_generate_logic_op(struct spe_function *f,
     * bytes (equiv. to 8 instructions) are needed for data storage.  Round up
     * to 64 to make it a happy power-of-two.
     */
-   spe_init_func(f, 4 * 64);
+   spe_init_func(f, SPE_INST_SIZE * 64);
 
 
    /* Pixel colors in framebuffer format in AoS layout.
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 86bcad05e9e..97e44eeb1a4 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -53,7 +53,10 @@ cell_vertex_shader_state(void *shader)
 }
 
 
-
+/**
+ * Create fragment shader state.
+ * Called via pipe->create_fs_state()
+ */
 static void *
 cell_create_fs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
@@ -77,6 +80,9 @@ cell_create_fs_state(struct pipe_context *pipe,
 }
 
 
+/**
+ * Called via pipe->bind_fs_state()
+ */
 static void
 cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 {
@@ -88,6 +94,9 @@ cell_bind_fs_state(struct pipe_context *pipe, void *fs)
 }
 
 
+/**
+ * Called via pipe->delete_fs_state()
+ */
 static void
 cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 {
@@ -98,6 +107,10 @@ cell_delete_fs_state(struct pipe_context *pipe, void *fs)
 }
 
 
+/**
+ * Create vertex shader state.
+ * Called via pipe->create_vs_state()
+ */
 static void *
 cell_create_vs_state(struct pipe_context *pipe,
                      const struct pipe_shader_state *templ)
@@ -128,6 +141,9 @@ cell_create_vs_state(struct pipe_context *pipe,
 }
 
 
+/**
+ * Called via pipe->bind_vs_state()
+ */
 static void
 cell_bind_vs_state(struct pipe_context *pipe, void *vs)
 {
@@ -142,6 +158,9 @@ cell_bind_vs_state(struct pipe_context *pipe, void *vs)
 }
 
 
+/**
+ * Called via pipe->delete_vs_state()
+ */
 static void
 cell_delete_vs_state(struct pipe_context *pipe, void *vs)
 {
@@ -154,6 +173,9 @@ cell_delete_vs_state(struct pipe_context *pipe, void *vs)
 }
 
 
+/**
+ * Called via pipe->set_constant_buffer()
+ */
 static void
 cell_set_constant_buffer(struct pipe_context *pipe,
                          uint shader, uint index,
@@ -166,7 +188,7 @@ cell_set_constant_buffer(struct pipe_context *pipe,
    assert(index == 0);
 
    /* note: reference counting */
-   pipe_buffer_reference(ws,
+   winsys_buffer_reference(ws,
                         &cell->constants[shader].buffer,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
diff --git a/src/gallium/drivers/cell/ppu/cell_state_vertex.c b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
index 114684c2a33..fbe55c84721 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_vertex.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_vertex.c
@@ -35,7 +35,7 @@
 #include "draw/draw_context.h"
 
 
-void
+static void
 cell_set_vertex_elements(struct pipe_context *pipe,
                          unsigned count,
                          const struct pipe_vertex_element *elements)
@@ -53,7 +53,7 @@ cell_set_vertex_elements(struct pipe_context *pipe,
 }
 
 
-void
+static void
 cell_set_vertex_buffers(struct pipe_context *pipe,
                         unsigned count,
                         const struct pipe_vertex_buffer *buffers)
@@ -69,3 +69,11 @@ cell_set_vertex_buffers(struct pipe_context *pipe,
 
    draw_set_vertex_buffers(cell->draw, count, buffers);
 }
+
+
+void
+cell_init_vertex_functions(struct cell_context *cell)
+{
+   cell->pipe.set_vertex_buffers = cell_set_vertex_buffers;
+   cell->pipe.set_vertex_elements = cell_set_vertex_elements;
+}
diff --git a/src/gallium/drivers/cell/ppu/cell_surface.c b/src/gallium/drivers/cell/ppu/cell_surface.c
index d9e3b510dc0..732c64082ef 100644
--- a/src/gallium/drivers/cell/ppu/cell_surface.c
+++ b/src/gallium/drivers/cell/ppu/cell_surface.c
@@ -25,108 +25,13 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-#include "pipe/p_winsys.h"
-#include "util/u_memory.h"
 #include "util/u_rect.h"
-#include "util/u_tile.h"
-
 #include "cell_context.h"
-#include "cell_surface.h"
-
-
-static void
-cell_surface_copy(struct pipe_context *pipe,
-                  boolean do_flip,
-                  struct pipe_surface *dst,
-                  unsigned dstx, unsigned dsty,
-                  struct pipe_surface *src,
-                  unsigned srcx, unsigned srcy,
-                  unsigned width, unsigned height)
-{
-   assert( dst->cpp == src->cpp );
-
-   pipe_copy_rect(pipe_surface_map(dst, PIPE_BUFFER_USAGE_CPU_WRITE),
-                  &dst->block,
-                  dst->stride,
-                  dstx, dsty,
-                  width, height,
-                  pipe_surface_map(src, PIPE_BUFFER_USAGE_CPU_READ),
-                  do_flip ? -src->stride : src->stride,
-                  srcx, do_flip ? height - 1 - srcy : srcy);
-
-   pipe_surface_unmap(src);
-   pipe_surface_unmap(dst);
-}
-
-
-static void *
-get_pointer(struct pipe_surface *dst, void *dst_map, unsigned x, unsigned y)
-{
-   return (char *)dst_map + y / dst->block.height * dst->stride + x / dst->block.width * dst->block.size;
-}
-
-
-#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8))
-
-
-/**
- * Fill a rectangular sub-region.  Need better logic about when to
- * push buffers into AGP - will currently do so whenever possible.
- */
-static void
-cell_surface_fill(struct pipe_context *pipe,
-                  struct pipe_surface *dst,
-                  unsigned dstx, unsigned dsty,
-                  unsigned width, unsigned height, unsigned value)
-{
-   unsigned i, j;
-   void *dst_map = pipe_surface_map(dst, PIPE_BUFFER_USAGE_CPU_WRITE);
-
-   assert(dst->stride > 0);
-
-   switch (dst->block.size) {
-   case 1:
-   case 2:
-   case 4:
-      pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value);
-      break;
-   case 8:
-      {
-         /* expand the 4-byte clear value to an 8-byte value */
-         ushort *row = (ushort *) get_pointer(dst, dst_map, dstx, dsty);
-         ushort val0 = UBYTE_TO_USHORT((value >>  0) & 0xff);
-         ushort val1 = UBYTE_TO_USHORT((value >>  8) & 0xff);
-         ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff);
-         ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff);
-         val0 = (val0 << 8) | val0;
-         val1 = (val1 << 8) | val1;
-         val2 = (val2 << 8) | val2;
-         val3 = (val3 << 8) | val3;
-         for (i = 0; i < height; i++) {
-            for (j = 0; j < width; j++) {
-               row[j*4+0] = val0;
-               row[j*4+1] = val1;
-               row[j*4+2] = val2;
-               row[j*4+3] = val3;
-            }
-            row += dst->stride/2;
-         }
-      }
-      break;
-   default:
-      assert(0);
-      break;
-   }
-
-   pipe_surface_unmap( dst );
-}
 
 
 void
 cell_init_surface_functions(struct cell_context *cell)
 {
-   cell->pipe.surface_copy = cell_surface_copy;
-   cell->pipe.surface_fill = cell_surface_fill;
+   cell->pipe.surface_copy = util_surface_copy;
+   cell->pipe.surface_fill = util_surface_fill;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index 5a0942bbd6e..b6590dfb86e 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -63,19 +63,30 @@ cell_texture_layout(struct cell_texture * spt)
    spt->buffer_size = 0;
 
    for ( level = 0 ; level <= pt->last_level ; level++ ) {
+      unsigned size;
+      unsigned w_tile, h_tile;
+
+      /* width, height, rounded up to tile size */
+      w_tile = align(width, TILE_SIZE);
+      h_tile = align(height, TILE_SIZE);
+
       pt->width[level] = width;
       pt->height[level] = height;
       pt->depth[level] = depth;
-      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, width);  
-      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, height);  
+      pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);  
+      pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);  
 
       spt->stride[level] = pt->nblocksx[level] * pt->block.size;
 
       spt->level_offset[level] = spt->buffer_size;
 
-      spt->buffer_size += (pt->nblocksy[level] *
-			  ((pt->target == PIPE_TEXTURE_CUBE) ? 6 : depth) *
-                           pt->nblocksx[level] * pt->block.size);
+      size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
+      if (pt->target == PIPE_TEXTURE_CUBE)
+         size *= 6;
+      else
+         size *= depth;
+
+      spt->buffer_size += size;
 
       width  = minify(width);
       height = minify(height);
@@ -85,8 +96,8 @@ cell_texture_layout(struct cell_texture * spt)
 
 
 static struct pipe_texture *
-cell_texture_create_screen(struct pipe_screen *screen,
-                           const struct pipe_texture *templat)
+cell_texture_create(struct pipe_screen *screen,
+                    const struct pipe_texture *templat)
 {
    struct pipe_winsys *ws = screen->winsys;
    struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
@@ -113,8 +124,8 @@ cell_texture_create_screen(struct pipe_screen *screen,
 
 
 static void
-cell_texture_release_screen(struct pipe_screen *screen,
-                            struct pipe_texture **pt)
+cell_texture_release(struct pipe_screen *screen,
+                     struct pipe_texture **pt)
 {
    if (!*pt)
       return;
@@ -130,7 +141,7 @@ cell_texture_release_screen(struct pipe_screen *screen,
       DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
       */
 
-      pipe_buffer_reference(screen->winsys, &spt->buffer, NULL);
+      pipe_buffer_reference(screen, &spt->buffer, NULL);
 
       FREE(spt);
    }
@@ -138,6 +149,7 @@ cell_texture_release_screen(struct pipe_screen *screen,
 }
 
 
+#if 0
 static void
 cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
                     uint face, uint levelsMask)
@@ -145,13 +157,14 @@ cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
    /* XXX TO DO:  re-tile the texture data ... */
 
 }
+#endif
 
 
 static struct pipe_surface *
-cell_get_tex_surface_screen(struct pipe_screen *screen,
-                            struct pipe_texture *pt,
-                            unsigned face, unsigned level, unsigned zslice,
-                            unsigned usage)
+cell_get_tex_surface(struct pipe_screen *screen,
+                     struct pipe_texture *pt,
+                     unsigned face, unsigned level, unsigned zslice,
+                     unsigned usage)
 {
    struct pipe_winsys *ws = screen->winsys;
    struct cell_texture *spt = cell_texture(pt);
@@ -161,7 +174,7 @@ cell_get_tex_surface_screen(struct pipe_screen *screen,
    if (ps) {
       assert(ps->refcount);
       assert(ps->winsys);
-      pipe_buffer_reference(ws, &ps->buffer, spt->buffer);
+      winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
       ps->format = pt->format;
       ps->block = pt->block;
       ps->width = pt->width[level];
@@ -174,12 +187,17 @@ cell_get_tex_surface_screen(struct pipe_screen *screen,
 
       /* XXX may need to override usage flags (see sp_texture.c) */
 
+      pipe_texture_reference(&ps->texture, pt); 
+      ps->face = face;
+      ps->level = level;
+      ps->zslice = zslice;
 
       if (pt->target == PIPE_TEXTURE_CUBE || pt->target == PIPE_TEXTURE_3D) {
 	 ps->offset += ((pt->target == PIPE_TEXTURE_CUBE) ? face : zslice) *
 		       ps->nblocksy *
 		       ps->stride;
-      } else {
+      }
+      else {
 	 assert(face == 0);
 	 assert(zslice == 0);
       }
@@ -189,6 +207,11 @@ cell_get_tex_surface_screen(struct pipe_screen *screen,
 
 
 
+/**
+ * Copy tile data from linear layout to tiled layout.
+ * XXX this should be rolled into the future surface-creation code.
+ * XXX also need "untile" code...
+ */
 static void
 tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
 {
@@ -219,6 +242,7 @@ tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
 
 /**
  * Convert linear texture image data to tiled format for SPU usage.
+ * XXX recast this in terms of pipe_surfaces (aka texture views).
  */
 static void
 cell_tile_texture(struct cell_context *cell,
@@ -285,6 +309,21 @@ cell_update_texture_mapping(struct cell_context *cell)
 }
 
 
+static void 
+cell_tex_surface_release(struct pipe_screen *screen, 
+                         struct pipe_surface **s)
+{
+   /* Effectively do the texture_update work here - if texture images
+    * needed post-processing to put them into hardware layout, this is
+    * where it would happen.  For softpipe, nothing to do.
+    */
+   assert ((*s)->texture);
+   pipe_texture_reference(&(*s)->texture, NULL); 
+
+   screen->winsys->surface_release(screen->winsys, s);
+}
+
+
 static void *
 cell_surface_map( struct pipe_screen *screen,
                   struct pipe_surface *surface,
@@ -297,7 +336,7 @@ cell_surface_map( struct pipe_screen *screen,
       return NULL;
    }
 
-   map = screen->winsys->buffer_map( screen->winsys, surface->buffer, flags );
+   map = pipe_buffer_map( screen, surface->buffer, flags );
    if (map == NULL)
       return NULL;
 
@@ -323,7 +362,7 @@ static void
 cell_surface_unmap(struct pipe_screen *screen,
                    struct pipe_surface *surface)
 {
-   screen->winsys->buffer_unmap( screen->winsys, surface->buffer );
+   pipe_buffer_unmap( screen, surface->buffer );
 }
 
 
@@ -333,12 +372,15 @@ cell_init_texture_functions(struct cell_context *cell)
    /*cell->pipe.texture_update = cell_texture_update;*/
 }
 
+
 void
 cell_init_screen_texture_funcs(struct pipe_screen *screen)
 {
-   screen->texture_create = cell_texture_create_screen;
-   screen->texture_release = cell_texture_release_screen;
-   screen->get_tex_surface = cell_get_tex_surface_screen;
+   screen->texture_create = cell_texture_create;
+   screen->texture_release = cell_texture_release;
+
+   screen->get_tex_surface = cell_get_tex_surface;
+   screen->tex_surface_release = cell_tex_surface_release;
 
    screen->surface_map = cell_surface_map;
    screen->surface_unmap = cell_surface_unmap;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index e4230c7a5ff..aa63435b934 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -26,6 +26,11 @@
  **************************************************************************/
 
 /**
+ * Vertex buffer code.  The draw module transforms vertices to window
+ * coords, etc. and emits the vertices into buffer supplied by this module.
+ * When a vertex buffer is full, or we flush, we'll send the vertex data
+ * to the SPUs.
+ *
  * Authors
  *  Brian Paul
  */
@@ -113,7 +118,7 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
    }
 
    cvbr->vertex_buf = ~0;
-   cell_flush_int(&cell->pipe, 0x0);
+   cell_flush_int(cell, 0x0);
 
    assert(vertices == cvbr->vertex_buffer);
    cvbr->vertex_buffer = NULL;
@@ -121,12 +126,13 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
 
 
 
-static void
+static boolean
 cell_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct cell_vbuf_render *cvbr = cell_vbuf_render(vbr);
    cvbr->prim = prim;
    /*printf("cell_set_prim %u\n", prim);*/
+   return TRUE;
 }
 
 
@@ -244,7 +250,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
 #if 0
    /* helpful for debug */
-   cell_flush_int(&cell->pipe, CELL_FLUSH_WAIT);
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
 #endif
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 2ece0250f6f..566df7f59e3 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -297,10 +297,9 @@ void cell_update_vertex_fetch(struct draw_context *draw)
 
 
    /* Each fetch function can be a maximum of 34 instructions (note: this is
-    * actually a slight over-estimate).  That means (34 * 4) = 136 bytes
-    * each maximum.
+    * actually a slight over-estimate).
     */
-   spe_init_func(p, 136 * unique_attr_formats);
+   spe_init_func(p, 34 * SPE_INST_SIZE * unique_attr_formats);
 
 
    /* Allocate registers for the function's input parameters.
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
index 3658947715f..2b10c116fa3 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_shader.c
@@ -135,7 +135,7 @@ cell_vertex_shader_queue_flush(struct draw_context *draw)
       vs->num_elts = n;
       send_mbox_message(cell_global.spe_contexts[0], CELL_CMD_VS_EXECUTE);
 
-      cell_flush_int(& cell->pipe, CELL_FLUSH_WAIT);
+      cell_flush_int(cell, CELL_FLUSH_WAIT);
    }
 
    draw->vs.post_nr = draw->vs.queue_nr;
diff --git a/src/gallium/drivers/cell/spu/Makefile b/src/gallium/drivers/cell/spu/Makefile
index 8e83610790e..1ae0dfb8c10 100644
--- a/src/gallium/drivers/cell/spu/Makefile
+++ b/src/gallium/drivers/cell/spu/Makefile
@@ -5,7 +5,7 @@
 
 
 TOP = ../../../../..
-include $(TOP)/configs/linux-cell
+include $(TOP)/configs/current
 
 
 PROG = g3d
@@ -22,12 +22,15 @@ SOURCES = \
 	spu_render.c \
 	spu_texture.c \
 	spu_tile.c \
-	spu_tri.c \
+	spu_tri.c
+
+OLD_SOURCES = \
 	spu_exec.c \
 	spu_util.c \
 	spu_vertex_fetch.c \
 	spu_vertex_shader.c
 
+
 SPU_OBJECTS = $(SOURCES:.c=.o) \
 
 SPU_ASM_OUT = $(SOURCES:.c=.s) \
@@ -43,7 +46,7 @@ INCLUDE_DIRS = \
 	$(SPU_CC) $(SPU_CFLAGS) -c $<
 
 .c.s:
-	$(SPU_CC) $(SPU_CFLAGS) -S $<
+	$(SPU_CC) $(SPU_CFLAGS) -O3 -S $<
 
 
 # The .a file will be linked into the main/PPU executable
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index e9fee8a3a61..fd8dc6ded3e 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -79,14 +79,14 @@ spu_pack_color_shuffle(vector float rgba, vector unsigned char shuffle)
 
 
 static INLINE vector float
-spu_unpack_color(uint color)
+spu_unpack_B8G8R8A8(uint color)
 {
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             0, 0, 0, 0,
-                             5, 5, 5, 5,
                              10, 10, 10, 10,
+                             5, 5, 5, 5,
+                             0, 0, 0, 0,
                              15, 15, 15, 15}) );
    return spu_convtf(color_u4, 32);
 }
diff --git a/src/gallium/drivers/cell/spu/spu_exec.c b/src/gallium/drivers/cell/spu/spu_exec.c
index 89c61136a4c..e27df2dfb38 100644
--- a/src/gallium/drivers/cell/spu/spu_exec.c
+++ b/src/gallium/drivers/cell/spu/spu_exec.c
@@ -382,10 +382,10 @@ fetch_src_file_channel(
          break;
 
       case TGSI_FILE_IMMEDIATE:
-         assert( index->i[0] < (int) mach->ImmLimit );
-         assert( index->i[1] < (int) mach->ImmLimit );
-         assert( index->i[2] < (int) mach->ImmLimit );
-         assert( index->i[3] < (int) mach->ImmLimit );
+         ASSERT( index->i[0] < (int) mach->ImmLimit );
+         ASSERT( index->i[1] < (int) mach->ImmLimit );
+         ASSERT( index->i[2] < (int) mach->ImmLimit );
+         ASSERT( index->i[3] < (int) mach->ImmLimit );
 
          chan->f[0] = mach->Imms[index->i[0]][swizzle];
          chan->f[1] = mach->Imms[index->i[1]][swizzle];
@@ -409,7 +409,7 @@ fetch_src_file_channel(
          break;
 
       default:
-         assert( 0 );
+         ASSERT( 0 );
       }
       break;
 
@@ -422,7 +422,7 @@ fetch_src_file_channel(
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -471,7 +471,7 @@ fetch_source(
          index.q = si_shli(index.q, 12);
          break;
       default:
-         assert( 0 );
+         ASSERT( 0 );
       }
 
       index.i[0] += reg->SrcRegisterDim.Index;
@@ -558,7 +558,7 @@ store_dest(
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
       return;
    }
 
@@ -582,11 +582,11 @@ store_dest(
       break;
 
    case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -769,7 +769,7 @@ exec_tex(struct spu_exec_machine *mach,
       break;
 
    default:
-      assert (0);
+      ASSERT (0);
    }
 
    FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
@@ -861,7 +861,7 @@ exec_declaration(struct spu_exec_machine *mach,
             break;
 
          default:
-            assert( 0 );
+            ASSERT( 0 );
          }
 
          if( mask == TGSI_WRITEMASK_XYZW ) {
@@ -971,11 +971,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_EXP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_LOG:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_MUL:
@@ -1151,24 +1151,24 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CND:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CND0:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DOT2ADD:
       /* TGSI_OPCODE_DP2A */
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_INDEX:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_NEGATE:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_FRAC:
@@ -1181,7 +1181,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_CLAMP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_FLOOR:
@@ -1276,7 +1276,7 @@ exec_instruction(
       break;
 
     case TGSI_OPCODE_MULTIPLYMATRIX:
-       assert (0);
+       ASSERT (0);
        break;
 
     case TGSI_OPCODE_ABS:
@@ -1290,7 +1290,7 @@ exec_instruction(
        break;
 
    case TGSI_OPCODE_RCC:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DPH:
@@ -1353,23 +1353,23 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_PK2H:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK2US:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK4B:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_PK4UB:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_RFL:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_SEQ:
@@ -1384,7 +1384,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SFL:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_SGT:
@@ -1429,7 +1429,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_STR:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TEX:
@@ -1452,7 +1452,7 @@ exec_instruction(
       /* src[1] = d[strq]/dx */
       /* src[2] = d[strq]/dy */
       /* src[3] = sampler unit */
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXL:
@@ -1470,35 +1470,35 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_UP2H:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP2US:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP4B:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_UP4UB:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_X2D:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ARA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ARR:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_BRA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CAL:
@@ -1507,14 +1507,14 @@ exec_instruction(
          /* do the call */
 
          /* push the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
          mach->CondStack[mach->CondStackTop++] = mach->CondMask;
-         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
          mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
          mach->ContStack[mach->ContStackTop++] = mach->ContMask;
 
-         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
          mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
 
          /* note that PC was already incremented above */
@@ -1538,13 +1538,13 @@ exec_instruction(
          *pc = mach->CallStack[--mach->CallStackTop];
 
          /* pop the Cond, Loop, Cont stacks */
-         assert(mach->CondStackTop > 0);
+         ASSERT(mach->CondStackTop > 0);
          mach->CondMask = mach->CondStack[--mach->CondStackTop];
-         assert(mach->LoopStackTop > 0);
+         ASSERT(mach->LoopStackTop > 0);
          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
-         assert(mach->ContStackTop > 0);
+         ASSERT(mach->ContStackTop > 0);
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
-         assert(mach->FuncStackTop > 0);
+         ASSERT(mach->FuncStackTop > 0);
          mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
 
          UPDATE_EXEC_MASK(mach);
@@ -1552,7 +1552,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SSG:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CMP:
@@ -1592,11 +1592,11 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NRM:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_DIV:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_DP2:
@@ -1615,7 +1615,7 @@ exec_instruction(
 
    case TGSI_OPCODE_IF:
       /* push CondMask */
-      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
       mach->CondStack[mach->CondStackTop++] = mach->CondMask;
       FETCH( &r[0], 0, CHAN_X );
       /* update CondMask */
@@ -1639,7 +1639,7 @@ exec_instruction(
       /* invert CondMask wrt previous mask */
       {
          uint prevMask;
-         assert(mach->CondStackTop > 0);
+         ASSERT(mach->CondStackTop > 0);
          prevMask = mach->CondStack[mach->CondStackTop - 1];
          mach->CondMask = ~mach->CondMask & prevMask;
          UPDATE_EXEC_MASK(mach);
@@ -1649,7 +1649,7 @@ exec_instruction(
 
    case TGSI_OPCODE_ENDIF:
       /* pop CondMask */
-      assert(mach->CondStackTop > 0);
+      ASSERT(mach->CondStackTop > 0);
       mach->CondMask = mach->CondStack[--mach->CondStackTop];
       UPDATE_EXEC_MASK(mach);
       break;
@@ -1660,19 +1660,19 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_REP:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_ENDREP:
-       assert (0);
+       ASSERT (0);
        break;
 
    case TGSI_OPCODE_PUSHA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_POPA:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_CEIL:
@@ -1746,7 +1746,7 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_MOD:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_XOR:
@@ -1759,15 +1759,15 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_SAD:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXF:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_TXQ:
-      assert (0);
+      ASSERT (0);
       break;
 
    case TGSI_OPCODE_EMIT:
@@ -1784,9 +1784,9 @@ exec_instruction(
       /* fall-through (for now) */
    case TGSI_OPCODE_BGNLOOP2:
       /* push LoopMask and ContMasks */
-      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
       mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
-      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
       mach->ContStack[mach->ContStackTop++] = mach->ContMask;
       break;
 
@@ -1794,7 +1794,7 @@ exec_instruction(
       /* fall-through (for now at least) */
    case TGSI_OPCODE_ENDLOOP2:
       /* Restore ContMask, but don't pop */
-      assert(mach->ContStackTop > 0);
+      ASSERT(mach->ContStackTop > 0);
       mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
       if (mach->LoopMask) {
          /* repeat loop: jump to instruction just past BGNLOOP */
@@ -1802,10 +1802,10 @@ exec_instruction(
       }
       else {
          /* exit loop: pop LoopMask */
-         assert(mach->LoopStackTop > 0);
+         ASSERT(mach->LoopStackTop > 0);
          mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
          /* pop ContMask */
-         assert(mach->ContStackTop > 0);
+         ASSERT(mach->ContStackTop > 0);
          mach->ContMask = mach->ContStack[--mach->ContStackTop];
       }
       UPDATE_EXEC_MASK(mach);
@@ -1834,26 +1834,26 @@ exec_instruction(
       break;
 
    case TGSI_OPCODE_NOISE1:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE2:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE3:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOISE4:
-      assert( 0 );
+      ASSERT( 0 );
       break;
 
    case TGSI_OPCODE_NOP:
       break;
 
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
@@ -1874,11 +1874,11 @@ spu_exec_machine_run( struct spu_exec_machine *mach )
    mach->FuncMask = 0xf;
    mach->ExecMask = 0xf;
 
-   mach->CondStackTop = 0; /* temporarily subvert this assertion */
-   assert(mach->CondStackTop == 0);
-   assert(mach->LoopStackTop == 0);
-   assert(mach->ContStackTop == 0);
-   assert(mach->CallStackTop == 0);
+   mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */
+   ASSERT(mach->CondStackTop == 0);
+   ASSERT(mach->LoopStackTop == 0);
+   ASSERT(mach->ContStackTop == 0);
+   ASSERT(mach->CallStackTop == 0);
 
    mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
    mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
diff --git a/src/gallium/drivers/cell/spu/spu_main.c b/src/gallium/drivers/cell/spu/spu_main.c
index e04ffeb9b16..2a7cb75f592 100644
--- a/src/gallium/drivers/cell/spu/spu_main.c
+++ b/src/gallium/drivers/cell/spu/spu_main.c
@@ -34,6 +34,7 @@
 
 #include "spu_main.h"
 #include "spu_render.h"
+#include "spu_per_fragment_op.h"
 #include "spu_texture.h"
 #include "spu_tile.h"
 //#include "spu_test.h"
@@ -46,7 +47,7 @@
 /*
 helpful headers:
 /usr/lib/gcc/spu/4.1.1/include/spu_mfcio.h
-/opt/ibm/cell-sdk/prototype/sysroot/usr/include/libmisc.h
+/opt/cell/sdk/usr/include/libmisc.h
 */
 
 boolean Debug = FALSE;
@@ -55,17 +56,13 @@ struct spu_global spu;
 
 struct spu_vs_context draw;
 
-static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
-    ALIGN16_ATTRIB;
 
-static unsigned char depth_stencil_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
-
-static unsigned char fb_blend_code_buffer[4 * 64]
+/**
+ * Buffers containing dynamically generated SPU code:
+ */
+static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
     ALIGN16_ATTRIB;
 
-static unsigned char logicop_code_buffer[4 * 64]
-    ALIGN16_ATTRIB;
 
 
 /**
@@ -136,54 +133,75 @@ really_clear_tiles(uint surfaceIndex)
 static void
 cmd_clear_surface(const struct cell_command_clear_surface *clear)
 {
-   const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
-   uint i;
-
    if (Debug)
       printf("SPU %u: CLEAR SURF %u to 0x%08x\n", spu.init.id,
              clear->surface, clear->value);
 
-#define CLEAR_OPT 1
-#if CLEAR_OPT
-   /* set all tile's status to CLEAR */
    if (clear->surface == 0) {
-      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
       spu.fb.color_clear_value = clear->value;
+      if (spu.init.debug_flags & CELL_DEBUG_CHECKER) {
+         uint x = (spu.init.id << 4) | (spu.init.id << 12) |
+            (spu.init.id << 20) | (spu.init.id << 28);
+         spu.fb.color_clear_value ^= x;
+      }
    }
    else {
-      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
       spu.fb.depth_clear_value = clear->value;
    }
-   return;
-#endif
 
+#define CLEAR_OPT 1
+#if CLEAR_OPT
+
+   /* Simply set all tiles' status to CLEAR.
+    * When we actually begin rendering into a tile, we'll initialize it to
+    * the clear value.  If any tiles go untouched during the frame,
+    * really_clear_tiles() will set them to the clear value.
+    */
    if (clear->surface == 0) {
-      spu.fb.color_clear_value = clear->value;
-      clear_c_tile(&spu.ctile);
+      memset(spu.ctile_status, TILE_STATUS_CLEAR, sizeof(spu.ctile_status));
    }
    else {
-      spu.fb.depth_clear_value = clear->value;
-      clear_z_tile(&spu.ztile);
+      memset(spu.ztile_status, TILE_STATUS_CLEAR, sizeof(spu.ztile_status));
    }
 
+#else
+
+   /*
+    * This path clears the whole framebuffer to the clear color right now.
+    */
+
    /*
    printf("SPU: %s num=%d w=%d h=%d\n",
           __FUNCTION__, num_tiles, spu.fb.width_tiles, spu.fb.height_tiles);
    */
 
-   for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
-      uint tx = i % spu.fb.width_tiles;
-      uint ty = i / spu.fb.width_tiles;
-      if (clear->surface == 0)
-         put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
-      else
-         put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
-      /* XXX we don't want this here, but it fixes bad tile results */
+   /* init a single tile to the clear value */
+   if (clear->surface == 0) {
+      clear_c_tile(&spu.ctile);
+   }
+   else {
+      clear_z_tile(&spu.ztile);
    }
 
-#if 0
-   wait_on_mask(1 << TAG_SURFACE_CLEAR);
-#endif
+   /* walk over my tiles, writing the 'clear' tile's data */
+   {
+      const uint num_tiles = spu.fb.width_tiles * spu.fb.height_tiles;
+      uint i;
+      for (i = spu.init.id; i < num_tiles; i += spu.init.num_spus) {
+         uint tx = i % spu.fb.width_tiles;
+         uint ty = i / spu.fb.width_tiles;
+         if (clear->surface == 0)
+            put_tile(tx, ty, &spu.ctile, TAG_SURFACE_CLEAR, 0);
+         else
+            put_tile(tx, ty, &spu.ztile, TAG_SURFACE_CLEAR, 1);
+      }
+   }
+
+   if (spu.init.debug_flags & CELL_DEBUG_SYNC) {
+      wait_on_mask(1 << TAG_SURFACE_CLEAR);
+   }
+
+#endif /* CLEAR_OPT */
 
    if (Debug)
       printf("SPU %u: CLEAR SURF done\n", spu.init.id);
@@ -201,6 +219,31 @@ cmd_release_verts(const struct cell_command_release_verts *release)
 }
 
 
+/**
+ * Process a CELL_CMD_STATE_FRAGMENT_OPS command.
+ * This involves installing new fragment ops SPU code.
+ * If this function is never called, we'll use a regular C fallback function
+ * for fragment processing.
+ */
+static void
+cmd_state_fragment_ops(const struct cell_command_fragment_ops *fops)
+{
+   if (Debug)
+      printf("SPU %u: CMD_STATE_FRAGMENT_OPS\n", spu.init.id);
+   /* Copy SPU code from batch buffer to spu buffer */
+   memcpy(spu.fragment_ops_code, fops->code, SPU_MAX_FRAGMENT_OPS_INSTS * 4);
+   /* Copy state info */
+   memcpy(&spu.depth_stencil_alpha, &fops->dsa, sizeof(fops->dsa));
+   memcpy(&spu.blend, &fops->blend, sizeof(fops->blend));
+
+   /* Point function pointer at new code */
+   spu.fragment_ops = (spu_fragment_ops_func) spu.fragment_ops_code;
+
+   spu.read_depth = spu.depth_stencil_alpha.depth.enabled;
+   spu.read_stencil = spu.depth_stencil_alpha.stencil[0].enabled;
+}
+
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -227,87 +270,24 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 
    switch (spu.fb.depth_format) {
    case PIPE_FORMAT_Z32_UNORM:
+      spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0xffffffffu;
+      break;
    case PIPE_FORMAT_Z24S8_UNORM:
    case PIPE_FORMAT_S8Z24_UNORM:
+   case PIPE_FORMAT_Z24X8_UNORM:
+   case PIPE_FORMAT_X8Z24_UNORM:
       spu.fb.zsize = 4;
+      spu.fb.zscale = (float) 0x00ffffffu;
       break;
    case PIPE_FORMAT_Z16_UNORM:
       spu.fb.zsize = 2;
+      spu.fb.zscale = (float) 0xffffu;
       break;
    default:
       spu.fb.zsize = 0;
       break;
    }
-
-   if (spu.fb.color_format == PIPE_FORMAT_A8R8G8B8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              12, 0, 4, 8, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else if (spu.fb.color_format == PIPE_FORMAT_B8G8R8A8_UNORM)
-      spu.color_shuffle = ((vector unsigned char) {
-                              8, 4, 0, 12, 0, 0, 0, 0, 
-                              0, 0, 0, 0, 0, 0, 0, 0});
-   else
-      ASSERT(0);
-}
-
-
-static void
-cmd_state_blend(const struct cell_command_blend *state)
-{
-   if (Debug)
-      printf("SPU %u: BLEND: enabled %d\n",
-             spu.init.id,
-             (state->size != 0));
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(fb_blend_code_buffer,
-              (unsigned int) state->base,  /* src */
-              ROUNDUP16(state->size),
-              TAG_BATCH_BUFFER,
-              0, /* tid */
-              0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-      spu.blend = (blend_func) fb_blend_code_buffer;
-      spu.read_fb = state->read_fb;
-   } else {
-      spu.read_fb = FALSE;
-   }
-}
-
-
-static void
-cmd_state_depth_stencil(const struct cell_command_depth_stencil_alpha_test *state)
-{
-   if (Debug)
-      printf("SPU %u: DEPTH_STENCIL: ztest %d\n",
-             spu.init.id,
-             state->read_depth);
-
-   ASSERT_ALIGN16(state->base);
-
-   if (state->size != 0) {
-      mfc_get(depth_stencil_code_buffer,
-	      (unsigned int) state->base,  /* src */
-	      ROUNDUP16(state->size),
-	      TAG_BATCH_BUFFER,
-	      0, /* tid */
-	      0  /* rid */);
-      wait_on_mask(1 << TAG_BATCH_BUFFER);
-   } else {
-      /* If there is no code, emit a return instruction.
-       */
-      depth_stencil_code_buffer[0] = 0x35;
-      depth_stencil_code_buffer[1] = 0x00;
-      depth_stencil_code_buffer[2] = 0x00;
-      depth_stencil_code_buffer[3] = 0x00;
-   }
-
-   spu.frag_test = (frag_test_func) depth_stencil_code_buffer;
-   spu.read_depth = state->read_depth;
-   spu.read_stencil = state->read_stencil;
 }
 
 
@@ -381,6 +361,21 @@ cmd_state_vs_array_info(const struct cell_array_info *vs_info)
 
 
 static void
+cmd_state_attrib_fetch(const struct cell_attribute_fetch_code *code)
+{
+   mfc_get(attribute_fetch_code_buffer,
+           (unsigned int) code->base,  /* src */
+           code->size,
+           TAG_BATCH_BUFFER,
+           0, /* tid */
+           0  /* rid */);
+   wait_on_mask(1 << TAG_BATCH_BUFFER);
+
+   draw.vertex_fetch.code = attribute_fetch_code_buffer;
+}
+
+
+static void
 cmd_finish(void)
 {
    if (Debug)
@@ -395,7 +390,9 @@ cmd_finish(void)
 
 
 /**
- * Execute a batch of commands
+ * Execute a batch of commands which was sent to us by the PPU.
+ * See the cell_emit_state.c code to see where the commands come from.
+ *
  * The opcode param encodes the location of the buffer and its size.
  */
 static void
@@ -432,16 +429,14 @@ cmd_batch(uint opcode)
       printf("SPU %u: release batch buf %u\n", spu.init.id, buf);
    release_buffer(buf);
 
+   /*
+    * Loop over commands in the batch buffer
+    */
    for (pos = 0; pos < usize; /* no incr */) {
       switch (buffer[pos]) {
-      case CELL_CMD_STATE_FRAMEBUFFER:
-         {
-            struct cell_command_framebuffer *fb
-               = (struct cell_command_framebuffer *) &buffer[pos];
-            cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
-         }
-         break;
+      /*
+       * rendering commands
+       */
       case CELL_CMD_CLEAR_SURFACE:
          {
             struct cell_command_clear_surface *clr
@@ -459,26 +454,24 @@ cmd_batch(uint opcode)
             pos += pos_incr;
          }
          break;
-      case CELL_CMD_RELEASE_VERTS:
+      /*
+       * state-update commands
+       */
+      case CELL_CMD_STATE_FRAMEBUFFER:
          {
-            struct cell_command_release_verts *release
-               = (struct cell_command_release_verts *) &buffer[pos];
-            cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
+            struct cell_command_framebuffer *fb
+               = (struct cell_command_framebuffer *) &buffer[pos];
+            cmd_state_framebuffer(fb);
+            pos += sizeof(*fb) / 8;
          }
          break;
-      case CELL_CMD_FINISH:
-         cmd_finish();
-         pos += 1;
-         break;
-      case CELL_CMD_STATE_BLEND:
-         cmd_state_blend((struct cell_command_blend *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_blend)) / 8);
-         break;
-      case CELL_CMD_STATE_DEPTH_STENCIL:
-         cmd_state_depth_stencil((struct cell_command_depth_stencil_alpha_test *)
-                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_depth_stencil_alpha_test)) / 8);
+      case CELL_CMD_STATE_FRAGMENT_OPS:
+         {
+            struct cell_command_fragment_ops *fops
+               = (struct cell_command_fragment_ops *) &buffer[pos];
+            cmd_state_fragment_ops(fops);
+            pos += sizeof(*fops) / 8;
+         }
          break;
       case CELL_CMD_STATE_SAMPLER:
          {
@@ -514,42 +507,32 @@ cmd_batch(uint opcode)
          pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
          break;
       case CELL_CMD_STATE_BIND_VS:
+#if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
+#endif
          pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
          break;
-      case CELL_CMD_STATE_ATTRIB_FETCH: {
-         struct cell_attribute_fetch_code *code =
-             (struct cell_attribute_fetch_code *) &buffer[pos+1];
-
-              mfc_get(attribute_fetch_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-         draw.vertex_fetch.code = attribute_fetch_code_buffer;
+      case CELL_CMD_STATE_ATTRIB_FETCH:
+         cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
+                                &buffer[pos+1]);
          pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
          break;
-      }
-      case CELL_CMD_STATE_LOGICOP: {
-         struct cell_command_logicop *code =
-             (struct cell_command_logicop *) &buffer[pos+1];
-
-              mfc_get(logicop_code_buffer,
-                      (unsigned int) code->base,  /* src */
-                      code->size,
-                      TAG_BATCH_BUFFER,
-                      0, /* tid */
-                      0  /* rid */);
-         wait_on_mask(1 << TAG_BATCH_BUFFER);
-
-	 spu.logicop = (logicop_func) logicop_code_buffer;
-         pos += (1 + ROUNDUP8(sizeof(struct cell_command_logicop)) / 8);
+      /*
+       * misc commands
+       */
+      case CELL_CMD_FINISH:
+         cmd_finish();
+         pos += 1;
+         break;
+      case CELL_CMD_RELEASE_VERTS:
+         {
+            struct cell_command_release_verts *release
+               = (struct cell_command_release_verts *) &buffer[pos];
+            cmd_release_verts(release);
+            pos += sizeof(*release) / 8;
+         }
          break;
-      }
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
 	 struct cell_buffer_range *br = (struct cell_buffer_range *)
 	     &buffer[pos+1];
@@ -618,7 +601,9 @@ main_loop(void)
          exitFlag = 1;
          break;
       case CELL_CMD_VS_EXECUTE:
+#if 0
          spu_execute_vertex_shader(&draw, &cmd.vs);
+#endif
          break;
       case CELL_CMD_BATCH:
          cmd_batch(opcode);
@@ -643,6 +628,11 @@ one_time_init(void)
    memset(spu.ctile_status, TILE_STATUS_DEFINED, sizeof(spu.ctile_status));
    memset(spu.ztile_status, TILE_STATUS_DEFINED, sizeof(spu.ztile_status));
    invalidate_tex_cache();
+
+   /* Install default/fallback fragment processing function.
+    * This will normally be overriden by a code-gen'd function.
+    */
+   spu.fragment_ops = spu_fallback_fragment_ops;
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index e962e1426c6..d40539da83b 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,10 @@
 #define MAX_HEIGHT 1024
 
 
+/**
+ * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
+ * The data may be addressed through several different types.
+ */
 typedef union {
    ushort us[TILE_SIZE][TILE_SIZE];
    uint   ui[TILE_SIZE][TILE_SIZE];
@@ -56,38 +60,23 @@ typedef union {
 #define TILE_STATUS_GETTING 5  /**< mfc_get() called but not yet arrived */
 
 
-struct spu_frag_test_results {
-   qword mask;
-   qword depth;
-   qword stencil;
-};
-
-typedef struct spu_frag_test_results (*frag_test_func)(qword frag_mask,
-    qword pixel_depth, qword pixel_stencil, qword frag_depth,
-    qword frag_alpha, qword facing);
-
-
-struct spu_blend_results {
-   qword r;
-   qword g;
-   qword b;
-   qword a;
-};
+/** Function for sampling textures */
+typedef vector float (*spu_sample_texture_func)(uint unit,
+                                                vector float texcoord);
 
-typedef struct spu_blend_results (*blend_func)(
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword const_r, qword const_g, qword const_b, qword const_a);
+/** Function for performing per-fragment ops */
+typedef void (*spu_fragment_ops_func)(uint x, uint y,
+                                      tile_t *colorTile,
+                                      tile_t *depthStencilTile,
+                                      vector float fragZ,
+                                      vector float fragRed,
+                                      vector float fragGreen,
+                                      vector float fragBlue,
+                                      vector float fragAlpha,
+                                      vector unsigned int mask);
 
-typedef struct spu_blend_results (*logicop_func)(
-    qword pixel_r, qword pixel_g, qword pixel_b, qword pixel_a,
-    qword frag_r, qword frag_g, qword frag_b, qword frag_a,
-    qword frag_mask);
-
-
-typedef vector float (*sample_texture_func)(uint unit, vector float texcoord);
-
-struct spu_framebuffer {
+struct spu_framebuffer
+{
    void *color_start;              /**< addr of color surface in main memory */
    void *depth_start;              /**< addr of depth surface in main memory */
    enum pipe_format color_format;
@@ -99,6 +88,7 @@ struct spu_framebuffer {
    uint depth_clear_value;
 
    uint zsize;                     /**< 0, 2 or 4 bytes per Z */
+   float zscale;                   /**< 65535.0, 2^24-1 or 2^32-1 */
 } ALIGN16_ATTRIB;
 
 
@@ -115,35 +105,31 @@ struct spu_texture
 
 
 /**
- * All SPU global/context state will be in singleton object of this type:
+ * All SPU global/context state will be in a singleton object of this type:
  */
 struct spu_global
 {
+   /** One-time init/constant info */
    struct cell_init_info init;
 
+   /*
+    * Current state
+    */
    struct spu_framebuffer fb;
-   boolean read_depth;
-   boolean read_stencil;
-   frag_test_func frag_test;
-   
-   boolean read_fb;
-   blend_func blend;
-   qword const_blend_color[4] ALIGN16_ATTRIB;
-
-   logicop_func logicop;
-
+   struct pipe_depth_stencil_alpha_state depth_stencil_alpha;
+   struct pipe_blend_state blend;
    struct pipe_sampler_state sampler[PIPE_MAX_SAMPLERS];
    struct spu_texture texture[PIPE_MAX_SAMPLERS];
-
    struct vertex_info vertex_info;
 
-   /* XXX more state to come */
-
-
-   /** current color and Z tiles */
+   /** Current color and Z tiles */
    tile_t ctile ALIGN16_ATTRIB;
    tile_t ztile ALIGN16_ATTRIB;
 
+   /** Read depth/stencil tiles? */
+   boolean read_depth;
+   boolean read_stencil;
+
    /** Current tiles' status */
    ubyte cur_ctile_status, cur_ztile_status;
 
@@ -151,11 +137,13 @@ struct spu_global
    ubyte ctile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
    ubyte ztile_status[MAX_HEIGHT/TILE_SIZE][MAX_WIDTH/TILE_SIZE] ALIGN16_ATTRIB;
 
+   /** Current fragment ops machine code */
+   uint fragment_ops_code[SPU_MAX_FRAGMENT_OPS_INSTS];
+   /** Current fragment ops function */
+   spu_fragment_ops_func fragment_ops;
 
-   /** for converting RGBA to PIPE_FORMAT_x colors */
-   vector unsigned char color_shuffle;
-
-   sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   /** Current texture sampler function */
+   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index b4cffeeb32a..03dd547845b 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -1,211 +1,475 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 /**
- * \file spu_per_fragment_op.c
- * SPU implementation various per-fragment operations.
- *
- * \author Ian Romanick <[email protected]>
+ * \author Brian Paul
  */
 
+
+#include <transpose_matrix4x4.h>
 #include "pipe/p_format.h"
 #include "spu_main.h"
+#include "spu_colorpack.h"
 #include "spu_per_fragment_op.h"
 
-#define ZERO 0x80
-
-static void
-read_ds_quad(tile_t *buffer, unsigned x, unsigned y,
-             enum pipe_format depth_format, qword *depth,
-             qword *stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
-
-      const qword shuf_vec = (qword) {
-         ZERO, ZERO, 0, 1, ZERO, ZERO, 2, 3,
-         ZERO, ZERO, 4, 5, ZERO, ZERO, 6, 7
-      };
 
+#define LINEAR_QUAD_LAYOUT 1
 
-      /* At even X values we want the first 4 shorts, and at odd X values we
-       * want the second 4 shorts.
-       */
-      qword bias = (qword) spu_splats((unsigned char) ((ix & 0x01) << 3));
-      qword bias_mask = si_fsmbi(0x3333);
-      qword sv = si_a(shuf_vec, si_and(bias_mask, bias));
-
-      *depth = si_shufb(*ptr, *ptr, sv);
-      *stencil = si_il(0);
-      break;
-   }
-
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-
-      *depth = *ptr;
-      *stencil = si_il(0);
-      break;
-   }
-      
 
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      qword mask = si_fsmbi(0xEEEE);
-
-      *depth = si_rotmai(si_and(*ptr, mask), -8);
-      *stencil = si_andc(*ptr, mask);
-      break;
+/**
+ * Called by rasterizer for each quad after the shader has run.  Do
+ * all the per-fragment operations including alpha test, z test,
+ * stencil test, blend, colormask and logicops.  This is a
+ * fallback/debug function.  In reality we'll use a generated function
+ * produced by the PPU.  But this function is useful for
+ * debug/validation.
+ */
+void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragR,
+                          vector float fragG,
+                          vector float fragB,
+                          vector float fragA,
+                          vector unsigned int mask)
+{
+   vector float frag_aos[4];
+   unsigned int c0, c1, c2, c3;
+
+   /* do alpha test */
+   if (spu.depth_stencil_alpha.alpha.enabled) {
+      vector float ref = spu_splats(spu.depth_stencil_alpha.alpha.ref);
+      vector unsigned int amask;
+
+      switch (spu.depth_stencil_alpha.alpha.func) {
+      case PIPE_FUNC_LESS:
+         amask = spu_cmpgt(ref, fragA);  /* mask = (fragA < ref) */
+         break;
+      case PIPE_FUNC_GREATER:
+         amask = spu_cmpgt(fragA, ref);  /* mask = (fragA > ref) */
+         break;
+      case PIPE_FUNC_GEQUAL:
+         amask = spu_cmpgt(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_LEQUAL:
+         amask = spu_cmpgt(fragA, ref);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_EQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         break;
+      case PIPE_FUNC_NOTEQUAL:
+         amask = spu_cmpeq(ref, fragA);
+         amask = spu_nor(amask, amask);
+         break;
+      case PIPE_FUNC_ALWAYS:
+         amask = spu_splats(0xffffffffU);
+         break;
+      case PIPE_FUNC_NEVER:
+         amask = spu_splats( 0x0U);
+         break;
+      default:
+         ;
+      }
+
+      mask = spu_and(mask, amask);
    }
 
-
-   case PIPE_FORMAT_S8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-
-      *depth = si_and(*ptr, si_fsmbi(0x7777));
-      *stencil = si_andi(si_roti(*ptr, 8), 0x0ff);
-      break;
+   /* Z and/or stencil testing... */
+   if (spu.depth_stencil_alpha.depth.enabled ||
+       spu.depth_stencil_alpha.stencil[0].enabled) {
+
+      /* get four Z/Stencil values from tile */
+      vector unsigned int mask24 = spu_splats((unsigned int)0x00ffffffU);
+      vector unsigned int ifbZS = depthStencilTile->ui4[y/2][x/2];
+      vector unsigned int ifbZ = spu_and(ifbZS, mask24);
+      vector unsigned int ifbS = spu_andc(ifbZS, mask24);
+
+      if (spu.depth_stencil_alpha.stencil[0].enabled) {
+         /* do stencil test */
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM);
+
+      }
+      else if (spu.depth_stencil_alpha.depth.enabled) {
+         /* do depth test */
+
+         ASSERT(spu.fb.depth_format == PIPE_FORMAT_S8Z24_UNORM ||
+                spu.fb.depth_format == PIPE_FORMAT_X8Z24_UNORM);
+
+         vector unsigned int ifragZ;
+         vector unsigned int zmask;
+
+         /* convert four fragZ from float to uint */
+         fragZ = spu_mul(fragZ, spu_splats((float) 0xffffff));
+         ifragZ = spu_convtu(fragZ, 0);
+
+         /* do depth comparison, setting zmask with results */
+         switch (spu.depth_stencil_alpha.depth.func) {
+         case PIPE_FUNC_LESS:
+            zmask = spu_cmpgt(ifbZ, ifragZ);  /* mask = (ifragZ < ifbZ) */
+            break;
+         case PIPE_FUNC_GREATER:
+            zmask = spu_cmpgt(ifragZ, ifbZ);  /* mask = (ifbZ > ifragZ) */
+            break;
+         case PIPE_FUNC_GEQUAL:
+            zmask = spu_cmpgt(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_LEQUAL:
+            zmask = spu_cmpgt(ifragZ, ifbZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_EQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            break;
+         case PIPE_FUNC_NOTEQUAL:
+            zmask = spu_cmpeq(ifbZ, ifragZ);
+            zmask = spu_nor(zmask, zmask);
+            break;
+         case PIPE_FUNC_ALWAYS:
+            zmask = spu_splats(0xffffffffU);
+            break;
+         case PIPE_FUNC_NEVER:
+            zmask = spu_splats( 0x0U);
+            break;
+         default:
+            ;
+         }
+
+         mask = spu_and(mask, zmask);
+
+         /* merge framebuffer Z and fragment Z according to the mask */
+         ifbZ = spu_or(spu_and(ifragZ, mask),
+                       spu_andc(ifbZ, mask));
+      }
+
+      if (spu_extract(spu_orx(mask), 0)) {
+         /* put new fragment Z/Stencil values back into Z/Stencil tile */
+         depthStencilTile->ui4[y/2][x/2] = spu_or(ifbZ, ifbS);
+
+         spu.cur_ztile_status = TILE_STATUS_DIRTY;
+      }
    }
 
-
-   default:
-      assert(0);
-      break;
+   if (spu.blend.blend_enable) {
+      /* blending terms, misc regs */
+      vector float term1r, term1g, term1b, term1a;
+      vector float term2r, term2g, term2b, term2a;
+      vector float one, tmp;
+
+      vector float fbRGBA[4];  /* current framebuffer colors */
+
+      /* get colors from framebuffer/tile */
+      {
+         vector float fc[4];
+         uint c0, c1, c2, c3;
+
+#if LINEAR_QUAD_LAYOUT /* See comments/diagram below */
+         c0 = colorTile->ui[y][x*2+0];
+         c1 = colorTile->ui[y][x*2+1];
+         c2 = colorTile->ui[y][x*2+2];
+         c3 = colorTile->ui[y][x*2+3];
+#else
+         c0 = colorTile->ui[y+0][x+0];
+         c1 = colorTile->ui[y+0][x+1];
+         c2 = colorTile->ui[y+1][x+0];
+         c3 = colorTile->ui[y+1][x+1];
+#endif
+         switch (spu.fb.color_format) {
+         case PIPE_FORMAT_B8G8R8A8_UNORM:
+            fc[0] = spu_unpack_B8G8R8A8(c0);
+            fc[1] = spu_unpack_B8G8R8A8(c1);
+            fc[2] = spu_unpack_B8G8R8A8(c2);
+            fc[3] = spu_unpack_B8G8R8A8(c3);
+            break;
+         case PIPE_FORMAT_A8R8G8B8_UNORM:
+            fc[0] = spu_unpack_A8R8G8B8(c0);
+            fc[1] = spu_unpack_A8R8G8B8(c1);
+            fc[2] = spu_unpack_A8R8G8B8(c2);
+            fc[3] = spu_unpack_A8R8G8B8(c3);
+            break;
+         default:
+            ASSERT(0);
+         }
+         _transpose_matrix4x4(fbRGBA, fc);
+      }
+
+      /*
+       * Compute Src RGB terms
+       */
+      switch (spu.blend.rgb_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1r = fragR;
+         term1g = fragG;
+         term1b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term1r =
+         term1g =
+         term1b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1r = spu_mul(fragR, fragR);
+         term1g = spu_mul(fragG, fragG);
+         term1b = spu_mul(fragB, fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1r = spu_mul(fragR, fragA);
+         term1g = spu_mul(fragG, fragA);
+         term1b = spu_mul(fragB, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Src Alpha term
+       */
+      switch (spu.blend.alpha_src_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term1a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term1a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term1a = spu_mul(fragA, fragA);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest RGB terms
+       */
+      switch (spu.blend.rgb_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2r = fragR;
+         term2g = fragG;
+         term2b = fragB;
+         break;
+      case PIPE_BLENDFACTOR_ZERO:
+         term2r =
+         term2g =
+         term2b = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2r = spu_mul(fbRGBA[0], fragR);
+         term2g = spu_mul(fbRGBA[1], fragG);
+         term2b = spu_mul(fbRGBA[2], fragB);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2r = spu_mul(fbRGBA[0], fragA);
+         term2g = spu_mul(fbRGBA[1], fragA);
+         term2b = spu_mul(fbRGBA[2], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2r = spu_mul(fbRGBA[0], tmp);
+         term2g = spu_mul(fbRGBA[1], tmp);
+         term2b = spu_mul(fbRGBA[2], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Compute Dest Alpha term
+       */
+      switch (spu.blend.alpha_dst_factor) {
+      case PIPE_BLENDFACTOR_ONE:
+         term2a = fragA;
+         break;
+      case PIPE_BLENDFACTOR_SRC_COLOR:
+         term2a = spu_splats(0.0f);
+         break;
+      case PIPE_BLENDFACTOR_SRC_ALPHA:
+         term2a = spu_mul(fbRGBA[3], fragA);
+         break;
+      case PIPE_BLENDFACTOR_INV_SRC_ALPHA:
+         one = spu_splats(1.0f);
+         tmp = spu_sub(one, fragA);
+         term2a = spu_mul(fbRGBA[3], tmp);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest RGB terms
+       */
+      switch (spu.blend.rgb_func) {
+      case PIPE_BLEND_ADD:
+         fragR = spu_add(term1r, term2r);
+         fragG = spu_add(term1g, term2g);
+         fragB = spu_add(term1b, term2b);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragR = spu_sub(term1r, term2r);
+         fragG = spu_sub(term1g, term2g);
+         fragB = spu_sub(term1b, term2b);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
+
+      /*
+       * Combine Src/Dest A term
+       */
+      switch (spu.blend.alpha_func) {
+      case PIPE_BLEND_ADD:
+         fragA = spu_add(term1a, term2a);
+         break;
+      case PIPE_BLEND_SUBTRACT:
+         fragA = spu_sub(term1a, term2a);
+         break;
+      /* XXX more cases */
+      default:
+         ASSERT(0);
+      }
    }
-}
-
-
-static void
-write_ds_quad(tile_t *buffer, unsigned x, unsigned y,
-              enum pipe_format depth_format,
-              qword depth, qword stencil)
-{
-   const int ix = x / 2;
-   const int iy = y / 2;
-
-   (void) stencil;
 
-   switch (depth_format) {
-   case PIPE_FORMAT_Z16_UNORM: {
-      qword *ptr = (qword *) &buffer->us8[iy][ix / 2];
 
-      qword sv = ((ix & 0x01) == 0) 
-          ? (qword) { 2, 3, 6, 7, 10, 11, 14, 15,
-                      24, 25, 26, 27, 28, 29, 30, 31 }
-          : (qword) { 16, 17, 18, 19, 20 , 21, 22, 23,
-                      2, 3, 6, 7, 10, 11, 14, 15 };
-      *ptr = si_shufb(depth, *ptr, sv);
-      break;
+   /*
+    * Convert RRRR,GGGG,BBBB,AAAA to RGBA,RGBA,RGBA,RGBA.
+    */
+#if 0
+   /* original code */
+   {
+      vector float frag_soa[4];
+      frag_soa[0] = fragR;
+      frag_soa[1] = fragG;
+      frag_soa[2] = fragB;
+      frag_soa[3] = fragA;
+      _transpose_matrix4x4(frag_aos, frag_soa);
    }
-
-
-   case PIPE_FORMAT_Z32_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      *ptr = depth;
+#else
+   /* short-cut relying on function parameter layout: */
+   _transpose_matrix4x4(frag_aos, &fragR);
+   (void) fragG;
+   (void) fragB;
+#endif
+
+   /*
+    * Pack float colors into 32-bit RGBA words.
+    */
+   switch (spu.fb.color_format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      c0 = spu_pack_A8R8G8B8(frag_aos[0]);
+      c1 = spu_pack_A8R8G8B8(frag_aos[1]);
+      c2 = spu_pack_A8R8G8B8(frag_aos[2]);
+      c3 = spu_pack_A8R8G8B8(frag_aos[3]);
       break;
-   }
-
 
-   case PIPE_FORMAT_Z24S8_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      qword mask = si_fsmbi(0xEEEE);
-
-      depth = si_shli(depth, 8);
-      *ptr = si_selb(stencil, depth, mask);
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      c0 = spu_pack_B8G8R8A8(frag_aos[0]);
+      c1 = spu_pack_B8G8R8A8(frag_aos[1]);
+      c2 = spu_pack_B8G8R8A8(frag_aos[2]);
+      c3 = spu_pack_B8G8R8A8(frag_aos[3]);
       break;
+   default:
+      fprintf(stderr, "SPU: Bad pixel format in spu_default_fragment_ops\n");
+      ASSERT(0);
    }
 
 
-   case PIPE_FORMAT_S8Z24_UNORM: {
-      qword *ptr = (qword *) &buffer->ui4[iy][ix];
-      qword mask = si_fsmbi(0x7777);
-
-      stencil = si_shli(stencil, 24);
-      *ptr = si_selb(stencil, depth, mask);
-      break;
+   /*
+    * Color masking
+    */
+   if (spu.blend.colormask != 0xf) {
+      /* XXX to do */
+      /* apply color mask to 32-bit packed colors */
    }
 
 
-   default:
-      assert(0);
-      break;
+   /*
+    * Logic Ops
+    */
+   if (spu.blend.logicop_enable) {
+      /* XXX to do */
+      /* apply logicop to 32-bit packed colors */
    }
-}
 
 
-qword
-spu_do_depth_stencil(int x, int y,
-                     qword frag_mask, qword frag_depth, qword frag_alpha,
-                     qword facing)
-{
-   struct spu_frag_test_results  result;
-   qword pixel_depth;
-   qword pixel_stencil;
-
-   /* All of this preable code (everthing before the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
+   /*
+    * If mask is non-zero, mark tile as dirty.
     */
-   if (spu.read_depth || spu.read_stencil) {
-      read_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                   &pixel_depth, &pixel_stencil);
+   if (spu_extract(spu_orx(mask), 0)) {
+      spu.cur_ctile_status = TILE_STATUS_DIRTY;
    }
-   
-   switch (spu.fb.depth_format) {
-   case PIPE_FORMAT_Z16_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x0000ffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z32_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0xffffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   case PIPE_FORMAT_Z24S8_UNORM:
-   case PIPE_FORMAT_S8Z24_UNORM:
-      frag_depth = si_fm(frag_depth, (qword)spu_splats((float)(0x00ffffffu)));
-      frag_depth = si_cfltu(frag_depth, 0);
-      break;
-   default:
-      ASSERT(0);
-      break;
+   else {
+      return;
    }
 
-   result = (*spu.frag_test)(frag_mask, pixel_depth, pixel_stencil,
-                             frag_depth, frag_alpha, facing);
-
 
-   /* This code (everthing after the call to frag_test) should
-    * be generated on the PPU and upload to the SPU.
+   /*
+    * Write new quad colors to the framebuffer/tile.
+    * Only write pixels where the corresponding mask word is set.
     */
-   if (spu.read_depth || spu.read_stencil) {
-      write_ds_quad(&spu.ztile, x, y, spu.fb.depth_format,
-                    result.depth, result.stencil);
-   }
-
-   return result.mask;
+#if LINEAR_QUAD_LAYOUT
+   /*
+    * Quad layout:
+    *  +--+--+--+--+
+    *  |p0|p1|p2|p3|
+    *  +--+--+--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y][x*2] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y][x*2+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y][x*2+2] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y][x*2+3] = c3;
+#else
+   /*
+    * Quad layout:
+    *  +--+--+
+    *  |p0|p1|
+    *  +--+--+
+    *  |p2|p3|
+    *  +--+--+
+    */
+   if (spu_extract(mask, 0))
+      colorTile->ui[y+0][x+0] = c0;
+   if (spu_extract(mask, 1))
+      colorTile->ui[y+0][x+1] = c1;
+   if (spu_extract(mask, 2))
+      colorTile->ui[y+1][x+0] = c2;
+   if (spu_extract(mask, 3))
+      colorTile->ui[y+1][x+1] = c3;
+#endif
 }
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index 65712586992..f817abf0463 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -1,32 +1,44 @@
-/*
- * (C) Copyright IBM Corporation 2008
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * on the rights to use, copy, modify, merge, publish, distribute, sub
- * license, and/or sell copies of the Software, and to permit persons to whom
- * the Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
- * AUTHORS, COPYRIGHT HOLDERS, AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
- * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
- * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
- * USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
 
 #ifndef SPU_PER_FRAGMENT_OP
 #define SPU_PER_FRAGMENT_OP
 
-extern qword
-spu_do_depth_stencil(int x, int y, qword frag_mask, qword frag_depth,
-		     qword frag_alpha, qword facing);
+
+extern void
+spu_fallback_fragment_ops(uint x, uint y,
+                          tile_t *colorTile,
+                          tile_t *depthStencilTile,
+                          vector float fragZ,
+                          vector float fragRed,
+                          vector float fragGreen,
+                          vector float fragBlue,
+                          vector float fragAlpha,
+                          vector unsigned int mask);
+
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 5051774f00c..117b8a36f80 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -97,7 +97,7 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    const qword offset_y = si_andi((qword) y, 0x1f);
 
    const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
-   const qword tile_size = (qword) spu_splats(sizeof(tile_t));
+   const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
    tile_offset = si_mpy((qword) tile_offset, tile_size);
diff --git a/src/gallium/drivers/cell/spu/spu_tile.c b/src/gallium/drivers/cell/spu/spu_tile.c
index 12dc2463283..216a33126b7 100644
--- a/src/gallium/drivers/cell/spu/spu_tile.c
+++ b/src/gallium/drivers/cell/spu/spu_tile.c
@@ -31,6 +31,9 @@
 #include "spu_main.h"
 
 
+/**
+ * Get tile of color or Z values from main memory, put into SPU memory.
+ */
 void
 get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
 {
@@ -56,6 +59,9 @@ get_tile(uint tx, uint ty, tile_t *tile, int tag, int zBuf)
 }
 
 
+/**
+ * Move tile of color or Z values from SPU memory to main memory.
+ */
 void
 put_tile(uint tx, uint ty, const tile_t *tile, int tag, int zBuf)
 {
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 2a4e0b423ca..f02cdd1f763 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -38,7 +38,6 @@
 #include "spu_texture.h"
 #include "spu_tile.h"
 #include "spu_tri.h"
-#include "spu_per_fragment_op.h"
 
 
 /** Masks are uint[4] vectors with each element being 0 or 0xffffffff */
@@ -209,7 +208,7 @@ clip_emit_quad(struct setup_stage *setup)
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
- * Eg: four colors will be compute.
+ * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
 eval_coeff(uint slot, float x, float y, vector float result[4])
@@ -255,31 +254,6 @@ eval_z(float x, float y)
 }
 
 
-static INLINE mask_t
-do_depth_test(int x, int y, mask_t quadmask)
-{
-   float4 zvals;
-   mask_t mask;
-
-   if (spu.fb.depth_format == PIPE_FORMAT_NONE)
-      return quadmask;
-
-   zvals.v = eval_z((float) x, (float) y);
-
-   mask = (mask_t) spu_do_depth_stencil(x - setup.cliprect_minx,
-					y - setup.cliprect_miny,
-					(qword) quadmask, 
-					(qword) zvals.v,
-					(qword) spu_splats((unsigned char) 0x0ffu),
-					(qword) spu_splats((unsigned int) 0x01u));
-
-   if (spu_extract(spu_orx(mask), 0))
-      spu.cur_ztile_status = TILE_STATUS_DIRTY;
-
-   return mask;
-}
-
-
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -289,18 +263,6 @@ do_depth_test(int x, int y, mask_t quadmask)
 static INLINE void
 emit_quad( int x, int y, mask_t mask )
 {
-#if 0
-   struct softpipe_context *sp = setup.softpipe;
-   setup.quad.x0 = x;
-   setup.quad.y0 = y;
-   setup.quad.mask = mask;
-   sp->quad.first->run(sp->quad.first, &setup.quad);
-#else
-
-   if (spu.read_depth) {
-      mask = do_depth_test(x, y, mask);
-   }
-
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
       const int ix = x - setup.cliprect_minx;
@@ -308,6 +270,7 @@ emit_quad( int x, int y, mask_t mask )
       vector float colors[4];
 
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
+      spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
       if (spu.texture[0].start) {
          /* texture mapping */
@@ -355,55 +318,29 @@ emit_quad( int x, int y, mask_t mask )
       }
 
 
-      /* Convert fragment data from AoS to SoA format.
-       */
-      qword soa_frag[4];
-      _transpose_matrix4x4((vec_float4 *) soa_frag, colors);
+      {
+         /* Convert fragment data from AoS to SoA format.
+          * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
+          * This is temporary!
+          */
+         vector float soa_frag[4];
+         _transpose_matrix4x4(soa_frag, colors);
 
-      /* Read the current framebuffer values.
-       */
-      const qword pix[4] = {
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+0][ix+1]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+0]),
-         (qword) spu_splats(spu.ctile.ui[iy+1][ix+1]),
-      };
+         float4 fragZ;
 
-      qword soa_pix[4];
+         fragZ.v = eval_z((float) x, (float) y);
 
-      if (spu.read_fb) {
-         /* Convert pixel data from AoS to SoA format.
+         /* Do all per-fragment/quad operations here, including:
+          *  alpha test, z test, stencil test, blend and framebuffer writing.
           */
-         vec_float4 aos_pix[4] = {
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+0][ix+1]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+0]),
-            spu_unpack_A8R8G8B8(spu.ctile.ui[iy+1][ix+1]),
-         };
-
-         _transpose_matrix4x4((vec_float4 *) soa_pix, aos_pix);
+         spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
+                          fragZ.v,
+                          soa_frag[0], soa_frag[1],
+                          soa_frag[2], soa_frag[3],
+                          mask);
       }
 
-
-      struct spu_blend_results result =
-          (*spu.blend)(soa_frag[0], soa_frag[1], soa_frag[2], soa_frag[3],
-                       soa_pix[0], soa_pix[1], soa_pix[2], soa_pix[3],
-                       spu.const_blend_color[0], spu.const_blend_color[1],
-                       spu.const_blend_color[2], spu.const_blend_color[3]);
-
-
-      /* Convert final pixel data from SoA to AoS format.
-       */
-      result = (*spu.logicop)(pix[0], pix[1], pix[2], pix[3],
-                              result.r, result.g, result.b, result.a,
-                              (qword) mask);
-
-      spu.ctile.ui[iy+0][ix+0] = spu_extract((vec_uint4) result.r, 0);
-      spu.ctile.ui[iy+0][ix+1] = spu_extract((vec_uint4) result.g, 0);
-      spu.ctile.ui[iy+1][ix+0] = spu_extract((vec_uint4) result.b, 0);
-      spu.ctile.ui[iy+1][ix+1] = spu_extract((vec_uint4) result.a, 0);
    }
-#endif
 }
 
 
diff --git a/src/gallium/drivers/cell/spu/spu_util.c b/src/gallium/drivers/cell/spu/spu_util.c
index b25ca4eafc0..b8a0d4a265f 100644
--- a/src/gallium/drivers/cell/spu/spu_util.c
+++ b/src/gallium/drivers/cell/spu/spu_util.c
@@ -1,4 +1,5 @@
 
+#include "cell/common.h"
 #include "pipe/p_shader_tokens.h"
 #include "pipe/p_debug.h"
 #include "tgsi/tgsi_parse.h"
@@ -20,7 +21,7 @@ tgsi_util_get_src_register_swizzle(
    case 3:
       return reg->SwizzleW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -40,7 +41,7 @@ tgsi_util_get_src_register_extswizzle(
    case 3:
       return reg->ExtSwizzleW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -60,12 +61,12 @@ tgsi_util_get_full_src_register_extswizzle(
       &reg->SrcRegisterExtSwz,
       component );
 
-   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
-   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
-   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
-   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
-   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+   ASSERT (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   ASSERT (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   ASSERT (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   ASSERT (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   ASSERT (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
 
    /*
     * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
@@ -95,7 +96,7 @@ tgsi_util_get_src_register_extnegate(
    case 3:
       return reg->NegateW;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
    return 0;
 }
@@ -120,7 +121,7 @@ tgsi_util_set_src_register_extnegate(
       reg->NegateW = negate;
       break;
    default:
-      assert( 0 );
+      ASSERT( 0 );
    }
 }
 
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
index 26f23637492..03375d84a57 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_fetch.c
@@ -92,7 +92,7 @@ static void generic_vertex_fetch(struct spu_vs_context *draw,
    unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
    unsigned attr;
 
-   assert(count <= 4);
+   ASSERT(count <= 4);
 
 #if DRAW_DBG
    printf("SPU: %s count = %u, nr_attrs = %u\n", 
diff --git a/src/gallium/drivers/cell/spu/spu_vertex_shader.c b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
index f81d19fea1c..fbe5b34d397 100644
--- a/src/gallium/drivers/cell/spu/spu_vertex_shader.c
+++ b/src/gallium/drivers/cell/spu/spu_vertex_shader.c
@@ -112,7 +112,7 @@ run_vertex_program(struct spu_vs_context *draw,
    const float *scale = draw->viewport.scale;
    const float *trans = draw->viewport.translate;
 
-   assert(count <= 4);
+   ASSERT(count <= 4);
 
    machine->Processor = TGSI_PROCESSOR_VERTEX;
 
diff --git a/src/gallium/drivers/i915simple/i915_context.c b/src/gallium/drivers/i915simple/i915_context.c
index c6776716a2f..6dd3eda85dc 100644
--- a/src/gallium/drivers/i915simple/i915_context.c
+++ b/src/gallium/drivers/i915simple/i915_context.c
@@ -35,6 +35,7 @@
 #include "draw/draw_context.h"
 #include "pipe/p_defines.h"
 #include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "pipe/p_screen.h"
 
@@ -72,7 +73,7 @@ i915_draw_range_elements(struct pipe_context *pipe,
     */
    for (i = 0; i < i915->num_vertex_buffers; i++) {
       void *buf
-         = pipe->winsys->buffer_map(pipe->winsys,
+         = pipe_buffer_map(pipe->screen,
                                     i915->vertex_buffer[i].buffer,
                                     PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_vertex_buffer(draw, i, buf);
@@ -80,7 +81,7 @@ i915_draw_range_elements(struct pipe_context *pipe,
    /* Map index buffer, if present */
    if (indexBuffer) {
       void *mapped_indexes
-         = pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
+         = pipe_buffer_map(pipe->screen, indexBuffer,
                                     PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_element_buffer_range(draw, indexSize,
 					   min_index,
@@ -105,11 +106,11 @@ i915_draw_range_elements(struct pipe_context *pipe,
     * unmap vertex/index buffers
     */
    for (i = 0; i < i915->num_vertex_buffers; i++) {
-      pipe->winsys->buffer_unmap(pipe->winsys, i915->vertex_buffer[i].buffer);
+      pipe_buffer_unmap(pipe->screen, i915->vertex_buffer[i].buffer);
       draw_set_mapped_vertex_buffer(draw, i, NULL);
    }
    if (indexBuffer) {
-      pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+      pipe_buffer_unmap(pipe->screen, indexBuffer);
       draw_set_mapped_element_buffer_range(draw, 0, start, start + count - 1, NULL);
    }
 
diff --git a/src/gallium/drivers/i915simple/i915_prim_vbuf.c b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
index e4ece550985..4fda1ab64f5 100644
--- a/src/gallium/drivers/i915simple/i915_prim_vbuf.c
+++ b/src/gallium/drivers/i915simple/i915_prim_vbuf.c
@@ -115,7 +115,7 @@ i915_vbuf_render_allocate_vertices( struct vbuf_render *render,
 {
    struct i915_vbuf_render *i915_render = i915_vbuf_render(render);
    struct i915_context *i915 = i915_render->i915;
-   struct pipe_winsys *winsys = i915->pipe.winsys;
+   struct pipe_screen *screen = i915->pipe.screen;
    size_t size = (size_t)vertex_size * (size_t)nr_vertices;
 
    /* FIXME: handle failure */
@@ -124,20 +124,20 @@ i915_vbuf_render_allocate_vertices( struct vbuf_render *render,
    if (i915_render->vbo_size > size + i915_render->vbo_offset && !i915->vbo_flushed) {
    } else {
       i915->vbo_flushed = 0;
-      pipe_buffer_reference(winsys, &i915_render->vbo, NULL);
+      pipe_buffer_reference(screen, &i915_render->vbo, NULL);
    }
 
    if (!i915_render->vbo) {
       i915_render->vbo_size = MAX2(size, i915_render->vbo_alloc_size);
       i915_render->vbo_offset = 0;
-      i915_render->vbo = winsys->buffer_create(winsys,
-					       64,
-					       I915_BUFFER_USAGE_LIT_VERTEX,
-					       i915_render->vbo_size);
-      i915_render->vbo_ptr = winsys->buffer_map(winsys,
-						i915_render->vbo,
-						PIPE_BUFFER_USAGE_CPU_WRITE);
-      winsys->buffer_unmap(winsys, i915_render->vbo);
+      i915_render->vbo = pipe_buffer_create(screen,
+                                            64,
+                                            I915_BUFFER_USAGE_LIT_VERTEX,
+                                            i915_render->vbo_size);
+      i915_render->vbo_ptr = pipe_buffer_map(screen,
+                                             i915_render->vbo,
+                                             PIPE_BUFFER_USAGE_CPU_WRITE);
+      pipe_buffer_unmap(screen, i915_render->vbo);
    }
 
    i915->vbo = i915_render->vbo;
@@ -488,7 +488,7 @@ static struct vbuf_render *
 i915_vbuf_render_create( struct i915_context *i915 )
 {
    struct i915_vbuf_render *i915_render = CALLOC_STRUCT(i915_vbuf_render);
-   struct pipe_winsys *winsys = i915->pipe.winsys;
+   struct pipe_screen *screen = i915->pipe.screen;
 
    i915_render->i915 = i915;
    
@@ -510,14 +510,14 @@ i915_vbuf_render_create( struct i915_context *i915 )
    i915_render->vbo_alloc_size = 128 * 4096;
    i915_render->vbo_size = i915_render->vbo_alloc_size;
    i915_render->vbo_offset = 0;
-   i915_render->vbo = winsys->buffer_create(winsys,
-					    64,
-					    I915_BUFFER_USAGE_LIT_VERTEX,
-					    i915_render->vbo_size);
-   i915_render->vbo_ptr = winsys->buffer_map(winsys,
-					     i915_render->vbo,
-					     PIPE_BUFFER_USAGE_CPU_WRITE);
-   winsys->buffer_unmap(winsys, i915_render->vbo);
+   i915_render->vbo = pipe_buffer_create(screen,
+                                         64,
+                                         I915_BUFFER_USAGE_LIT_VERTEX,
+                                         i915_render->vbo_size);
+   i915_render->vbo_ptr = pipe_buffer_map(screen,
+                                          i915_render->vbo,
+                                          PIPE_BUFFER_USAGE_CPU_WRITE);
+   pipe_buffer_unmap(screen, i915_render->vbo);
 
    return &i915_render->base;
 }
diff --git a/src/gallium/drivers/i915simple/i915_screen.c b/src/gallium/drivers/i915simple/i915_screen.c
index e9e40c3f0b2..1c976082df7 100644
--- a/src/gallium/drivers/i915simple/i915_screen.c
+++ b/src/gallium/drivers/i915simple/i915_screen.c
@@ -28,6 +28,7 @@
 
 #include "util/u_memory.h"
 #include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
 #include "util/u_string.h"
 
 #include "i915_reg.h"
@@ -207,7 +208,7 @@ i915_surface_map( struct pipe_screen *screen,
                   struct pipe_surface *surface,
                   unsigned flags )
 {
-   char *map = screen->winsys->buffer_map( screen->winsys, surface->buffer, flags );
+   char *map = pipe_buffer_map( screen, surface->buffer, flags );
    if (map == NULL)
       return NULL;
 
@@ -226,7 +227,7 @@ static void
 i915_surface_unmap(struct pipe_screen *screen,
                    struct pipe_surface *surface)
 {
-   screen->winsys->buffer_unmap( screen->winsys, surface->buffer );
+   pipe_buffer_unmap( screen, surface->buffer );
 }
 
 
diff --git a/src/gallium/drivers/i915simple/i915_texture.c b/src/gallium/drivers/i915simple/i915_texture.c
index 32344da4d5a..bd87217063c 100644
--- a/src/gallium/drivers/i915simple/i915_texture.c
+++ b/src/gallium/drivers/i915simple/i915_texture.c
@@ -80,7 +80,7 @@ static unsigned
 power_of_two(unsigned x)
 {
    unsigned value = 1;
-   while (value <= x)
+   while (value < x)
       value = value << 1;
    return value;
 }
@@ -207,7 +207,7 @@ i945_miptree_layout_2d( struct i915_texture *tex )
    unsigned nblocksy = pt->nblocksy[0];
 
 #if 0 /* used for tiled display targets */
-   if (pt->last_level == 0 && pt->cpp == 4)
+   if (pt->last_level == 0 && pt->block.size == 4)
       if (i915_displaytarget_layout(tex))
 	 return;
 #endif
@@ -645,7 +645,7 @@ i915_texture_release(struct pipe_screen *screen,
       DBG("%s deleting %p\n", __FUNCTION__, (void *) tex);
       */
 
-      pipe_buffer_reference(screen->winsys, &tex->buffer, NULL);
+      pipe_buffer_reference(screen, &tex->buffer, NULL);
 
       for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
          if (tex->image_offset[i])
@@ -684,7 +684,7 @@ i915_get_tex_surface(struct pipe_screen *screen,
       ps->refcount = 1;
       ps->winsys = ws;
       pipe_texture_reference(&ps->texture, pt);
-      pipe_buffer_reference(ws, &ps->buffer, tex->buffer);
+      pipe_buffer_reference(screen, &ps->buffer, tex->buffer);
       ps->format = pt->format;
       ps->width = pt->width[level];
       ps->height = pt->height[level];
@@ -728,7 +728,7 @@ i915_texture_blanket(struct pipe_screen * screen,
    i915_miptree_set_level_info(tex, 0, 1, base->width[0], base->height[0], 1);
    i915_miptree_set_image_offset(tex, 0, 0, 0, 0);
 
-   pipe_buffer_reference(screen->winsys, &tex->buffer, buffer);
+   pipe_buffer_reference(screen, &tex->buffer, buffer);
 
    return &tex->base;
 }
@@ -756,7 +756,7 @@ i915_tex_surface_release(struct pipe_screen *screen,
       }
 
       pipe_texture_reference(&surf->texture, NULL);
-      pipe_buffer_reference(screen->winsys, &surf->buffer, NULL);
+      pipe_buffer_reference(screen, &surf->buffer, NULL);
       FREE(surf);
    }
 
diff --git a/src/gallium/drivers/i965simple/brw_state_pool.c b/src/gallium/drivers/i965simple/brw_state_pool.c
index 78d4c0e411b..007dc8f9deb 100644
--- a/src/gallium/drivers/i965simple/brw_state_pool.c
+++ b/src/gallium/drivers/i965simple/brw_state_pool.c
@@ -92,10 +92,10 @@ static void brw_init_pool( struct brw_context *brw,
    pool->size = size;
    pool->brw = brw;
 
-   pool->buffer = brw->pipe.winsys->buffer_create(brw->pipe.winsys,
-						  4096,
-                                                  0 /*  DRM_BO_FLAG_MEM_TT */,
-                                                  size);
+   pool->buffer = pipe_buffer_create(brw->pipe.screen,
+                                     4096,
+                                     0 /*  DRM_BO_FLAG_MEM_TT */,
+                                     size);
 }
 
 static void brw_destroy_pool( struct brw_context *brw,
@@ -103,7 +103,7 @@ static void brw_destroy_pool( struct brw_context *brw,
 {
    struct brw_mem_pool *pool = &brw->pool[pool_id];
 
-   pipe_buffer_reference( pool->brw->pipe.winsys,
+   pipe_buffer_reference( pool->brw->pipe.screen,
 			  &pool->buffer,
 			  NULL );
 }
diff --git a/src/gallium/drivers/i965simple/brw_tex_layout.c b/src/gallium/drivers/i965simple/brw_tex_layout.c
index 05eda9d1f26..cc0c665e021 100644
--- a/src/gallium/drivers/i965simple/brw_tex_layout.c
+++ b/src/gallium/drivers/i965simple/brw_tex_layout.c
@@ -330,7 +330,7 @@ brw_texture_release_screen(struct pipe_screen *screen,
       DBG("%s deleting %p\n", __FUNCTION__, (void *) tex);
       */
 
-      pipe_buffer_reference(ws, &tex->buffer, NULL);
+      winsys_buffer_reference(ws, &tex->buffer, NULL);
 
       for (i = 0; i < PIPE_MAX_TEXTURE_LEVELS; i++)
          if (tex->image_offset[i])
@@ -369,7 +369,7 @@ brw_get_tex_surface_screen(struct pipe_screen *screen,
    if (ps) {
       assert(ps->format);
       assert(ps->refcount);
-      pipe_buffer_reference(ws, &ps->buffer, tex->buffer);
+      winsys_buffer_reference(ws, &ps->buffer, tex->buffer);
       ps->format = pt->format;
       ps->width = pt->width[level];
       ps->height = pt->height[level];
diff --git a/src/gallium/drivers/softpipe/sp_clear.c b/src/gallium/drivers/softpipe/sp_clear.c
index 12367068917..dfa46c9fb70 100644
--- a/src/gallium/drivers/softpipe/sp_clear.c
+++ b/src/gallium/drivers/softpipe/sp_clear.c
@@ -31,6 +31,7 @@
 
 
 #include "pipe/p_defines.h"
+#include "util/u_pack_color.h"
 #include "sp_clear.h"
 #include "sp_context.h"
 #include "sp_surface.h"
@@ -39,8 +40,28 @@
 
 
 /**
+ * Convert packed pixel from one format to another.
+ */
+static unsigned
+convert_color(enum pipe_format srcFormat, unsigned srcColor,
+              enum pipe_format dstFormat)
+{
+   ubyte r, g, b, a;
+   unsigned dstColor;
+
+   util_unpack_color_ub(srcFormat, &srcColor, &r, &g, &b, &a);
+   util_pack_color_ub(r, g, b, a, dstFormat, &dstColor);
+
+   return dstColor;
+}
+
+
+
+/**
  * Clear the given surface to the specified value.
  * No masking, no scissor (clear entire buffer).
+ * Note: when clearing a color buffer, the clearValue is always
+ * encoded as PIPE_FORMAT_A8R8G8B8_UNORM.
  */
 void
 softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps,
@@ -66,7 +87,15 @@ softpipe_clear(struct pipe_context *pipe, struct pipe_surface *ps,
 
    for (i = 0; i < softpipe->framebuffer.num_cbufs; i++) {
       if (ps == sp_tile_cache_get_surface(softpipe->cbuf_cache[i])) {
-         sp_tile_cache_clear(softpipe->cbuf_cache[i], clearValue);
+         unsigned cv;
+         if (ps->format != PIPE_FORMAT_A8R8G8B8_UNORM) {
+            cv = convert_color(PIPE_FORMAT_A8R8G8B8_UNORM, clearValue,
+                               ps->format);
+         }
+         else {
+            cv = clearValue;
+         }
+         sp_tile_cache_clear(softpipe->cbuf_cache[i], cv);
          softpipe->framebuffer.cbufs[i]->status = PIPE_SURFACE_STATUS_CLEAR;
       }
    }
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index dda90f760a3..cd1e6663d86 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -92,17 +92,19 @@ static void softpipe_destroy( struct pipe_context *pipe )
    if (softpipe->draw)
       draw_destroy( softpipe->draw );
 
-   softpipe->quad.polygon_stipple->destroy( softpipe->quad.polygon_stipple );
-   softpipe->quad.earlyz->destroy( softpipe->quad.earlyz );
-   softpipe->quad.shade->destroy( softpipe->quad.shade );
-   softpipe->quad.alpha_test->destroy( softpipe->quad.alpha_test );
-   softpipe->quad.depth_test->destroy( softpipe->quad.depth_test );
-   softpipe->quad.stencil_test->destroy( softpipe->quad.stencil_test );
-   softpipe->quad.occlusion->destroy( softpipe->quad.occlusion );
-   softpipe->quad.coverage->destroy( softpipe->quad.coverage );
-   softpipe->quad.blend->destroy( softpipe->quad.blend );
-   softpipe->quad.colormask->destroy( softpipe->quad.colormask );
-   softpipe->quad.output->destroy( softpipe->quad.output );
+   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
+      softpipe->quad[i].polygon_stipple->destroy( softpipe->quad[i].polygon_stipple );
+      softpipe->quad[i].earlyz->destroy( softpipe->quad[i].earlyz );
+      softpipe->quad[i].shade->destroy( softpipe->quad[i].shade );
+      softpipe->quad[i].alpha_test->destroy( softpipe->quad[i].alpha_test );
+      softpipe->quad[i].depth_test->destroy( softpipe->quad[i].depth_test );
+      softpipe->quad[i].stencil_test->destroy( softpipe->quad[i].stencil_test );
+      softpipe->quad[i].occlusion->destroy( softpipe->quad[i].occlusion );
+      softpipe->quad[i].coverage->destroy( softpipe->quad[i].coverage );
+      softpipe->quad[i].blend->destroy( softpipe->quad[i].blend );
+      softpipe->quad[i].colormask->destroy( softpipe->quad[i].colormask );
+      softpipe->quad[i].output->destroy( softpipe->quad[i].output );
+   }
 
    for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++)
       sp_destroy_tile_cache(softpipe->cbuf_cache[i]);
@@ -113,7 +115,7 @@ static void softpipe_destroy( struct pipe_context *pipe )
 
    for (i = 0; i < Elements(softpipe->constants); i++) {
       if (softpipe->constants[i].buffer) {
-         pipe_buffer_reference(ws, &softpipe->constants[i].buffer, NULL);
+         winsys_buffer_reference(ws, &softpipe->constants[i].buffer, NULL);
       }
    }
 
@@ -205,17 +207,19 @@ softpipe_create( struct pipe_screen *screen,
 
 
    /* setup quad rendering stages */
-   softpipe->quad.polygon_stipple = sp_quad_polygon_stipple_stage(softpipe);
-   softpipe->quad.earlyz = sp_quad_earlyz_stage(softpipe);
-   softpipe->quad.shade = sp_quad_shade_stage(softpipe);
-   softpipe->quad.alpha_test = sp_quad_alpha_test_stage(softpipe);
-   softpipe->quad.depth_test = sp_quad_depth_test_stage(softpipe);
-   softpipe->quad.stencil_test = sp_quad_stencil_test_stage(softpipe);
-   softpipe->quad.occlusion = sp_quad_occlusion_stage(softpipe);
-   softpipe->quad.coverage = sp_quad_coverage_stage(softpipe);
-   softpipe->quad.blend = sp_quad_blend_stage(softpipe);
-   softpipe->quad.colormask = sp_quad_colormask_stage(softpipe);
-   softpipe->quad.output = sp_quad_output_stage(softpipe);
+   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
+      softpipe->quad[i].polygon_stipple = sp_quad_polygon_stipple_stage(softpipe);
+      softpipe->quad[i].earlyz = sp_quad_earlyz_stage(softpipe);
+      softpipe->quad[i].shade = sp_quad_shade_stage(softpipe);
+      softpipe->quad[i].alpha_test = sp_quad_alpha_test_stage(softpipe);
+      softpipe->quad[i].depth_test = sp_quad_depth_test_stage(softpipe);
+      softpipe->quad[i].stencil_test = sp_quad_stencil_test_stage(softpipe);
+      softpipe->quad[i].occlusion = sp_quad_occlusion_stage(softpipe);
+      softpipe->quad[i].coverage = sp_quad_coverage_stage(softpipe);
+      softpipe->quad[i].blend = sp_quad_blend_stage(softpipe);
+      softpipe->quad[i].colormask = sp_quad_colormask_stage(softpipe);
+      softpipe->quad[i].output = sp_quad_output_stage(softpipe);
+   }
 
    /*
     * Create drawing context and plug our rendering stage into it.
@@ -257,3 +261,4 @@ softpipe_create( struct pipe_screen *screen,
    softpipe_destroy(&softpipe->pipe);
    return NULL;
 }
+
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index 078886f93c9..2b9a2a8ee52 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -45,6 +45,10 @@
  */
 #define USE_DRAW_STAGE_PSTIPPLE 1
 
+/* Number of threads working on individual quads.
+ * Setting to 1 disables this feature.
+ */
+#define SP_NUM_QUAD_THREADS 1
 
 struct softpipe_winsys;
 struct softpipe_vbuf_render;
@@ -133,7 +137,7 @@ struct softpipe_context {
       struct quad_stage *output;
 
       struct quad_stage *first; /**< points to one of the above stages */
-   } quad;
+   } quad[SP_NUM_QUAD_THREADS];
 
    /** The primitive drawing context */
    struct draw_context *draw;
@@ -151,13 +155,11 @@ struct softpipe_context {
 };
 
 
-
-
 static INLINE struct softpipe_context *
 softpipe_context( struct pipe_context *pipe )
 {
    return (struct softpipe_context *)pipe;
 }
 
-
 #endif /* SP_CONTEXT_H */
+
diff --git a/src/gallium/drivers/softpipe/sp_draw_arrays.c b/src/gallium/drivers/softpipe/sp_draw_arrays.c
index 12b44a82118..424bd568460 100644
--- a/src/gallium/drivers/softpipe/sp_draw_arrays.c
+++ b/src/gallium/drivers/softpipe/sp_draw_arrays.c
@@ -34,6 +34,7 @@
 #include "pipe/p_defines.h"
 #include "pipe/p_context.h"
 #include "pipe/p_winsys.h"
+#include "pipe/p_inlines.h"
 
 #include "sp_context.h"
 #include "sp_state.h"
@@ -135,7 +136,7 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
     */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
       void *buf
-         = pipe->winsys->buffer_map(pipe->winsys,
+         = pipe_buffer_map(pipe->screen,
                                     sp->vertex_buffer[i].buffer,
                                     PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_vertex_buffer(draw, i, buf);
@@ -143,7 +144,7 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
    /* Map index buffer, if present */
    if (indexBuffer) {
       void *mapped_indexes
-         = pipe->winsys->buffer_map(pipe->winsys, indexBuffer,
+         = pipe_buffer_map(pipe->screen, indexBuffer,
                                     PIPE_BUFFER_USAGE_CPU_READ);
       draw_set_mapped_element_buffer_range(draw, indexSize,
                                            min_index,
@@ -164,11 +165,11 @@ softpipe_draw_range_elements(struct pipe_context *pipe,
     */
    for (i = 0; i < sp->num_vertex_buffers; i++) {
       draw_set_mapped_vertex_buffer(draw, i, NULL);
-      pipe->winsys->buffer_unmap(pipe->winsys, sp->vertex_buffer[i].buffer);
+      pipe_buffer_unmap(pipe->screen, sp->vertex_buffer[i].buffer);
    }
    if (indexBuffer) {
       draw_set_mapped_element_buffer(draw, 0, NULL);
-      pipe->winsys->buffer_unmap(pipe->winsys, indexBuffer);
+      pipe_buffer_unmap(pipe->screen, indexBuffer);
    }
 
 
diff --git a/src/gallium/drivers/softpipe/sp_flush.c b/src/gallium/drivers/softpipe/sp_flush.c
index e03994b63b7..401764bb439 100644
--- a/src/gallium/drivers/softpipe/sp_flush.c
+++ b/src/gallium/drivers/softpipe/sp_flush.c
@@ -73,6 +73,19 @@ softpipe_flush( struct pipe_context *pipe,
       softpipe_unmap_surfaces(softpipe);
    }
 
+   /* Enable to dump BMPs of the color/depth buffers each frame */
+#if 0
+   if(flags & PIPE_FLUSH_FRAME) {
+      static unsigned frame_no = 1;
+      static char filename[256];
+      util_snprintf(filename, sizeof(filename), "cbuf_%u.bmp", frame_no);
+      debug_dump_surface_bmp(filename, softpipe->framebuffer.cbufs[0]);
+      util_snprintf(filename, sizeof(filename), "zsbuf_%u.bmp", frame_no);
+      debug_dump_surface_bmp(filename, softpipe->framebuffer.zsbuf);
+      ++frame_no;
+   }
+#endif
+   
    if (fence)
       *fence = NULL;
 }
diff --git a/src/gallium/drivers/softpipe/sp_fs_exec.c b/src/gallium/drivers/softpipe/sp_fs_exec.c
index d0456731bea..701ee4c72f2 100644
--- a/src/gallium/drivers/softpipe/sp_fs_exec.c
+++ b/src/gallium/drivers/softpipe/sp_fs_exec.c
@@ -106,7 +106,7 @@ exec_run( const struct sp_fragment_shader *base,
 
    /* Compute X, Y, Z, W vals for this quad */
    sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->x0, (float)quad->y0, 
+		       (float)quad->input.x0, (float)quad->input.y0, 
 		       &machine->QuadPos);
    
    return tgsi_exec_machine_run( machine );
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 35653a8e48c..496ed43df26 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -88,7 +88,7 @@ fs_sse_run( const struct sp_fragment_shader *base,
 
    /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
    sp_setup_pos_vector(quad->posCoef, 
-		       (float)quad->x0, (float)quad->y0, 
+		       (float)quad->input.x0, (float)quad->input.y0, 
 		       machine->Temps);
 
    /* init kill mask */
diff --git a/src/gallium/drivers/softpipe/sp_headers.h b/src/gallium/drivers/softpipe/sp_headers.h
index ae2ee210fc9..4a42cb3c192 100644
--- a/src/gallium/drivers/softpipe/sp_headers.h
+++ b/src/gallium/drivers/softpipe/sp_headers.h
@@ -59,20 +59,31 @@
  * Encodes everything we need to know about a 2x2 pixel block.  Uses
  * "Channel-Serial" or "SoA" layout.  
  */
-struct quad_header {
+struct quad_header_input
+{
    int x0;
    int y0;
-   unsigned mask:4;
+   float coverage[QUAD_SIZE];    /** fragment coverage for antialiasing */
    unsigned facing:1;   /**< Front (0) or back (1) facing? */
    unsigned prim:2;     /**< PRIM_POINT, LINE, TRI */
+};
+
+struct quad_header_inout
+{
+   unsigned mask:4;
+};
 
-   struct {
-      /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
-      float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
-      float depth[QUAD_SIZE];
-   } outputs;
+struct quad_header_output
+{
+   /** colors in SOA format (rrrr, gggg, bbbb, aaaa) */
+   float color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS][QUAD_SIZE];
+   float depth[QUAD_SIZE];
+};
 
-   float coverage[QUAD_SIZE];    /** fragment coverage for antialiasing */
+struct quad_header {
+   struct quad_header_input input;
+   struct quad_header_inout inout;
+   struct quad_header_output output;
 
    const struct tgsi_interp_coef *coef;
    const struct tgsi_interp_coef *posCoef;
@@ -80,5 +91,5 @@ struct quad_header {
    unsigned nr_attrs;
 };
 
-
 #endif /* SP_HEADERS_H */
+
diff --git a/src/gallium/drivers/softpipe/sp_quad.c b/src/gallium/drivers/softpipe/sp_quad.c
index bc83d78ea16..892ef87ee9f 100644
--- a/src/gallium/drivers/softpipe/sp_quad.c
+++ b/src/gallium/drivers/softpipe/sp_quad.c
@@ -33,29 +33,33 @@
 static void
 sp_push_quad_first(
    struct softpipe_context *sp,
-   struct quad_stage       *quad )
+   struct quad_stage *quad,
+   uint i )
 {
-   quad->next = sp->quad.first;
-   sp->quad.first = quad;
+   quad->next = sp->quad[i].first;
+   sp->quad[i].first = quad;
 }
 
 static void
 sp_build_depth_stencil(
-   struct softpipe_context *sp )
+   struct softpipe_context *sp,
+   uint i )
 {
    if (sp->depth_stencil->stencil[0].enabled ||
        sp->depth_stencil->stencil[1].enabled) {
-      sp_push_quad_first( sp, sp->quad.stencil_test );
+      sp_push_quad_first( sp, sp->quad[i].stencil_test, i );
    }
    else if (sp->depth_stencil->depth.enabled &&
             sp->framebuffer.zsbuf) {
-      sp_push_quad_first( sp, sp->quad.depth_test );
+      sp_push_quad_first( sp, sp->quad[i].depth_test, i );
    }
 }
 
 void
 sp_build_quad_pipeline(struct softpipe_context *sp)
 {
+   uint i;
+
    boolean early_depth_test =
                sp->depth_stencil->depth.enabled &&
                sp->framebuffer.zsbuf &&
@@ -64,49 +68,51 @@ sp_build_quad_pipeline(struct softpipe_context *sp)
                !sp->fs->info.writes_z;
 
    /* build up the pipeline in reverse order... */
-
-   sp->quad.first = sp->quad.output;
-
-   if (sp->blend->colormask != 0xf) {
-      sp_push_quad_first( sp, sp->quad.colormask );
-   }
-
-   if (sp->blend->blend_enable ||
-       sp->blend->logicop_enable) {
-      sp_push_quad_first( sp, sp->quad.blend );
-   }
-
-   if (sp->depth_stencil->depth.occlusion_count) {
-      sp_push_quad_first( sp, sp->quad.occlusion );
-   }
-
-   if (sp->rasterizer->poly_smooth ||
-       sp->rasterizer->line_smooth ||
-       sp->rasterizer->point_smooth) {
-      sp_push_quad_first( sp, sp->quad.coverage );
-   }
-
-   if (!early_depth_test) {
-      sp_build_depth_stencil( sp );
-   }
-
-   if (sp->depth_stencil->alpha.enabled) {
-      sp_push_quad_first( sp, sp->quad.alpha_test );
-   }
-
-   /* XXX always enable shader? */
-   if (1) {
-      sp_push_quad_first( sp, sp->quad.shade );
-   }
-
-   if (early_depth_test) {
-      sp_build_depth_stencil( sp );
-      sp_push_quad_first( sp, sp->quad.earlyz );
-   }
+   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
+      sp->quad[i].first = sp->quad[i].output;
+
+      if (sp->blend->colormask != 0xf) {
+         sp_push_quad_first( sp, sp->quad[i].colormask, i );
+      }
+
+      if (sp->blend->blend_enable ||
+          sp->blend->logicop_enable) {
+         sp_push_quad_first( sp, sp->quad[i].blend, i );
+      }
+
+      if (sp->depth_stencil->depth.occlusion_count) {
+         sp_push_quad_first( sp, sp->quad[i].occlusion, i );
+      }
+
+      if (sp->rasterizer->poly_smooth ||
+          sp->rasterizer->line_smooth ||
+          sp->rasterizer->point_smooth) {
+         sp_push_quad_first( sp, sp->quad[i].coverage, i );
+      }
+
+      if (!early_depth_test) {
+         sp_build_depth_stencil( sp, i );
+      }
+
+      if (sp->depth_stencil->alpha.enabled) {
+         sp_push_quad_first( sp, sp->quad[i].alpha_test, i );
+      }
+
+      /* XXX always enable shader? */
+      if (1) {
+         sp_push_quad_first( sp, sp->quad[i].shade, i );
+      }
+
+      if (early_depth_test) {
+         sp_build_depth_stencil( sp, i );
+         sp_push_quad_first( sp, sp->quad[i].earlyz, i );
+      }
 
 #if !USE_DRAW_STAGE_PSTIPPLE
-   if (sp->rasterizer->poly_stipple_enable) {
-      sp_push_quad_first( sp, sp->quad.polygon_stipple );
-   }
+      if (sp->rasterizer->poly_stipple_enable) {
+         sp_push_quad_first( sp, sp->quad[i].polygon_stipple, i );
+      }
 #endif
+   }
 }
+
diff --git a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
index 7d3580fb4f2..5bebd141e92 100644
--- a/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_alpha_test.c
@@ -17,11 +17,10 @@ alpha_test_quad(struct quad_stage *qs, struct quad_header *quad)
    const float ref = softpipe->depth_stencil->alpha.ref;
    unsigned passMask = 0x0, j;
    const uint cbuf = 0; /* only output[0].alpha is tested */
-   const float *aaaa = quad->outputs.color[cbuf][3];
+   const float *aaaa = quad->output.color[cbuf][3];
 
    switch (softpipe->depth_stencil->alpha.func) {
    case PIPE_FUNC_NEVER:
-      quad->mask = 0x0;
       break;
    case PIPE_FUNC_LESS:
       /*
@@ -76,9 +75,9 @@ alpha_test_quad(struct quad_stage *qs, struct quad_header *quad)
       assert(0);
    }
 
-   quad->mask &= passMask;
+   quad->inout.mask &= passMask;
 
-   if (quad->mask)
+   if (quad->inout.mask)
       qs->next->run(qs->next, quad);
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_blend.c b/src/gallium/drivers/softpipe/sp_quad_blend.c
index a834accb863..6f64c6e584c 100644
--- a/src/gallium/drivers/softpipe/sp_quad_blend.c
+++ b/src/gallium/drivers/softpipe/sp_quad_blend.c
@@ -114,14 +114,14 @@ logicop_quad(struct quad_stage *qs, struct quad_header *quad)
       struct softpipe_cached_tile *
          tile = sp_get_cached_tile(softpipe,
                                    softpipe->cbuf_cache[cbuf],
-                                   quad->x0, quad->y0);
-      float (*quadColor)[4] = quad->outputs.color[cbuf];
+                                   quad->input.x0, quad->input.y0);
+      float (*quadColor)[4] = quad->output.color[cbuf];
       uint i, j;
 
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->y0 & (TILE_SIZE-1)) + (j >> 1);
+         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
+         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
          for (i = 0; i < 4; i++) {
             dest[i][j] = tile->data.color[y][x][i];
          }
@@ -244,14 +244,14 @@ blend_quad(struct quad_stage *qs, struct quad_header *quad)
       struct softpipe_cached_tile *tile
          = sp_get_cached_tile(softpipe,
                               softpipe->cbuf_cache[cbuf],
-                              quad->x0, quad->y0);
-      float (*quadColor)[4] = quad->outputs.color[cbuf];
+                              quad->input.x0, quad->input.y0);
+      float (*quadColor)[4] = quad->output.color[cbuf];
       uint i, j;
 
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->y0 & (TILE_SIZE-1)) + (j >> 1);
+         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
+         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
          for (i = 0; i < 4; i++) {
             dest[i][j] = tile->data.color[y][x][i];
          }
diff --git a/src/gallium/drivers/softpipe/sp_quad_colormask.c b/src/gallium/drivers/softpipe/sp_quad_colormask.c
index f72f31db973..f32bdfab784 100644
--- a/src/gallium/drivers/softpipe/sp_quad_colormask.c
+++ b/src/gallium/drivers/softpipe/sp_quad_colormask.c
@@ -56,14 +56,14 @@ colormask_quad(struct quad_stage *qs, struct quad_header *quad)
       struct softpipe_cached_tile *tile
          = sp_get_cached_tile(softpipe,
                               softpipe->cbuf_cache[cbuf],
-                              quad->x0, quad->y0);
-      float (*quadColor)[4] = quad->outputs.color[cbuf];
+                              quad->input.x0, quad->input.y0);
+      float (*quadColor)[4] = quad->output.color[cbuf];
       uint i, j;
 
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = (quad->x0 & (TILE_SIZE-1)) + (j & 1);
-         int y = (quad->y0 & (TILE_SIZE-1)) + (j >> 1);
+         int x = (quad->input.x0 & (TILE_SIZE-1)) + (j & 1);
+         int y = (quad->input.y0 & (TILE_SIZE-1)) + (j >> 1);
          for (i = 0; i < 4; i++) {
             dest[i][j] = tile->data.color[y][x][i];
          }
diff --git a/src/gallium/drivers/softpipe/sp_quad_coverage.c b/src/gallium/drivers/softpipe/sp_quad_coverage.c
index ad907ec25fe..ee29aa7dfea 100644
--- a/src/gallium/drivers/softpipe/sp_quad_coverage.c
+++ b/src/gallium/drivers/softpipe/sp_quad_coverage.c
@@ -47,19 +47,19 @@ coverage_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct softpipe_context *softpipe = qs->softpipe;
 
-   if ((softpipe->rasterizer->poly_smooth && quad->prim == PRIM_TRI) ||
-       (softpipe->rasterizer->line_smooth && quad->prim == PRIM_LINE) ||
-       (softpipe->rasterizer->point_smooth && quad->prim == PRIM_POINT)) {
+   if ((softpipe->rasterizer->poly_smooth && quad->input.prim == PRIM_TRI) ||
+       (softpipe->rasterizer->line_smooth && quad->input.prim == PRIM_LINE) ||
+       (softpipe->rasterizer->point_smooth && quad->input.prim == PRIM_POINT)) {
       uint cbuf;
 
       /* loop over colorbuffer outputs */
       for (cbuf = 0; cbuf < softpipe->framebuffer.num_cbufs; cbuf++) {
-         float (*quadColor)[4] = quad->outputs.color[cbuf];
+         float (*quadColor)[4] = quad->output.color[cbuf];
          unsigned j;
          for (j = 0; j < QUAD_SIZE; j++) {
-            assert(quad->coverage[j] >= 0.0);
-            assert(quad->coverage[j] <= 1.0);
-         quadColor[3][j] *= quad->coverage[j];
+            assert(quad->input.coverage[j] >= 0.0);
+            assert(quad->input.coverage[j] <= 1.0);
+         quadColor[3][j] *= quad->input.coverage[j];
          }
       }
    }
diff --git a/src/gallium/drivers/softpipe/sp_quad_depth_test.c b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
index 227cb2014e1..523bd3e0801 100644
--- a/src/gallium/drivers/softpipe/sp_quad_depth_test.c
+++ b/src/gallium/drivers/softpipe/sp_quad_depth_test.c
@@ -60,7 +60,7 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
    unsigned zmask = 0;
    unsigned j;
    struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->x0, quad->y0);
+      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
 
    assert(ps); /* shouldn't get here if there's no zbuffer */
 
@@ -79,12 +79,12 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          float scale = 65535.0;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->outputs.depth[j] * scale);
+            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             bzzzz[j] = tile->data.depth16[y][x];
          }
       }
@@ -94,12 +94,12 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          double scale = (double) (uint) ~0UL;
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->outputs.depth[j] * scale);
+            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             bzzzz[j] = tile->data.depth32[y][x];
          }
       }
@@ -111,12 +111,12 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->outputs.depth[j] * scale);
+            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             bzzzz[j] = tile->data.depth32[y][x] & 0xffffff;
          }
       }
@@ -128,12 +128,12 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          float scale = (float) ((1 << 24) - 1);
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            qzzzz[j] = (unsigned) (quad->outputs.depth[j] * scale);
+            qzzzz[j] = (unsigned) (quad->output.depth[j] * scale);
          }
 
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             bzzzz[j] = tile->data.depth32[y][x] >> 8;
          }
       }
@@ -192,14 +192,14 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
       assert(0);
    }
 
-   quad->mask &= zmask;
+   quad->inout.mask &= zmask;
 
    if (softpipe->depth_stencil->depth.writemask) {
       
       /* This is also efficient with sse / spe instructions: 
        */
       for (j = 0; j < QUAD_SIZE; j++) {
-	 if (quad->mask & (1 << j)) {
+	 if (quad->inout.mask & (1 << j)) {
 	    bzzzz[j] = qzzzz[j];
 	 }
       }
@@ -208,8 +208,8 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
       switch (format) {
       case PIPE_FORMAT_Z16_UNORM:
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             tile->data.depth16[y][x] = (ushort) bzzzz[j];
          }
          break;
@@ -218,15 +218,15 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          /* (yes, this falls through to a different case than above) */
       case PIPE_FORMAT_Z32_UNORM:
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             tile->data.depth32[y][x] = bzzzz[j];
          }
          break;
       case PIPE_FORMAT_S8Z24_UNORM:
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             uint s8z24 = tile->data.depth32[y][x];
             s8z24 = (s8z24 & 0xff000000) | bzzzz[j];
             tile->data.depth32[y][x] = s8z24;
@@ -234,8 +234,8 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          break;
       case PIPE_FORMAT_Z24S8_UNORM:
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             uint z24s8 = tile->data.depth32[y][x];
             z24s8 = (z24s8 & 0xff) | (bzzzz[j] << 8);
             tile->data.depth32[y][x] = z24s8;
@@ -243,8 +243,8 @@ sp_depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
          break;
       case PIPE_FORMAT_Z24X8_UNORM:
          for (j = 0; j < QUAD_SIZE; j++) {
-            int x = quad->x0 % TILE_SIZE + (j & 1);
-            int y = quad->y0 % TILE_SIZE + (j >> 1);
+            int x = quad->input.x0 % TILE_SIZE + (j & 1);
+            int y = quad->input.y0 % TILE_SIZE + (j >> 1);
             tile->data.depth32[y][x] = bzzzz[j] << 8;
          }
          break;
@@ -260,7 +260,7 @@ depth_test_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    sp_depth_test_quad(qs, quad);
 
-   if (quad->mask)
+   if (quad->inout.mask)
       qs->next->run(qs->next, quad);
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_earlyz.c b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
index 5a66a866993..6e2dde304ea 100644
--- a/src/gallium/drivers/softpipe/sp_quad_earlyz.c
+++ b/src/gallium/drivers/softpipe/sp_quad_earlyz.c
@@ -45,16 +45,16 @@ earlyz_quad(
    struct quad_stage    *qs,
    struct quad_header   *quad )
 {
-   const float fx = (float) quad->x0;
-   const float fy = (float) quad->y0;
+   const float fx = (float) quad->input.x0;
+   const float fy = (float) quad->input.y0;
    const float dzdx = quad->posCoef->dadx[2];
    const float dzdy = quad->posCoef->dady[2];
    const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
 
-   quad->outputs.depth[0] = z0;
-   quad->outputs.depth[1] = z0 + dzdx;
-   quad->outputs.depth[2] = z0 + dzdy;
-   quad->outputs.depth[3] = z0 + dzdx + dzdy;
+   quad->output.depth[0] = z0;
+   quad->output.depth[1] = z0 + dzdx;
+   quad->output.depth[2] = z0 + dzdy;
+   quad->output.depth[3] = z0 + dzdx + dzdy;
 
    qs->next->run( qs->next, quad );
 }
diff --git a/src/gallium/drivers/softpipe/sp_quad_fs.c b/src/gallium/drivers/softpipe/sp_quad_fs.c
index 5499ba5361f..1f0cb3e0355 100644
--- a/src/gallium/drivers/softpipe/sp_quad_fs.c
+++ b/src/gallium/drivers/softpipe/sp_quad_fs.c
@@ -85,7 +85,7 @@ shade_quad(
    machine->InterpCoefs = quad->coef;
 
    /* run shader */
-   quad->mask &= softpipe->fs->run( softpipe->fs, 
+   quad->inout.mask &= softpipe->fs->run( softpipe->fs, 
 				    &qss->machine,
 				    quad );
 
@@ -101,16 +101,16 @@ shade_quad(
          case TGSI_SEMANTIC_COLOR:
             {
                uint cbuf = sem_index[i];
-               memcpy(quad->outputs.color[cbuf],
+               memcpy(quad->output.color[cbuf],
                       &machine->Outputs[i].xyzw[0].f[0],
-                      sizeof(quad->outputs.color[0]) );
+                      sizeof(quad->output.color[0]) );
             }
             break;
          case TGSI_SEMANTIC_POSITION:
             {
                uint j;
                for (j = 0; j < 4; j++) {
-                  quad->outputs.depth[j] = machine->Outputs[0].xyzw[2].f[j];
+                  quad->output.depth[j] = machine->Outputs[0].xyzw[2].f[j];
                }
                z_written = TRUE;
             }
@@ -122,20 +122,20 @@ shade_quad(
    if (!z_written) {
       /* compute Z values now, as in the quad earlyz stage */
       /* XXX we should really only do this if the earlyz stage is not used */
-      const float fx = (float) quad->x0;
-      const float fy = (float) quad->y0;
+      const float fx = (float) quad->input.x0;
+      const float fy = (float) quad->input.y0;
       const float dzdx = quad->posCoef->dadx[2];
       const float dzdy = quad->posCoef->dady[2];
       const float z0 = quad->posCoef->a0[2] + dzdx * fx + dzdy * fy;
 
-      quad->outputs.depth[0] = z0;
-      quad->outputs.depth[1] = z0 + dzdx;
-      quad->outputs.depth[2] = z0 + dzdy;
-      quad->outputs.depth[3] = z0 + dzdx + dzdy;
+      quad->output.depth[0] = z0;
+      quad->output.depth[1] = z0 + dzdx;
+      quad->output.depth[2] = z0 + dzdy;
+      quad->output.depth[3] = z0 + dzdx + dzdy;
    }
 
    /* shader may cull fragments */
-   if( quad->mask ) {
+   if( quad->inout.mask ) {
       qs->next->run( qs->next, quad );
    }
 }
diff --git a/src/gallium/drivers/softpipe/sp_quad_occlusion.c b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
index db13e73ae35..169bd82876d 100644
--- a/src/gallium/drivers/softpipe/sp_quad_occlusion.c
+++ b/src/gallium/drivers/softpipe/sp_quad_occlusion.c
@@ -54,7 +54,7 @@ occlusion_count_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    struct softpipe_context *softpipe = qs->softpipe;
 
-   softpipe->occlusion_count += count_bits(quad->mask);
+   softpipe->occlusion_count += count_bits(quad->inout.mask);
 
    qs->next->run(qs->next, quad);
 }
diff --git a/src/gallium/drivers/softpipe/sp_quad_output.c b/src/gallium/drivers/softpipe/sp_quad_output.c
index b64646a449f..d05e12d1d95 100644
--- a/src/gallium/drivers/softpipe/sp_quad_output.c
+++ b/src/gallium/drivers/softpipe/sp_quad_output.c
@@ -41,8 +41,8 @@ static void
 output_quad(struct quad_stage *qs, struct quad_header *quad)
 {
    /* in-tile pos: */
-   const int itx = quad->x0 % TILE_SIZE;
-   const int ity = quad->y0 % TILE_SIZE;
+   const int itx = quad->input.x0 % TILE_SIZE;
+   const int ity = quad->input.y0 % TILE_SIZE;
 
    struct softpipe_context *softpipe = qs->softpipe;
    uint cbuf;
@@ -52,13 +52,13 @@ output_quad(struct quad_stage *qs, struct quad_header *quad)
       struct softpipe_cached_tile *tile
          = sp_get_cached_tile(softpipe,
                               softpipe->cbuf_cache[cbuf],
-                              quad->x0, quad->y0);
-      float (*quadColor)[4] = quad->outputs.color[cbuf];
+                              quad->input.x0, quad->input.y0);
+      float (*quadColor)[4] = quad->output.color[cbuf];
       int i, j;
 
       /* get/swizzle dest colors */
       for (j = 0; j < QUAD_SIZE; j++) {
-         if (quad->mask & (1 << j)) {
+         if (quad->inout.mask & (1 << j)) {
             int x = itx + (j & 1);
             int y = ity + (j >> 1);
             for (i = 0; i < 4; i++) { /* loop over color chans */
diff --git a/src/gallium/drivers/softpipe/sp_quad_stencil.c b/src/gallium/drivers/softpipe/sp_quad_stencil.c
index ce9562e07c6..abb54877487 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stencil.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stencil.c
@@ -206,9 +206,9 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
    ubyte ref, wrtMask, valMask;
    ubyte stencilVals[QUAD_SIZE];
    struct softpipe_cached_tile *tile
-      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->x0, quad->y0);
+      = sp_get_cached_tile(softpipe, softpipe->zsbuf_cache, quad->input.x0, quad->input.y0);
    uint j;
-   uint face = quad->facing;
+   uint face = quad->input.facing;
 
    if (!softpipe->depth_stencil->stencil[1].enabled) {
       /* single-sided stencil test, use front (face=0) state */
@@ -231,22 +231,22 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
    switch (ps->format) {
    case PIPE_FORMAT_S8Z24_UNORM:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->x0 % TILE_SIZE + (j & 1);
-         int y = quad->y0 % TILE_SIZE + (j >> 1);
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
          stencilVals[j] = tile->data.depth32[y][x] >> 24;
       }
       break;
    case PIPE_FORMAT_Z24S8_UNORM:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->x0 % TILE_SIZE + (j & 1);
-         int y = quad->y0 % TILE_SIZE + (j >> 1);
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
          stencilVals[j] = tile->data.depth32[y][x] & 0xff;
       }
       break;
    case PIPE_FORMAT_S8_UNORM:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->x0 % TILE_SIZE + (j & 1);
-         int y = quad->y0 % TILE_SIZE + (j >> 1);
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
          stencilVals[j] = tile->data.stencil8[y][x];
       }
       break;
@@ -258,35 +258,35 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
    {
       unsigned passMask, failMask;
       passMask = do_stencil_test(stencilVals, func, ref, valMask);
-      failMask = quad->mask & ~passMask;
-      quad->mask &= passMask;
+      failMask = quad->inout.mask & ~passMask;
+      quad->inout.mask &= passMask;
 
       if (failOp != PIPE_STENCIL_OP_KEEP) {
          apply_stencil_op(stencilVals, failMask, failOp, ref, wrtMask);
       }
    }
 
-   if (quad->mask) {
+   if (quad->inout.mask) {
       /* now the pixels that passed the stencil test are depth tested */
       if (softpipe->depth_stencil->depth.enabled) {
-         const unsigned origMask = quad->mask;
+         const unsigned origMask = quad->inout.mask;
 
          sp_depth_test_quad(qs, quad);  /* quad->mask is updated */
 
          /* update stencil buffer values according to z pass/fail result */
          if (zFailOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned failMask = origMask & ~quad->mask;
+            const unsigned failMask = origMask & ~quad->inout.mask;
             apply_stencil_op(stencilVals, failMask, zFailOp, ref, wrtMask);
          }
 
          if (zPassOp != PIPE_STENCIL_OP_KEEP) {
-            const unsigned passMask = origMask & quad->mask;
+            const unsigned passMask = origMask & quad->inout.mask;
             apply_stencil_op(stencilVals, passMask, zPassOp, ref, wrtMask);
          }
       }
       else {
          /* no depth test, apply Zpass operator to stencil buffer values */
-         apply_stencil_op(stencilVals, quad->mask, zPassOp, ref, wrtMask);
+         apply_stencil_op(stencilVals, quad->inout.mask, zPassOp, ref, wrtMask);
       }
 
    }
@@ -295,8 +295,8 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
    switch (ps->format) {
    case PIPE_FORMAT_S8Z24_UNORM:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->x0 % TILE_SIZE + (j & 1);
-         int y = quad->y0 % TILE_SIZE + (j >> 1);
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
          uint s8z24 = tile->data.depth32[y][x];
          s8z24 = (stencilVals[j] << 24) | (s8z24 & 0xffffff);
          tile->data.depth32[y][x] = s8z24;
@@ -304,8 +304,8 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
       break;
    case PIPE_FORMAT_Z24S8_UNORM:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->x0 % TILE_SIZE + (j & 1);
-         int y = quad->y0 % TILE_SIZE + (j >> 1);
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
          uint z24s8 = tile->data.depth32[y][x];
          z24s8 = (z24s8 & 0xffffff00) | stencilVals[j];
          tile->data.depth32[y][x] = z24s8;
@@ -313,8 +313,8 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
       break;
    case PIPE_FORMAT_S8_UNORM:
       for (j = 0; j < QUAD_SIZE; j++) {
-         int x = quad->x0 % TILE_SIZE + (j & 1);
-         int y = quad->y0 % TILE_SIZE + (j >> 1);
+         int x = quad->input.x0 % TILE_SIZE + (j & 1);
+         int y = quad->input.y0 % TILE_SIZE + (j >> 1);
          tile->data.stencil8[y][x] = stencilVals[j];
       }
       break;
@@ -322,7 +322,7 @@ stencil_test_quad(struct quad_stage *qs, struct quad_header *quad)
       assert(0);
    }
 
-   if (quad->mask)
+   if (quad->inout.mask)
       qs->next->run(qs->next, quad);
 }
 
diff --git a/src/gallium/drivers/softpipe/sp_quad_stipple.c b/src/gallium/drivers/softpipe/sp_quad_stipple.c
index a39ecc2e9d4..ccf37f6be59 100644
--- a/src/gallium/drivers/softpipe/sp_quad_stipple.c
+++ b/src/gallium/drivers/softpipe/sp_quad_stipple.c
@@ -19,17 +19,17 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
    static const uint bit31 = 1 << 31;
    static const uint bit30 = 1 << 30;
 
-   if (quad->prim == PRIM_TRI) {
+   if (quad->input.prim == PRIM_TRI) {
       struct softpipe_context *softpipe = qs->softpipe;
       /* need to invert Y to index into OpenGL's stipple pattern */
       int y0, y1;
       uint stipple0, stipple1;
       if (softpipe->rasterizer->origin_lower_left) {
-         y0 = softpipe->framebuffer.height - 1 - quad->y0;
+         y0 = softpipe->framebuffer.height - 1 - quad->input.y0;
          y1 = y0 - 1;
       }
       else {
-         y0 = quad->y0;
+         y0 = quad->input.y0;
          y1 = y0 + 1;
       }
       stipple0 = softpipe->poly_stipple.stipple[y0 % 32];
@@ -37,18 +37,18 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
 
 #if 1
       {
-      const int col0 = quad->x0 % 32;
+      const int col0 = quad->input.x0 % 32;
       if ((stipple0 & (bit31 >> col0)) == 0)
-         quad->mask &= ~MASK_TOP_LEFT;
+         quad->inout.mask &= ~MASK_TOP_LEFT;
 
       if ((stipple0 & (bit30 >> col0)) == 0)
-         quad->mask &= ~MASK_TOP_RIGHT;
+         quad->inout.mask &= ~MASK_TOP_RIGHT;
 
       if ((stipple1 & (bit31 >> col0)) == 0)
-         quad->mask &= ~MASK_BOTTOM_LEFT;
+         quad->inout.mask &= ~MASK_BOTTOM_LEFT;
 
       if ((stipple1 & (bit30 >> col0)) == 0)
-         quad->mask &= ~MASK_BOTTOM_RIGHT;
+         quad->inout.mask &= ~MASK_BOTTOM_RIGHT;
       }
 #else
       /* We'd like to use this code, but we'd need to redefine
@@ -56,11 +56,11 @@ stipple_quad(struct quad_stage *qs, struct quad_header *quad)
        * and similarly for the BOTTOM bits.  But that may have undesirable
        * side effects elsewhere.
        */
-      const int col0 = 30 - (quad->x0 % 32);
-      quad->mask &= (((stipple0 >> col0) & 0x3) | 
+      const int col0 = 30 - (quad->input.x0 % 32);
+      quad->inout.mask &= (((stipple0 >> col0) & 0x3) | 
                      (((stipple1 >> col0) & 0x3) << 2));
 #endif
-      if (!quad->mask)
+      if (!quad->inout.mask)
          return;
    }
 
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index 87336ab6e31..bc8263c33e3 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -43,6 +43,7 @@
 #include "draw/draw_private.h"
 #include "draw/draw_vertex.h"
 #include "pipe/p_shader_tokens.h"
+#include "pipe/p_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 
@@ -61,6 +62,87 @@ struct edge {
    int lines;		/**< number of lines on this edge */
 };
 
+#if SP_NUM_QUAD_THREADS > 1
+
+/* Set to 1 if you want other threads to be instantly
+ * notified of pending jobs.
+ */
+#define INSTANT_NOTEMPTY_NOTIFY 0
+
+struct thread_info
+{
+   struct setup_context *setup;
+   uint id;
+   pipe_thread handle;
+};
+
+struct quad_job;
+
+typedef void (* quad_job_routine)( struct setup_context *setup, uint thread, struct quad_job *job );
+
+struct quad_job
+{
+   struct quad_header_input input;
+   struct quad_header_inout inout;
+   quad_job_routine routine;
+};
+
+#define NUM_QUAD_JOBS 64
+
+struct quad_job_que
+{
+   struct quad_job jobs[NUM_QUAD_JOBS];
+   uint first;
+   uint last;
+   pipe_mutex que_mutex;
+   pipe_condvar que_notfull_condvar;
+   pipe_condvar que_notempty_condvar;
+   uint jobs_added;
+   uint jobs_done;
+   pipe_condvar que_done_condvar;
+};
+
+static void
+add_quad_job( struct quad_job_que *que, struct quad_header *quad, quad_job_routine routine )
+{
+#if INSTANT_NOTEMPTY_NOTIFY
+   boolean empty;
+#endif
+
+   /* Wait for empty slot, see if the que is empty.
+    */
+   pipe_mutex_lock( que->que_mutex );
+   while ((que->last + 1) % NUM_QUAD_JOBS == que->first) {
+#if !INSTANT_NOTEMPTY_NOTIFY
+      pipe_condvar_broadcast( que->que_notempty_condvar );
+#endif
+      pipe_condvar_wait( que->que_notfull_condvar, que->que_mutex );
+   }
+#if INSTANT_NOTEMPTY_NOTIFY
+   empty = que->last == que->first;
+#endif
+   que->jobs_added++;
+   pipe_mutex_unlock( que->que_mutex );
+
+   /* Submit new job.
+    */
+   que->jobs[que->last].input = quad->input;
+   que->jobs[que->last].inout = quad->inout;
+   que->jobs[que->last].routine = routine;
+   que->last = (que->last + 1) % NUM_QUAD_JOBS;
+
+#if INSTANT_NOTEMPTY_NOTIFY
+   /* If the que was empty, notify consumers there's a job to be done.
+    */
+   if (empty) {
+      pipe_mutex_lock( que->que_mutex );
+      pipe_condvar_broadcast( que->que_notempty_condvar );
+      pipe_mutex_unlock( que->que_mutex );
+   }
+#endif
+}
+
+#endif
 
 /**
  * Triangle setup info (derived from draw_stage).
@@ -88,6 +170,11 @@ struct setup_context {
    struct tgsi_interp_coef posCoef;  /* For Z, W */
    struct quad_header quad;
 
+#if SP_NUM_QUAD_THREADS > 1
+   struct quad_job_que que;
+   struct thread_info threads[SP_NUM_QUAD_THREADS];
+#endif
+
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
       int right[2];
@@ -104,7 +191,67 @@ struct setup_context {
    unsigned winding;		/* which winding to cull */
 };
 
+#if SP_NUM_QUAD_THREADS > 1
+
+static PIPE_THREAD_ROUTINE( quad_thread, param )
+{
+   struct thread_info *info = (struct thread_info *) param;
+   struct quad_job_que *que = &info->setup->que;
+
+   for (;;) {
+      struct quad_job job;
+      boolean full;
+
+      /* Wait for an available job.
+       */
+      pipe_mutex_lock( que->que_mutex );
+      while (que->last == que->first)
+         pipe_condvar_wait( que->que_notempty_condvar, que->que_mutex );
+
+      /* See if the que is full.
+       */
+      full = (que->last + 1) % NUM_QUAD_JOBS == que->first;
+
+      /* Take a job and remove it from que.
+       */
+      job = que->jobs[que->first];
+      que->first = (que->first + 1) % NUM_QUAD_JOBS;
+
+      /* Notify the producer if the que is not full.
+       */
+      if (full)
+         pipe_condvar_signal( que->que_notfull_condvar );
+      pipe_mutex_unlock( que->que_mutex );
+
+      job.routine( info->setup, info->id, &job );
+
+      /* Notify the producer if that's the last finished job.
+       */
+      pipe_mutex_lock( que->que_mutex );
+      que->jobs_done++;
+      if (que->jobs_added == que->jobs_done)
+         pipe_condvar_signal( que->que_done_condvar );
+      pipe_mutex_unlock( que->que_mutex );
+   }
+
+   return NULL;
+}
+
+#define WAIT_FOR_COMPLETION(setup) \
+   do {\
+      pipe_mutex_lock( setup->que.que_mutex );\
+      if (!INSTANT_NOTEMPTY_NOTIFY)\
+         pipe_condvar_broadcast( setup->que.que_notempty_condvar );\
+      while (setup->que.jobs_added != setup->que.jobs_done)\
+         pipe_condvar_wait( setup->que.que_done_condvar, setup->que.que_mutex );\
+      pipe_mutex_unlock( setup->que.que_mutex );\
+   } while (0)
+
+#else
+
+#define WAIT_FOR_COMPLETION(setup) ((void) 0)
 
+#endif
 
 /**
  * Test if x is NaN or +/- infinity.
@@ -143,7 +290,7 @@ static boolean cull_tri( struct setup_context *setup,
  * Clip setup->quad against the scissor/surface bounds.
  */
 static INLINE void
-quad_clip(struct setup_context *setup)
+quad_clip( struct setup_context *setup, struct quad_header *quad )
 {
    const struct pipe_scissor_state *cliprect = &setup->softpipe->cliprect;
    const int minx = (int) cliprect->minx;
@@ -151,22 +298,22 @@ quad_clip(struct setup_context *setup)
    const int miny = (int) cliprect->miny;
    const int maxy = (int) cliprect->maxy;
 
-   if (setup->quad.x0 >= maxx ||
-       setup->quad.y0 >= maxy ||
-       setup->quad.x0 + 1 < minx ||
-       setup->quad.y0 + 1 < miny) {
+   if (quad->input.x0 >= maxx ||
+       quad->input.y0 >= maxy ||
+       quad->input.x0 + 1 < minx ||
+       quad->input.y0 + 1 < miny) {
       /* totally clipped */
-      setup->quad.mask = 0x0;
+      quad->inout.mask = 0x0;
       return;
    }
-   if (setup->quad.x0 < minx)
-      setup->quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup->quad.y0 < miny)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup->quad.x0 == maxx - 1)
-      setup->quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup->quad.y0 == maxy - 1)
-      setup->quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
+   if (quad->input.x0 < minx)
+      quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
+   if (quad->input.y0 < miny)
+      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
+   if (quad->input.x0 == maxx - 1)
+      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
+   if (quad->input.y0 == maxy - 1)
+      quad->inout.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
 }
 
 
@@ -174,35 +321,59 @@ quad_clip(struct setup_context *setup)
  * Emit a quad (pass to next stage) with clipping.
  */
 static INLINE void
-clip_emit_quad(struct setup_context *setup)
+clip_emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
 {
-   quad_clip(setup);
-   if (setup->quad.mask) {
+   quad_clip( setup, quad );
+   if (quad->inout.mask) {
       struct softpipe_context *sp = setup->softpipe;
-      sp->quad.first->run(sp->quad.first, &setup->quad);
+
+      sp->quad[thread].first->run( sp->quad[thread].first, quad );
    }
 }
 
+#if SP_NUM_QUAD_THREADS > 1
+
+static void
+clip_emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
+{
+   struct quad_header quad;
+
+   quad.input = job->input;
+   quad.inout = job->inout;
+   quad.coef = setup->quad.coef;
+   quad.posCoef = setup->quad.posCoef;
+   quad.nr_attrs = setup->quad.nr_attrs;
+   clip_emit_quad( setup, &quad, thread );
+}
+
+#define CLIP_EMIT_QUAD(setup) add_quad_job( &setup->que, &setup->quad, clip_emit_quad_job )
+
+#else
+
+#define CLIP_EMIT_QUAD(setup) clip_emit_quad( setup, &setup->quad, 0 )
+
+#endif
 
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  */
 static INLINE void
-emit_quad( struct setup_context *setup, int x, int y, unsigned mask )
+emit_quad( struct setup_context *setup, struct quad_header *quad, uint thread )
 {
    struct softpipe_context *sp = setup->softpipe;
-   setup->quad.x0 = x;
-   setup->quad.y0 = y;
-   setup->quad.mask = mask;
+#if DEBUG_FRAGS
+   uint mask = quad->inout.mask;
+#endif
+
 #if DEBUG_FRAGS
    if (mask & 1) setup->numFragsEmitted++;
    if (mask & 2) setup->numFragsEmitted++;
    if (mask & 4) setup->numFragsEmitted++;
    if (mask & 8) setup->numFragsEmitted++;
 #endif
-   sp->quad.first->run(sp->quad.first, &setup->quad);
+   sp->quad[thread].first->run( sp->quad[thread].first, quad );
 #if DEBUG_FRAGS
-   mask = setup->quad.mask;
+   mask = quad->inout.mask;
    if (mask & 1) setup->numFragsWritten++;
    if (mask & 2) setup->numFragsWritten++;
    if (mask & 4) setup->numFragsWritten++;
@@ -210,6 +381,38 @@ emit_quad( struct setup_context *setup, int x, int y, unsigned mask )
 #endif
 }
 
+#if SP_NUM_QUAD_THREADS > 1
+
+static void
+emit_quad_job( struct setup_context *setup, uint thread, struct quad_job *job )
+{
+   struct quad_header quad;
+
+   quad.input = job->input;
+   quad.inout = job->inout;
+   quad.coef = setup->quad.coef;
+   quad.posCoef = setup->quad.posCoef;
+   quad.nr_attrs = setup->quad.nr_attrs;
+   emit_quad( setup, &quad, thread );
+}
+
+#define EMIT_QUAD(setup,x,y,mask) do {\
+      setup->quad.input.x0 = x;\
+      setup->quad.input.y0 = y;\
+      setup->quad.inout.mask = mask;\
+      add_quad_job( &setup->que, &setup->quad, emit_quad_job );\
+   } while (0)
+
+#else
+
+#define EMIT_QUAD(setup,x,y,mask) do {\
+      setup->quad.input.x0 = x;\
+      setup->quad.input.y0 = y;\
+      setup->quad.inout.mask = mask;\
+      emit_quad( setup, &setup->quad, 0 );\
+   } while (0)
+
+#endif
 
 /**
  * Given an X or Y coordinate, return the block/quad coordinate that it
@@ -249,7 +452,7 @@ static void flush_spans( struct setup_context *setup )
             mask |= MASK_TOP_RIGHT;
          if (x+1 >= xleft1 && x+1 < xright1)
             mask |= MASK_BOTTOM_RIGHT;
-         emit_quad( setup, x, setup->span.y, mask );
+         EMIT_QUAD( setup, x, setup->span.y, mask );
       }
       break;
 
@@ -263,7 +466,7 @@ static void flush_spans( struct setup_context *setup )
             mask |= MASK_TOP_LEFT;
          if (x+1 >= xleft0 && x+1 < xright0)
             mask |= MASK_TOP_RIGHT;
-         emit_quad( setup, x, setup->span.y, mask );
+         EMIT_QUAD( setup, x, setup->span.y, mask );
       }
       break;
 
@@ -277,7 +480,7 @@ static void flush_spans( struct setup_context *setup )
             mask |= MASK_BOTTOM_LEFT;
          if (x+1 >= xleft1 && x+1 < xright1)
             mask |= MASK_BOTTOM_RIGHT;
-         emit_quad( setup, x, setup->span.y, mask );
+         EMIT_QUAD( setup, x, setup->span.y, mask );
       }
       break;
 
@@ -398,7 +601,7 @@ static boolean setup_sort_vertices( struct setup_context *setup,
     *  - the GLSL gl_FrontFacing fragment attribute (bool)
     *  - two-sided stencil test
     */
-   setup->quad.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
+   setup->quad.input.facing = (det > 0.0) ^ (setup->softpipe->rasterizer->front_winding == PIPE_WINDING_CW);
 
    return TRUE;
 }
@@ -595,7 +798,7 @@ static void setup_tri_coefficients( struct setup_context *setup )
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) {
          /* FOG.y = front/back facing  XXX fix this */
-         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.facing;
+         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing;
          setup->coef[fragSlot].dadx[1] = 0.0;
          setup->coef[fragSlot].dady[1] = 0.0;
       }
@@ -765,7 +968,7 @@ void setup_tri( struct setup_context *setup,
    setup_tri_coefficients( setup );
    setup_tri_edges( setup );
 
-   setup->quad.prim = PRIM_TRI;
+   setup->quad.input.prim = PRIM_TRI;
 
    setup->span.y = 0;
    setup->span.y_flags = 0;
@@ -790,6 +993,8 @@ void setup_tri( struct setup_context *setup,
 
    flush_spans( setup );
 
+   WAIT_FOR_COMPLETION(setup);
+
 #if DEBUG_FRAGS
    printf("Tri: %u frags emitted, %u written\n",
           setup->numFragsEmitted,
@@ -904,7 +1109,7 @@ setup_line_coefficients(struct setup_context *setup,
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) {
          /* FOG.y = front/back facing  XXX fix this */
-         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.facing;
+         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing;
          setup->coef[fragSlot].dadx[1] = 0.0;
          setup->coef[fragSlot].dady[1] = 0.0;
       }
@@ -925,20 +1130,20 @@ plot(struct setup_context *setup, int x, int y)
    const int quadY = y - iy;
    const int mask = (1 << ix) << (2 * iy);
 
-   if (quadX != setup->quad.x0 ||
-       quadY != setup->quad.y0)
+   if (quadX != setup->quad.input.x0 ||
+       quadY != setup->quad.input.y0)
    {
       /* flush prev quad, start new quad */
 
-      if (setup->quad.x0 != -1)
-         clip_emit_quad(setup);
+      if (setup->quad.input.x0 != -1)
+         CLIP_EMIT_QUAD(setup);
 
-      setup->quad.x0 = quadX;
-      setup->quad.y0 = quadY;
-      setup->quad.mask = 0x0;
+      setup->quad.input.x0 = quadX;
+      setup->quad.input.y0 = quadY;
+      setup->quad.inout.mask = 0x0;
    }
 
-   setup->quad.mask |= mask;
+   setup->quad.inout.mask |= mask;
 }
 
 
@@ -999,16 +1204,16 @@ setup_line(struct setup_context *setup,
    assert(dx >= 0);
    assert(dy >= 0);
 
-   setup->quad.x0 = setup->quad.y0 = -1;
-   setup->quad.mask = 0x0;
-   setup->quad.prim = PRIM_LINE;
+   setup->quad.input.x0 = setup->quad.input.y0 = -1;
+   setup->quad.inout.mask = 0x0;
+   setup->quad.input.prim = PRIM_LINE;
    /* XXX temporary: set coverage to 1.0 so the line appears
     * if AA mode happens to be enabled.
     */
-   setup->quad.coverage[0] =
-   setup->quad.coverage[1] =
-   setup->quad.coverage[2] =
-   setup->quad.coverage[3] = 1.0;
+   setup->quad.input.coverage[0] =
+   setup->quad.input.coverage[1] =
+   setup->quad.input.coverage[2] =
+   setup->quad.input.coverage[3] = 1.0;
 
    if (dx > dy) {
       /*** X-major line ***/
@@ -1052,9 +1257,11 @@ setup_line(struct setup_context *setup,
    }
 
    /* draw final quad */
-   if (setup->quad.mask) {
-      clip_emit_quad(setup);
+   if (setup->quad.inout.mask) {
+      CLIP_EMIT_QUAD(setup);
    }
+
+   WAIT_FOR_COMPLETION(setup);
 }
 
 
@@ -1148,22 +1355,22 @@ setup_point( struct setup_context *setup,
 
       if (spfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FOG) {
          /* FOG.y = front/back facing  XXX fix this */
-         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.facing;
+         setup->coef[fragSlot].a0[1] = 1.0f - setup->quad.input.facing;
          setup->coef[fragSlot].dadx[1] = 0.0;
          setup->coef[fragSlot].dady[1] = 0.0;
       }
    }
 
-   setup->quad.prim = PRIM_POINT;
+   setup->quad.input.prim = PRIM_POINT;
 
    if (halfSize <= 0.5 && !round) {
       /* special case for 1-pixel points */
       const int ix = ((int) x) & 1;
       const int iy = ((int) y) & 1;
-      setup->quad.x0 = (int) x - ix;
-      setup->quad.y0 = (int) y - iy;
-      setup->quad.mask = (1 << ix) << (2 * iy);
-      clip_emit_quad(setup);
+      setup->quad.input.x0 = (int) x - ix;
+      setup->quad.input.y0 = (int) y - iy;
+      setup->quad.inout.mask = (1 << ix) << (2 * iy);
+      CLIP_EMIT_QUAD(setup);
    }
    else {
       if (round) {
@@ -1183,15 +1390,15 @@ setup_point( struct setup_context *setup,
             for (ix = ixmin; ix <= ixmax; ix += 2) {
                float dx, dy, dist2, cover;
 
-               setup->quad.mask = 0x0;
+               setup->quad.inout.mask = 0x0;
 
                dx = (ix + 0.5f) - x;
                dy = (iy + 0.5f) - y;
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_TOP_LEFT;
+                  setup->quad.input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_TOP_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1199,8 +1406,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_TOP_RIGHT;
+                  setup->quad.input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_TOP_RIGHT;
                }
 
                dx = (ix + 0.5f) - x;
@@ -1208,8 +1415,8 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_BOTTOM_LEFT;
+                  setup->quad.input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_BOTTOM_LEFT;
                }
 
                dx = (ix + 1.5f) - x;
@@ -1217,14 +1424,14 @@ setup_point( struct setup_context *setup,
                dist2 = dx * dx + dy * dy;
                if (dist2 <= rmax2) {
                   cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad.mask |= MASK_BOTTOM_RIGHT;
+                  setup->quad.input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
+                  setup->quad.inout.mask |= MASK_BOTTOM_RIGHT;
                }
 
-               if (setup->quad.mask) {
-                  setup->quad.x0 = ix;
-                  setup->quad.y0 = iy;
-                  clip_emit_quad(setup);
+               if (setup->quad.inout.mask) {
+                  setup->quad.input.x0 = ix;
+                  setup->quad.input.y0 = iy;
+                  CLIP_EMIT_QUAD(setup);
                }
             }
          }
@@ -1268,14 +1475,16 @@ setup_point( struct setup_context *setup,
                   mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
                }
 
-               setup->quad.mask = mask;
-               setup->quad.x0 = ix;
-               setup->quad.y0 = iy;
-               clip_emit_quad(setup);
+               setup->quad.inout.mask = mask;
+               setup->quad.input.x0 = ix;
+               setup->quad.input.y0 = iy;
+               CLIP_EMIT_QUAD(setup);
             }
          }
       }
    }
+
+   WAIT_FOR_COMPLETION(setup);
 }
 
 void setup_prepare( struct setup_context *setup )
@@ -1300,7 +1509,9 @@ void setup_prepare( struct setup_context *setup )
    /* Note: nr_attrs is only used for debugging (vertex printing) */
    setup->quad.nr_attrs = draw_num_vs_outputs(sp->draw);
 
-   sp->quad.first->begin(sp->quad.first);
+   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
+      sp->quad[i].first->begin( sp->quad[i].first );
+   }
 
    if (sp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        sp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
@@ -1328,11 +1539,31 @@ void setup_destroy_context( struct setup_context *setup )
 struct setup_context *setup_create_context( struct softpipe_context *softpipe )
 {
    struct setup_context *setup = CALLOC_STRUCT(setup_context);
+#if SP_NUM_QUAD_THREADS > 1
+   uint i;
+#endif
 
    setup->softpipe = softpipe;
 
    setup->quad.coef = setup->coef;
    setup->quad.posCoef = &setup->posCoef;
 
+#if SP_NUM_QUAD_THREADS > 1
+   setup->que.first = 0;
+   setup->que.last = 0;
+   pipe_mutex_init( setup->que.que_mutex );
+   pipe_condvar_init( setup->que.que_notfull_condvar );
+   pipe_condvar_init( setup->que.que_notempty_condvar );
+   setup->que.jobs_added = 0;
+   setup->que.jobs_done = 0;
+   pipe_condvar_init( setup->que.que_done_condvar );
+   for (i = 0; i < SP_NUM_QUAD_THREADS; i++) {
+      setup->threads[i].setup = setup;
+      setup->threads[i].id = i;
+      setup->threads[i].handle = pipe_thread_create( quad_thread, &setup->threads[i] );
+   }
+#endif
+
    return setup;
 }
+
diff --git a/src/gallium/drivers/softpipe/sp_state_fs.c b/src/gallium/drivers/softpipe/sp_state_fs.c
index 1be461b3a46..e5b609cf6c9 100644
--- a/src/gallium/drivers/softpipe/sp_state_fs.c
+++ b/src/gallium/drivers/softpipe/sp_state_fs.c
@@ -152,7 +152,7 @@ softpipe_set_constant_buffer(struct pipe_context *pipe,
    assert(index == 0);
 
    /* note: reference counting */
-   pipe_buffer_reference(ws,
+   winsys_buffer_reference(ws,
 			 &softpipe->constants[shader].buffer,
 			 buf ? buf->buffer : NULL);
    softpipe->constants[shader].size = buf ? buf->size : 0;
diff --git a/src/gallium/drivers/softpipe/sp_surface.c b/src/gallium/drivers/softpipe/sp_surface.c
index 389aceb27ce..6ade7326982 100644
--- a/src/gallium/drivers/softpipe/sp_surface.c
+++ b/src/gallium/drivers/softpipe/sp_surface.c
@@ -25,132 +25,14 @@
  * 
  **************************************************************************/
 
-#include "pipe/p_defines.h"
-#include "pipe/p_inlines.h"
-#include "pipe/p_winsys.h"
-#include "util/u_tile.h"
 #include "util/u_rect.h"
 #include "sp_context.h"
-#include "sp_surface.h"
 
 
 
-/**
- * Copy a rectangular region from one surface to another.
- * Surfaces must have same bpp.
- *
- * Note that it's always the case that Y=0=top of the raster.
- * If do_flip is non-zero, the region being copied will be flipped vertically.
- *
- * Assumes all values are within bounds -- no checking at this level -
- * do it higher up if required.
- */
-static void
-sp_surface_copy(struct pipe_context *pipe,
-                boolean do_flip,
-		struct pipe_surface *dst,
-		unsigned dstx, unsigned dsty,
-		struct pipe_surface *src,
-		unsigned srcx, unsigned srcy, unsigned width, unsigned height)
-{
-   void *dst_map = pipe->screen->surface_map( pipe->screen,
-                                              dst,
-                                              PIPE_BUFFER_USAGE_CPU_WRITE );
-
-   const void *src_map = pipe->screen->surface_map( pipe->screen,
-                                                    src,
-                                                    PIPE_BUFFER_USAGE_CPU_READ );
-
-   assert(dst->block.size == src->block.size);
-   assert(dst->block.width == src->block.width);
-   assert(dst->block.height == src->block.height);
-   assert(src_map);
-   assert(dst_map);
-
-   /* If do_flip, invert src_y position and pass negative src stride */
-   pipe_copy_rect(dst_map,
-                  &dst->block,
-                  dst->stride,
-                  dstx, dsty,
-                  width, height,
-                  src_map,
-                  do_flip ? -(int) src->stride : src->stride,
-                  srcx, srcy);
-
-   pipe->screen->surface_unmap(pipe->screen, src);
-   pipe->screen->surface_unmap(pipe->screen, dst);
-}
-
-
-static void *
-get_pointer(struct pipe_surface *dst, void *dst_map, unsigned x, unsigned y)
-{
-   return (char *)dst_map + y / dst->block.height * dst->stride + x / dst->block.width * dst->block.size;
-}
-
-
-#define UBYTE_TO_USHORT(B) ((B) | ((B) << 8))
-
-
-/**
- * Fill a rectangular sub-region.  Need better logic about when to
- * push buffers into AGP - will currently do so whenever possible.
- */
-static void
-sp_surface_fill(struct pipe_context *pipe,
-		struct pipe_surface *dst,
-		unsigned dstx, unsigned dsty,
-		unsigned width, unsigned height, unsigned value)
-{
-   unsigned i, j;
-   void *dst_map = pipe->screen->surface_map( pipe->screen,
-                                              dst,
-                                              PIPE_BUFFER_USAGE_CPU_WRITE );
-
-   assert(dst->stride > 0);
-
-
-   switch (dst->block.size) {
-   case 1:
-   case 2:
-   case 4:
-      pipe_fill_rect(dst_map, &dst->block, dst->stride, dstx, dsty, width, height, value);
-      break;
-   case 8:
-      {
-         /* expand the 4-byte clear value to an 8-byte value */
-         ushort *row = (ushort *) get_pointer(dst, dst_map, dstx, dsty);
-         ushort val0 = UBYTE_TO_USHORT((value >>  0) & 0xff);
-         ushort val1 = UBYTE_TO_USHORT((value >>  8) & 0xff);
-         ushort val2 = UBYTE_TO_USHORT((value >> 16) & 0xff);
-         ushort val3 = UBYTE_TO_USHORT((value >> 24) & 0xff);
-         val0 = (val0 << 8) | val0;
-         val1 = (val1 << 8) | val1;
-         val2 = (val2 << 8) | val2;
-         val3 = (val3 << 8) | val3;
-         for (i = 0; i < height; i++) {
-            for (j = 0; j < width; j++) {
-               row[j*4+0] = val0;
-               row[j*4+1] = val1;
-               row[j*4+2] = val2;
-               row[j*4+3] = val3;
-            }
-            row += dst->stride/2;
-         }
-      }
-      break;
-   default:
-      assert(0);
-      break;
-   }
-
-   pipe->screen->surface_unmap(pipe->screen, dst);
-}
-
-
 void
 sp_init_surface_functions(struct softpipe_context *sp)
 {
-   sp->pipe.surface_copy = sp_surface_copy;
-   sp->pipe.surface_fill = sp_surface_fill;
+   sp->pipe.surface_copy = util_surface_copy;
+   sp->pipe.surface_fill = util_surface_fill;
 }
diff --git a/src/gallium/drivers/softpipe/sp_texture.c b/src/gallium/drivers/softpipe/sp_texture.c
index 3a737d6f722..cb48035771b 100644
--- a/src/gallium/drivers/softpipe/sp_texture.c
+++ b/src/gallium/drivers/softpipe/sp_texture.c
@@ -192,7 +192,7 @@ softpipe_texture_blanket(struct pipe_screen * screen,
    spt->base.nblocksy[0] = pf_get_nblocksy(&spt->base.block, spt->base.height[0]);  
    spt->stride[0] = stride[0];
 
-   pipe_buffer_reference(screen->winsys, &spt->buffer, buffer);
+   pipe_buffer_reference(screen, &spt->buffer, buffer);
 
    return &spt->base;
 }
@@ -208,7 +208,7 @@ softpipe_texture_release(struct pipe_screen *screen,
    if (--(*pt)->refcount <= 0) {
       struct softpipe_texture *spt = softpipe_texture(*pt);
 
-      pipe_buffer_reference(screen->winsys, &spt->buffer, NULL);
+      pipe_buffer_reference(screen, &spt->buffer, NULL);
       FREE(spt);
    }
    *pt = NULL;
@@ -231,7 +231,7 @@ softpipe_get_tex_surface(struct pipe_screen *screen,
    if (ps) {
       assert(ps->refcount);
       assert(ps->winsys);
-      pipe_buffer_reference(ws, &ps->buffer, spt->buffer);
+      pipe_buffer_reference(screen, &ps->buffer, spt->buffer);
       ps->format = pt->format;
       ps->block = pt->block;
       ps->width = pt->width[level];
@@ -307,7 +307,7 @@ softpipe_surface_map( struct pipe_screen *screen,
       return NULL;
    }
 
-   map = screen->winsys->buffer_map( screen->winsys, surface->buffer, flags );
+   map = pipe_buffer_map( screen, surface->buffer, flags );
    if (map == NULL)
       return NULL;
 
@@ -331,7 +331,7 @@ static void
 softpipe_surface_unmap(struct pipe_screen *screen,
                        struct pipe_surface *surface)
 {
-   screen->winsys->buffer_unmap( screen->winsys, surface->buffer );
+   pipe_buffer_unmap( screen, surface->buffer );
 }
 
 
diff --git a/src/gallium/drivers/trace/SConscript b/src/gallium/drivers/trace/SConscript
index 5c49468c4eb..0a6bfb8f4c7 100644
--- a/src/gallium/drivers/trace/SConscript
+++ b/src/gallium/drivers/trace/SConscript
@@ -9,8 +9,6 @@ trace = env.ConvenienceLibrary(
         'tr_dump.c',
         'tr_screen.c',
         'tr_state.c',
-        'tr_stream_stdc.c',
-        'tr_stream_wd.c',
         'tr_texture.c',
         'tr_winsys.c',
     ])
diff --git a/src/gallium/drivers/trace/tr_dump.c b/src/gallium/drivers/trace/tr_dump.c
index 48032c1617f..a0ead0ded33 100644
--- a/src/gallium/drivers/trace/tr_dump.c
+++ b/src/gallium/drivers/trace/tr_dump.c
@@ -48,12 +48,12 @@
 #include "pipe/p_debug.h"
 #include "util/u_memory.h"
 #include "util/u_string.h"
+#include "util/u_stream.h"
 
-#include "tr_stream.h"
 #include "tr_dump.h"
 
 
-static struct trace_stream *stream = NULL;
+static struct util_stream *stream = NULL;
 static unsigned refcount = 0;
 
 
@@ -61,7 +61,7 @@ static INLINE void
 trace_dump_write(const char *buf, size_t size)
 {
    if(stream)
-      trace_stream_write(stream, buf, size);
+      util_stream_write(stream, buf, size);
 }
 
 
@@ -212,7 +212,7 @@ trace_dump_trace_close(void)
 {
    if(stream) {
       trace_dump_writes("</trace>\n");
-      trace_stream_close(stream);
+      util_stream_close(stream);
       stream = NULL;
       refcount = 0;
    }
@@ -228,7 +228,7 @@ boolean trace_dump_trace_begin()
    
    if(!stream) {
    
-      stream = trace_stream_create(filename);
+      stream = util_stream_create(filename, 0);
       if(!stream)
          return FALSE;
       
@@ -272,7 +272,7 @@ void trace_dump_call_end(void)
    trace_dump_indent(1);
    trace_dump_tag_end("call");
    trace_dump_newline();
-   trace_stream_flush(stream);
+   util_stream_flush(stream);
 }
 
 void trace_dump_arg_begin(const char *name)
diff --git a/src/gallium/include/pipe/p_debug.h b/src/gallium/include/pipe/p_debug.h
index 6478ae2f08e..cb6196aa9fb 100644
--- a/src/gallium/include/pipe/p_debug.h
+++ b/src/gallium/include/pipe/p_debug.h
@@ -340,9 +340,12 @@ void debug_dump_image(const char *prefix,
                       const void *data);
 void debug_dump_surface(const char *prefix,
                         struct pipe_surface *surface);   
+void debug_dump_surface_bmp(const char *filename,
+                            struct pipe_surface *surface);
 #else
 #define debug_dump_image(prefix, format, cpp, width, height, stride, data) ((void)0)
 #define debug_dump_surface(prefix, surface) ((void)0)
+#define debug_dump_surface_bmp(filename, surface) ((void)0)
 #endif
 
 
diff --git a/src/gallium/include/pipe/p_format.h b/src/gallium/include/pipe/p_format.h
index d9aa5792a44..97a4c8c510b 100644
--- a/src/gallium/include/pipe/p_format.h
+++ b/src/gallium/include/pipe/p_format.h
@@ -531,6 +531,35 @@ pf_is_ycbcr( enum pipe_format format )
    return pf_layout(format) == PIPE_FORMAT_LAYOUT_YCBCR ? TRUE : FALSE;
 }
 
+static INLINE boolean 
+pf_has_alpha( enum pipe_format format )
+{
+   switch (pf_layout(format)) {
+   case PIPE_FORMAT_LAYOUT_RGBAZS:
+   case PIPE_FORMAT_LAYOUT_MIXED:
+      /* FIXME: pf_get_component_bits( PIPE_FORMAT_A8L8_UNORM, PIPE_FORMAT_COMP_A ) should not return 0 right? */
+      if(format == PIPE_FORMAT_A8_UNORM || 
+         format == PIPE_FORMAT_A8L8_UNORM || 
+         format == PIPE_FORMAT_A8_L8_SRGB)
+         return TRUE;
+      return pf_get_component_bits( format, PIPE_FORMAT_COMP_A ) ? TRUE : FALSE;
+   case PIPE_FORMAT_LAYOUT_YCBCR:
+      return FALSE; 
+   case PIPE_FORMAT_LAYOUT_DXT:
+      switch (format) {
+      case PIPE_FORMAT_DXT1_RGBA:
+      case PIPE_FORMAT_DXT3_RGBA:
+      case PIPE_FORMAT_DXT5_RGBA:
+         return TRUE;
+      default:
+         return FALSE;
+      }
+   default:
+      assert( 0 );
+      return FALSE;
+   }
+}
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/src/gallium/include/pipe/p_inlines.h b/src/gallium/include/pipe/p_inlines.h
index 1e4b98edb48..d70de8e3011 100644
--- a/src/gallium/include/pipe/p_inlines.h
+++ b/src/gallium/include/pipe/p_inlines.h
@@ -109,7 +109,7 @@ pipe_surface_reference(struct pipe_surface **ptr, struct pipe_surface *surf)
 /* XXX: thread safety issues!
  */
 static INLINE void
-pipe_buffer_reference(struct pipe_winsys *winsys,
+winsys_buffer_reference(struct pipe_winsys *winsys,
 		      struct pipe_buffer **ptr,
 		      struct pipe_buffer *buf)
 {
@@ -164,48 +164,48 @@ pipe_texture_release(struct pipe_texture **ptr)
  */
 
 static INLINE struct pipe_buffer *
-pipe_buffer_create( struct pipe_context *pipe,
+pipe_buffer_create( struct pipe_screen *screen,
                     unsigned alignment, unsigned usage, unsigned size )
 {
-   return pipe->winsys->buffer_create(pipe->winsys, alignment, usage, size);
+   return screen->winsys->buffer_create(screen->winsys, alignment, usage, size);
 }
 
 static INLINE struct pipe_buffer *
-pipe_user_buffer_create( struct pipe_context *pipe, void *ptr, unsigned size )
+pipe_user_buffer_create( struct pipe_screen *screen, void *ptr, unsigned size )
 {
-   return pipe->winsys->user_buffer_create(pipe->winsys, ptr, size);
+   return screen->winsys->user_buffer_create(screen->winsys, ptr, size);
 }
 
 static INLINE void
-pipe_buffer_destroy( struct pipe_context *pipe, struct pipe_buffer *buf )
+pipe_buffer_destroy( struct pipe_screen *screen, struct pipe_buffer *buf )
 {
-   pipe->winsys->buffer_destroy(pipe->winsys, buf);
+   screen->winsys->buffer_destroy(screen->winsys, buf);
 }
 
 static INLINE void *
-pipe_buffer_map(struct pipe_context *pipe,
+pipe_buffer_map(struct pipe_screen *screen,
                 struct pipe_buffer *buf,
                 unsigned usage)
 {
-   return pipe->winsys->buffer_map(pipe->winsys, buf, usage);
+   return screen->winsys->buffer_map(screen->winsys, buf, usage);
 }
 
 static INLINE void
-pipe_buffer_unmap(struct pipe_context *pipe,
+pipe_buffer_unmap(struct pipe_screen *screen,
                   struct pipe_buffer *buf)
 {
-   pipe->winsys->buffer_unmap(pipe->winsys, buf);
+   screen->winsys->buffer_unmap(screen->winsys, buf);
 }
 
 /* XXX when we're using this everywhere, get rid of
- * pipe_buffer_reference() above.
+ * winsys_buffer_reference() above.
  */
 static INLINE void
-pipe_reference_buffer(struct pipe_context *pipe,
+pipe_buffer_reference(struct pipe_screen *screen,
 		      struct pipe_buffer **ptr,
 		      struct pipe_buffer *buf)
 {
-   pipe_buffer_reference(pipe->winsys, ptr, buf);
+   winsys_buffer_reference(screen->winsys, ptr, buf);
 }
 
 
diff --git a/src/gallium/include/pipe/p_state.h b/src/gallium/include/pipe/p_state.h
index 4d0f3597e60..da783389dae 100644
--- a/src/gallium/include/pipe/p_state.h
+++ b/src/gallium/include/pipe/p_state.h
@@ -276,12 +276,12 @@ struct pipe_surface
    enum pipe_format format;      /**< PIPE_FORMAT_x */
    unsigned status;              /**< PIPE_SURFACE_STATUS_x */
    unsigned clear_value;         /**< XXX may be temporary */
-   unsigned width;
-   unsigned height;
+   unsigned width;               /**< logical width in pixels */
+   unsigned height;              /**< logical height in pixels */
    struct pipe_format_block block;
-   unsigned nblocksx;
-   unsigned nblocksy;
-   unsigned stride;              /**< in bytes */
+   unsigned nblocksx;            /**< allocated width in blocks */
+   unsigned nblocksy;            /**< allocated height in blocks */
+   unsigned stride;              /**< stride in bytes between rows of blocks */
    unsigned layout;              /**< PIPE_SURFACE_LAYOUT_x */
    unsigned offset;              /**< offset from start of buffer, in bytes */
    unsigned refcount;
@@ -309,8 +309,8 @@ struct pipe_texture
    unsigned depth[PIPE_MAX_TEXTURE_LEVELS];
 
    struct pipe_format_block block;
-   unsigned nblocksx[PIPE_MAX_TEXTURE_LEVELS];
-   unsigned nblocksy[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned nblocksx[PIPE_MAX_TEXTURE_LEVELS]; /**< allocated width in blocks */
+   unsigned nblocksy[PIPE_MAX_TEXTURE_LEVELS]; /**< allocated height in blocks */
 
    unsigned last_level:8;    /**< Index of last mipmap level present/defined */
    unsigned compressed:1;
diff --git a/src/gallium/include/pipe/p_thread.h b/src/gallium/include/pipe/p_thread.h
index 6e526b7aa8b..e01d5a602b8 100644
--- a/src/gallium/include/pipe/p_thread.h
+++ b/src/gallium/include/pipe/p_thread.h
@@ -23,307 +23,252 @@
  * 
  **************************************************************************/
 
+
 /**
- * @file
- * Thread
- *
- * Initial version by John Stone ([email protected]) ([email protected])
- *                and Christoph Poliwoda ([email protected])
- * Revised by Keith Whitwell
- * Adapted for new gl dispatcher by Brian Paul
- *
- *
- *
- * DOCUMENTATION
- *
- * This thread module exports the following types:
- *   _glthread_TSD     Thread-specific data area
- *   _glthread_Thread  Thread datatype
- *   _glthread_Mutex   Mutual exclusion lock
- *
- * Macros:
- *   _glthread_DECLARE_STATIC_MUTEX(name)   Declare a non-local mutex
- *   _glthread_INIT_MUTEX(name)             Initialize a mutex
- *   _glthread_LOCK_MUTEX(name)             Lock a mutex
- *   _glthread_UNLOCK_MUTEX(name)           Unlock a mutex
- *
- * Functions:
- *   _glthread_GetID(v)      Get integer thread ID
- *   _glthread_InitTSD()     Initialize thread-specific data
- *   _glthread_GetTSD()      Get thread-specific data
- *   _glthread_SetTSD()      Set thread-specific data
- *
- * If this file is accidentally included by a non-threaded build,
- * it should not cause the build to fail, or otherwise cause problems.
- * In general, it should only be included when needed however.
+ * Thread, mutex, condition var and thread-specific data functions.
  */
 
-#ifndef _P_THREAD_H_
-#define _P_THREAD_H_
 
+#ifndef _P_THREAD2_H_
+#define _P_THREAD2_H_
 
-#if (defined(PTHREADS) || defined(SOLARIS_THREADS) ||\
-     defined(WIN32_THREADS) || defined(USE_XTHREADS) || defined(BEOS_THREADS)) \
-    && !defined(THREADS)
-# define THREADS
-#endif
 
-#ifdef VMS
-#include <GL/vms_x_fix.h>
-#endif
+#include "pipe/p_compiler.h"
+
+
+#if defined(PIPE_OS_LINUX)
 
-/*
- * POSIX threads. This should be your choice in the Unix world
- * whenever possible.  When building with POSIX threads, be sure
- * to enable any compiler flags which will cause the MT-safe
- * libc (if one exists) to be used when linking, as well as any
- * header macros for MT-safe errno, etc.  For Solaris, this is the -mt
- * compiler flag.  On Solaris with gcc, use -D_REENTRANT to enable
- * proper compiling for MT-safe libc etc.
- */
-#if defined(PTHREADS)
 #include <pthread.h> /* POSIX threads headers */
+#include <stdio.h> /* for perror() */
 
-typedef struct {
-   pthread_key_t  key;
-   int initMagic;
-} _glthread_TSD;
+typedef pthread_t pipe_thread;
+
+#define PIPE_THREAD_ROUTINE( name, param ) \
+   void *name( void *param )
 
-typedef pthread_t _glthread_Thread;
+static INLINE pipe_thread pipe_thread_create( void *(* routine)( void *), void *param )
+{
+   pipe_thread thread;
+   if (pthread_create( &thread, NULL, routine, param ))
+      return 0;
+   return thread;
+}
 
-typedef pthread_mutex_t _glthread_Mutex;
+static INLINE int pipe_thread_wait( pipe_thread thread )
+{
+   return pthread_join( thread, NULL );
+}
 
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name = PTHREAD_MUTEX_INITIALIZER
+static INLINE int pipe_thread_destroy( pipe_thread thread )
+{
+   return pthread_detach( thread );
+}
 
-#define _glthread_INIT_MUTEX(name) \
-   pthread_mutex_init(&(name), NULL)
+typedef pthread_mutex_t pipe_mutex;
+typedef pthread_cond_t pipe_condvar;
 
-#define _glthread_DESTROY_MUTEX(name) \
-   pthread_mutex_destroy(&(name))
+#define pipe_static_mutex(mutex) \
+   static pipe_mutex mutex = PTHREAD_MUTEX_INITIALIZER
 
-#define _glthread_LOCK_MUTEX(name) \
-   (void) pthread_mutex_lock(&(name))
+#define pipe_mutex_init(mutex) \
+   pthread_mutex_init(&(mutex), NULL)
 
-#define _glthread_UNLOCK_MUTEX(name) \
-   (void) pthread_mutex_unlock(&(name))
+#define pipe_mutex_destroy(mutex) \
+   pthread_mutex_destroy(&(mutex))
 
-typedef pthread_cond_t _glthread_Cond;
+#define pipe_mutex_lock(mutex) \
+   (void) pthread_mutex_lock(&(mutex))
 
-#define _glthread_DECLARE_STATIC_COND(name) \
-   static _glthread_Cond name = PTHREAD_COND_INITIALIZER
+#define pipe_mutex_unlock(mutex) \
+   (void) pthread_mutex_unlock(&(mutex))
 
-#define _glthread_INIT_COND(cond)			\
+#define pipe_static_condvar(mutex) \
+   static pipe_condvar mutex = PTHREAD_COND_INITIALIZER
+
+#define pipe_condvar_init(cond)	\
    pthread_cond_init(&(cond), NULL)
 
-#define _glthread_DESTROY_COND(name) \
-   pthread_cond_destroy(&(name))
+#define pipe_condvar_destroy(cond) \
+   pthread_cond_destroy(&(cond))
 
-#define _glthread_COND_WAIT(cond, mutex) \
+#define pipe_condvar_wait(cond, mutex) \
   pthread_cond_wait(&(cond), &(mutex))
 
-#define _glthread_COND_SIGNAL(cond) \
+#define pipe_condvar_signal(cond) \
   pthread_cond_signal(&(cond))
 
-#define _glthread_COND_BROADCAST(cond) \
+#define pipe_condvar_broadcast(cond) \
   pthread_cond_broadcast(&(cond))
 
-#endif /* PTHREADS */
-
 
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
 
-
-/*
- * Solaris threads. Use only up to Solaris 2.4.
- * Solaris 2.5 and higher provide POSIX threads.
- * Be sure to compile with -mt on the Solaris compilers, or
- * use -D_REENTRANT if using gcc.
- */
-#ifdef SOLARIS_THREADS
-#include <thread.h>
-
-typedef struct {
-   thread_key_t key;
-   mutex_t      keylock;
-   int          initMagic;
-} _glthread_TSD;
-
-typedef thread_t _glthread_Thread;
-
-typedef mutex_t _glthread_Mutex;
-
-/* XXX need to really implement mutex-related macros */
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = 0
-#define _glthread_INIT_MUTEX(name)  (void) name
-#define _glthread_DESTROY_MUTEX(name) (void) name
-#define _glthread_LOCK_MUTEX(name)  (void) name
-#define _glthread_UNLOCK_MUTEX(name)  (void) name
-
-#endif /* SOLARIS_THREADS */
-
-
-
-
-/*
- * Windows threads. Should work with Windows NT and 95.
- * IMPORTANT: Link with multithreaded runtime library when THREADS are
- * used!
- */
-#ifdef WIN32_THREADS
 #include <windows.h>
 
-typedef struct {
-   DWORD key;
-   int   initMagic;
-} _glthread_TSD;
+typedef HANDLE pipe_thread;
 
-typedef HANDLE _glthread_Thread;
+#define PIPE_THREAD_ROUTINE( name, param ) \
+   void * WINAPI name( void *param )
 
-typedef CRITICAL_SECTION _glthread_Mutex;
+static INLINE pipe_thread pipe_thread_create( void *(WINAPI * routine)( void *), void *param )
+{
+   DWORD id;
+   return CreateThread( NULL, 0, (LPTHREAD_START_ROUTINE) routine, param, 0, &id );
+}
 
-#define _glthread_DECLARE_STATIC_MUTEX(name)  /*static*/ _glthread_Mutex name = {0,0,0,0,0,0}
-#define _glthread_INIT_MUTEX(name)  InitializeCriticalSection(&name)
-#define _glthread_DESTROY_MUTEX(name)  DeleteCriticalSection(&name)
-#define _glthread_LOCK_MUTEX(name)  EnterCriticalSection(&name)
-#define _glthread_UNLOCK_MUTEX(name)  LeaveCriticalSection(&name)
+static INLINE int pipe_thread_wait( pipe_thread thread )
+{
+   if (WaitForSingleObject( thread, INFINITE ) == WAIT_OBJECT_0)
+      return 0;
+   return -1;
+}
 
-#endif /* WIN32_THREADS */
+static INLINE int pipe_thread_destroy( pipe_thread thread )
+{
+   if (CloseHandle( thread ))
+      return 0;
+   return -1;
+}
 
+typedef CRITICAL_SECTION pipe_mutex;
 
+#define pipe_static_mutex(name) \
+   /*static*/ pipe_mutex name = {0,0,0,0,0,0}
 
+#define pipe_mutex_init(name) \
+   InitializeCriticalSection(&name)
 
-/*
- * XFree86 has its own thread wrapper, Xthreads.h
- * We wrap it again for GL.
- */
-#ifdef USE_XTHREADS
-#include <X11/Xthreads.h>
+#define pipe_mutex_destroy(name) \
+   DeleteCriticalSection(&name)
 
-typedef struct {
-   xthread_key_t key;
-   int initMagic;
-} _glthread_TSD;
-
-typedef xthread_t _glthread_Thread;
-
-typedef xmutex_rec _glthread_Mutex;
-
-#ifdef XMUTEX_INITIALIZER
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name = XMUTEX_INITIALIZER
-#else
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name
-#endif
-
-#define _glthread_INIT_MUTEX(name) \
-   xmutex_init(&(name))
-
-#define _glthread_DESTROY_MUTEX(name) \
-   xmutex_clear(&(name))
-
-#define _glthread_LOCK_MUTEX(name) \
-   (void) xmutex_lock(&(name))
-
-#define _glthread_UNLOCK_MUTEX(name) \
-   (void) xmutex_unlock(&(name))
-
-#endif /* USE_XTHREADS */
-
-
-
-/*
- * BeOS threads. R5.x required.
- */
-#ifdef BEOS_THREADS
-
-#include <kernel/OS.h>
-#include <support/TLS.h>
-
-typedef struct {
-   int32        key;
-   int          initMagic;
-} _glthread_TSD;
-
-typedef thread_id _glthread_Thread;
-
-/* Use Benaphore, aka speeder semaphore */
-typedef struct {
-    int32   lock;
-    sem_id  sem;
-} benaphore;
-typedef benaphore _glthread_Mutex;
-
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = { 0, 0 }
-#define _glthread_INIT_MUTEX(name)    	name.sem = create_sem(0, #name"_benaphore"), name.lock = 0
-#define _glthread_DESTROY_MUTEX(name) 	delete_sem(name.sem), name.lock = 0
-#define _glthread_LOCK_MUTEX(name)    	if (name.sem == 0) _glthread_INIT_MUTEX(name); \
-									  	if (atomic_add(&(name.lock), 1) >= 1) acquire_sem(name.sem)
-#define _glthread_UNLOCK_MUTEX(name)  	if (atomic_add(&(name.lock), -1) > 1) release_sem(name.sem)
+#define pipe_mutex_lock(name) \
+   EnterCriticalSection(&name)
 
-#endif /* BEOS_THREADS */
+#define pipe_mutex_unlock(name) \
+   LeaveCriticalSection(&name)
 
+/* XXX: dummy definitions, make it compile */
 
+typedef unsigned pipe_condvar;
 
-#ifndef THREADS
+#define pipe_condvar_init(condvar) \
+   (void) condvar
 
-/*
- * THREADS not defined
- */
+#define pipe_condvar_broadcast(condvar) \
+   (void) condvar
 
-typedef unsigned _glthread_TSD;
+#else
 
-typedef unsigned _glthread_Thread;
+/** Dummy definitions */
 
-typedef unsigned _glthread_Mutex;
+typedef unsigned pipe_thread;
+typedef unsigned pipe_mutex;
+typedef unsigned pipe_condvar;
 
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = 0
+#define pipe_static_mutex(mutex) \
+   static pipe_mutex mutex = 0
 
-#define _glthread_INIT_MUTEX(name)  (void) name
+#define pipe_mutex_init(mutex) \
+   (void) mutex
 
-#define _glthread_DESTROY_MUTEX(name)  (void) name
+#define pipe_mutex_destroy(mutex) \
+   (void) mutex
 
-#define _glthread_LOCK_MUTEX(name)  (void) name
+#define pipe_mutex_lock(mutex) \
+   (void) mutex
 
-#define _glthread_UNLOCK_MUTEX(name)  (void) name
+#define pipe_mutex_unlock(mutex) \
+   (void) mutex
 
-typedef unsigned _glthread_Cond;
+#define pipe_static_condvar(condvar) \
+   static unsigned condvar = 0
 
-#define _glthread_DECLARE_STATIC_COND(name) static _glthread_Cond name = 0
+#define pipe_condvar_init(condvar) \
+   (void) condvar
 
-#define _glthread_INIT_COND(name)  (void) name
+#define pipe_condvar_destroy(condvar) \
+   (void) condvar
 
-#define _glthread_DESTROY_COND(name)  (void) name
+#define pipe_condvar_wait(condvar, mutex) \
+   (void) condvar
 
-#define _glthread_COND_WAIT(name, mutex)  (void) name
+#define pipe_condvar_signal(condvar) \
+   (void) condvar
 
-#define _glthread_COND_SIGNAL(name)  (void) name
+#define pipe_condvar_broadcast(condvar) \
+   (void) condvar
 
-#define _glthread_COND_BROADCAST(name)  (void) name
 
-#endif /* THREADS */
+#endif  /* PIPE_OS_? */
 
 
 
 /*
- * Platform independent thread specific data API.
+ * Thread-specific data.
  */
 
-extern unsigned long
-_glthread_GetID(void);
-
-
-extern void
-_glthread_InitTSD(_glthread_TSD *);
+typedef struct {
+#if defined(PIPE_OS_LINUX)
+   pthread_key_t key;
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   DWORD key;
+#endif
+   int initMagic;
+} pipe_tsd;
 
 
-extern void *
-_glthread_GetTSD(_glthread_TSD *);
+#define PIPE_TSD_INIT_MAGIC 0xff8adc98
 
 
-extern void
-_glthread_SetTSD(_glthread_TSD *, void *);
+static INLINE void
+pipe_tsd_init(pipe_tsd *tsd)
+{
+#if defined(PIPE_OS_LINUX)
+   if (pthread_key_create(&tsd->key, NULL/*free*/) != 0) {
+      perror("pthread_key_create(): failed to allocate key for thread specific data");
+      exit(-1);
+   }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   assert(0);
+#endif
+   tsd->initMagic = PIPE_TSD_INIT_MAGIC;
+}
+
+static INLINE void *
+pipe_tsd_get(pipe_tsd *tsd)
+{
+   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
+      pipe_tsd_init(tsd);
+   }
+#if defined(PIPE_OS_LINUX)
+   return pthread_getspecific(tsd->key);
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   assert(0);
+   return NULL;
+#else
+   assert(0);
+   return NULL;
+#endif
+}
+
+static INLINE void
+pipe_tsd_set(pipe_tsd *tsd, void *value)
+{
+   if (tsd->initMagic != (int) PIPE_TSD_INIT_MAGIC) {
+      pipe_tsd_init(tsd);
+   }
+#if defined(PIPE_OS_LINUX)
+   if (pthread_setspecific(tsd->key, value) != 0) {
+      perror("pthread_set_specific() failed");
+      exit(-1);
+   }
+#elif defined(PIPE_SUBSYSTEM_WINDOWS_USER)
+   assert(0);
+#else
+   assert(0);
+#endif
+}
 
 
 
-#endif /* _P_THREAD_H_ */
+#endif /* _P_THREAD2_H_ */
diff --git a/src/gallium/state_trackers/python/gallium.i b/src/gallium/state_trackers/python/gallium.i
index a67372c6239..68d2db3325a 100644
--- a/src/gallium/state_trackers/python/gallium.i
+++ b/src/gallium/state_trackers/python/gallium.i
@@ -46,7 +46,7 @@
 #include "pipe/p_shader_tokens.h" 
 #include "cso_cache/cso_context.h"
 #include "util/u_draw_quad.h" 
-#include "util/p_tile.h" 
+#include "util/u_tile.h" 
 #include "tgsi/tgsi_text.h" 
 #include "tgsi/tgsi_dump.h" 
 
diff --git a/src/gallium/state_trackers/python/p_context.i b/src/gallium/state_trackers/python/p_context.i
index 0b2621f7c31..1fdcec639f7 100644
--- a/src/gallium/state_trackers/python/p_context.i
+++ b/src/gallium/state_trackers/python/p_context.i
@@ -225,30 +225,30 @@ struct st_context {
                       const float *vertices) 
    {
       struct pipe_context *pipe = $self->pipe;
-      struct pipe_winsys *winsys = pipe->winsys;
+      struct pipe_screen *screen = pipe->screen;
       struct pipe_buffer *vbuf;
       float *map;
       unsigned size;
 
       size = num_verts * num_attribs * 4 * sizeof(float);
 
-      vbuf = winsys->buffer_create(winsys,
-                                   32,
-                                   PIPE_BUFFER_USAGE_VERTEX, 
-                                   size);
+      vbuf = pipe_buffer_create(screen,
+                                32,
+                                PIPE_BUFFER_USAGE_VERTEX, 
+                                size);
       if(!vbuf)
          goto error1;
       
-      map = winsys->buffer_map(winsys, vbuf, PIPE_BUFFER_USAGE_CPU_WRITE);
+      map = pipe_buffer_map(screen, vbuf, PIPE_BUFFER_USAGE_CPU_WRITE);
       if (!map)
          goto error2;
       memcpy(map, vertices, size);
-      pipe->winsys->buffer_unmap(pipe->winsys, vbuf);
+      pipe_buffer_unmap(screen, vbuf);
       
       util_draw_vertex_buffer(pipe, vbuf, prim, num_verts, num_attribs);
       
 error2:
-      pipe_buffer_reference(pipe->winsys, &vbuf, NULL);
+      pipe_buffer_reference(screen, &vbuf, NULL);
 error1:
       ;
    }
diff --git a/src/gallium/state_trackers/python/p_texture.i b/src/gallium/state_trackers/python/p_texture.i
index f23afea1677..33fb3743cce 100644
--- a/src/gallium/state_trackers/python/p_texture.i
+++ b/src/gallium/state_trackers/python/p_texture.i
@@ -180,7 +180,7 @@ struct st_buffer {
    }
    
    void write( const char *STRING, unsigned LENGTH, unsigned offset = 0) {
-      struct pipe_winsys *winsys = $self->st_dev->screen->winsys;
+      struct pipe_screen *screen = $self->st_dev->screen;
       char *map;
       
       assert($self->buffer->refcount);
@@ -195,10 +195,10 @@ struct st_buffer {
          return;
       }
 
-      map = winsys->buffer_map(winsys, $self->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
+      map = pipe_buffer_map(screen, $self->buffer, PIPE_BUFFER_USAGE_CPU_WRITE);
       if(map) {
          memcpy(map + offset, STRING, LENGTH);
-         winsys->buffer_unmap(winsys, $self->buffer);
+         pipe_buffer_unmap(screen, $self->buffer);
       }
    }
 };
diff --git a/src/gallium/state_trackers/python/samples/tri.py b/src/gallium/state_trackers/python/samples/tri.py
index 1271c67627d..193479f7d6c 100644
--- a/src/gallium/state_trackers/python/samples/tri.py
+++ b/src/gallium/state_trackers/python/samples/tri.py
@@ -167,7 +167,6 @@ def test(dev):
         1:MOV OUT[1], IN[1]
         2:END
     ''')
-    #vs.dump()
     ctx.set_vertex_shader(vs)
 
     # fragment shader
@@ -178,7 +177,6 @@ def test(dev):
         0:MOV OUT[0], IN[0]
         1:END
     ''')
-    #fs.dump()
     ctx.set_fragment_shader(fs)
 
     nverts = 3
@@ -218,6 +216,7 @@ def test(dev):
     ctx.flush()
     
     show_image(cbuf.get_surface(usage = PIPE_BUFFER_USAGE_CPU_READ|PIPE_BUFFER_USAGE_CPU_WRITE))
+    #save_image('tri.png', cbuf.get_surface(usage = PIPE_BUFFER_USAGE_CPU_READ|PIPE_BUFFER_USAGE_CPU_WRITE))
 
 
 
diff --git a/src/gallium/state_trackers/python/st_device.c b/src/gallium/state_trackers/python/st_device.c
index f71d85dd9b4..95c1378a032 100644
--- a/src/gallium/state_trackers/python/st_device.c
+++ b/src/gallium/state_trackers/python/st_device.c
@@ -292,8 +292,8 @@ void
 st_buffer_destroy(struct st_buffer *st_buf)
 {
    if(st_buf) {
-      struct pipe_winsys *winsys = st_buf->st_dev->screen->winsys;
-      pipe_buffer_reference(winsys, &st_buf->buffer, NULL);
+      struct pipe_screen *screen = st_buf->st_dev->screen;
+      pipe_buffer_reference(screen, &st_buf->buffer, NULL);
       FREE(st_buf);
    }
 }
@@ -303,7 +303,7 @@ struct st_buffer *
 st_buffer_create(struct st_device *st_dev,
                  unsigned alignment, unsigned usage, unsigned size)
 {
-   struct pipe_winsys *winsys = st_dev->screen->winsys;
+   struct pipe_screen *screen = st_dev->screen;
    struct st_buffer *st_buf;
    
    st_buf = CALLOC_STRUCT(st_buffer);
@@ -312,7 +312,7 @@ st_buffer_create(struct st_device *st_dev,
 
    st_buf->st_dev = st_dev;
    
-   st_buf->buffer = winsys->buffer_create(winsys, alignment, usage, size);
+   st_buf->buffer = pipe_buffer_create(screen, alignment, usage, size);
    if(!st_buf->buffer) {
       st_buffer_destroy(st_buf);
       return NULL;
diff --git a/src/gallium/state_trackers/python/st_softpipe_winsys.c b/src/gallium/state_trackers/python/st_softpipe_winsys.c
index 2d4f5434b35..f62113a4691 100644
--- a/src/gallium/state_trackers/python/st_softpipe_winsys.c
+++ b/src/gallium/state_trackers/python/st_softpipe_winsys.c
@@ -221,7 +221,7 @@ st_softpipe_surface_release(struct pipe_winsys *winsys,
    surf->refcount--;
    if (surf->refcount == 0) {
       if (surf->buffer)
-	pipe_buffer_reference(winsys, &surf->buffer, NULL);
+	winsys_buffer_reference(winsys, &surf->buffer, NULL);
       free(surf);
    }
    *s = NULL;
diff --git a/src/gallium/winsys/drm/intel/common/glthread.h b/src/gallium/winsys/drm/intel/common/glthread.h
deleted file mode 100644
index b8e9d5f59b8..00000000000
--- a/src/gallium/winsys/drm/intel/common/glthread.h
+++ /dev/null
@@ -1,359 +0,0 @@
-/*
- * Mesa 3-D graphics library
- * Version:  6.5.2
- *
- * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-
-/*
- * Thread support for gl dispatch.
- *
- * Initial version by John Stone ([email protected]) ([email protected])
- *                and Christoph Poliwoda ([email protected])
- * Revised by Keith Whitwell
- * Adapted for new gl dispatcher by Brian Paul
- *
- *
- *
- * DOCUMENTATION
- *
- * This thread module exports the following types:
- *   _glthread_TSD     Thread-specific data area
- *   _glthread_Thread  Thread datatype
- *   _glthread_Mutex   Mutual exclusion lock
- *
- * Macros:
- *   _glthread_DECLARE_STATIC_MUTEX(name)   Declare a non-local mutex
- *   _glthread_INIT_MUTEX(name)             Initialize a mutex
- *   _glthread_LOCK_MUTEX(name)             Lock a mutex
- *   _glthread_UNLOCK_MUTEX(name)           Unlock a mutex
- *
- * Functions:
- *   _glthread_GetID(v)      Get integer thread ID
- *   _glthread_InitTSD()     Initialize thread-specific data
- *   _glthread_GetTSD()      Get thread-specific data
- *   _glthread_SetTSD()      Set thread-specific data
- *
- */
-
-/*
- * If this file is accidentally included by a non-threaded build,
- * it should not cause the build to fail, or otherwise cause problems.
- * In general, it should only be included when needed however.
- */
-
-#ifndef GLTHREAD_H
-#define GLTHREAD_H
-
-
-#if defined(USE_MGL_NAMESPACE)
-#define _glapi_Dispatch _mglapi_Dispatch
-#endif
-
-
-
-#if (defined(PTHREADS) || defined(SOLARIS_THREADS) ||\
-     defined(WIN32_THREADS) || defined(USE_XTHREADS) || defined(BEOS_THREADS)) \
-    && !defined(THREADS)
-# define THREADS
-#endif
-
-#ifdef VMS
-#include <GL/vms_x_fix.h>
-#endif
-
-/*
- * POSIX threads. This should be your choice in the Unix world
- * whenever possible.  When building with POSIX threads, be sure
- * to enable any compiler flags which will cause the MT-safe
- * libc (if one exists) to be used when linking, as well as any
- * header macros for MT-safe errno, etc.  For Solaris, this is the -mt
- * compiler flag.  On Solaris with gcc, use -D_REENTRANT to enable
- * proper compiling for MT-safe libc etc.
- */
-#if defined(PTHREADS)
-#include <pthread.h> /* POSIX threads headers */
-
-typedef struct {
-   pthread_key_t  key;
-   int initMagic;
-} _glthread_TSD;
-
-typedef pthread_t _glthread_Thread;
-
-typedef pthread_mutex_t _glthread_Mutex;
-
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name = PTHREAD_MUTEX_INITIALIZER
-
-#define _glthread_INIT_MUTEX(name) \
-   pthread_mutex_init(&(name), NULL)
-
-#define _glthread_DESTROY_MUTEX(name) \
-   pthread_mutex_destroy(&(name))
-
-#define _glthread_LOCK_MUTEX(name) \
-   (void) pthread_mutex_lock(&(name))
-
-#define _glthread_UNLOCK_MUTEX(name) \
-   (void) pthread_mutex_unlock(&(name))
-
-typedef pthread_cond_t _glthread_Cond;
-
-#define _glthread_DECLARE_STATIC_COND(name) \
-   static _glthread_Cond name = PTHREAD_COND_INITIALIZER
-
-#define _glthread_INIT_COND(cond)			\
-   pthread_cond_init(&(cond), NULL)
-
-#define _glthread_DESTROY_COND(name) \
-   pthread_cond_destroy(&(name))
-
-#define _glthread_COND_WAIT(cond, mutex) \
-  pthread_cond_wait(&(cond), &(mutex))
-
-#define _glthread_COND_SIGNAL(cond) \
-  pthread_cond_signal(&(cond))
-
-#define _glthread_COND_BROADCAST(cond) \
-  pthread_cond_broadcast(&(cond))
-
-
-#else /* PTHREADS */
-
-typedef unsigned int _glthread_Cond;
-#define _glthread_DECLARE_STATIC_COND(name) \
-//  #warning Condition variables not implemented.
-
-#define _glthread_INIT_COND(cond)	    \
-  abort();
-
-#define _glthread_DESTROY_COND(name) \
-  abort();
-
-#define _glthread_COND_WAIT(cond, mutex) \
-  abort();
-
-#define _glthread_COND_SIGNAL(cond) \
-  abort();
-
-#define _glthread_COND_BROADCAST(cond) \
-  abort();
-
-#endif
-
-
-/*
- * Solaris threads. Use only up to Solaris 2.4.
- * Solaris 2.5 and higher provide POSIX threads.
- * Be sure to compile with -mt on the Solaris compilers, or
- * use -D_REENTRANT if using gcc.
- */
-#ifdef SOLARIS_THREADS
-#include <thread.h>
-
-typedef struct {
-   thread_key_t key;
-   mutex_t      keylock;
-   int          initMagic;
-} _glthread_TSD;
-
-typedef thread_t _glthread_Thread;
-
-typedef mutex_t _glthread_Mutex;
-
-/* XXX need to really implement mutex-related macros */
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = 0
-#define _glthread_INIT_MUTEX(name)  (void) name
-#define _glthread_DESTROY_MUTEX(name) (void) name
-#define _glthread_LOCK_MUTEX(name)  (void) name
-#define _glthread_UNLOCK_MUTEX(name)  (void) name
-
-#endif /* SOLARIS_THREADS */
-
-
-
-
-/*
- * Windows threads. Should work with Windows NT and 95.
- * IMPORTANT: Link with multithreaded runtime library when THREADS are
- * used!
- */
-#ifdef WIN32_THREADS
-#include <windows.h>
-
-typedef struct {
-   DWORD key;
-   int   initMagic;
-} _glthread_TSD;
-
-typedef HANDLE _glthread_Thread;
-
-typedef CRITICAL_SECTION _glthread_Mutex;
-
-#define _glthread_DECLARE_STATIC_MUTEX(name)  /*static*/ _glthread_Mutex name = {0,0,0,0,0,0}
-#define _glthread_INIT_MUTEX(name)  InitializeCriticalSection(&name)
-#define _glthread_DESTROY_MUTEX(name)  DeleteCriticalSection(&name)
-#define _glthread_LOCK_MUTEX(name)  EnterCriticalSection(&name)
-#define _glthread_UNLOCK_MUTEX(name)  LeaveCriticalSection(&name)
-
-#endif /* WIN32_THREADS */
-
-
-
-
-/*
- * XFree86 has its own thread wrapper, Xthreads.h
- * We wrap it again for GL.
- */
-#ifdef USE_XTHREADS
-#include <X11/Xthreads.h>
-
-typedef struct {
-   xthread_key_t key;
-   int initMagic;
-} _glthread_TSD;
-
-typedef xthread_t _glthread_Thread;
-
-typedef xmutex_rec _glthread_Mutex;
-
-#ifdef XMUTEX_INITIALIZER
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name = XMUTEX_INITIALIZER
-#else
-#define _glthread_DECLARE_STATIC_MUTEX(name) \
-   static _glthread_Mutex name
-#endif
-
-#define _glthread_INIT_MUTEX(name) \
-   xmutex_init(&(name))
-
-#define _glthread_DESTROY_MUTEX(name) \
-   xmutex_clear(&(name))
-
-#define _glthread_LOCK_MUTEX(name) \
-   (void) xmutex_lock(&(name))
-
-#define _glthread_UNLOCK_MUTEX(name) \
-   (void) xmutex_unlock(&(name))
-
-#endif /* USE_XTHREADS */
-
-
-
-/*
- * BeOS threads. R5.x required.
- */
-#ifdef BEOS_THREADS
-
-#include <kernel/OS.h>
-#include <support/TLS.h>
-
-typedef struct {
-   int32        key;
-   int          initMagic;
-} _glthread_TSD;
-
-typedef thread_id _glthread_Thread;
-
-/* Use Benaphore, aka speeder semaphore */
-typedef struct {
-    int32   lock;
-    sem_id  sem;
-} benaphore;
-typedef benaphore _glthread_Mutex;
-
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = { 0, 0 }
-#define _glthread_INIT_MUTEX(name)    	name.sem = create_sem(0, #name"_benaphore"), name.lock = 0
-#define _glthread_DESTROY_MUTEX(name) 	delete_sem(name.sem), name.lock = 0
-#define _glthread_LOCK_MUTEX(name)    	if (name.sem == 0) _glthread_INIT_MUTEX(name); \
-									  	if (atomic_add(&(name.lock), 1) >= 1) acquire_sem(name.sem)
-#define _glthread_UNLOCK_MUTEX(name)  	if (atomic_add(&(name.lock), -1) > 1) release_sem(name.sem)
-
-#endif /* BEOS_THREADS */
-
-
-
-#ifndef THREADS
-
-/*
- * THREADS not defined
- */
-
-typedef GLuint _glthread_TSD;
-
-typedef GLuint _glthread_Thread;
-
-typedef GLuint _glthread_Mutex;
-
-#define _glthread_DECLARE_STATIC_MUTEX(name)  static _glthread_Mutex name = 0
-
-#define _glthread_INIT_MUTEX(name)  (void) name
-
-#define _glthread_DESTROY_MUTEX(name)  (void) name
-
-#define _glthread_LOCK_MUTEX(name)  (void) name
-
-#define _glthread_UNLOCK_MUTEX(name)  (void) name
-
-#endif /* THREADS */
-
-
-
-/*
- * Platform independent thread specific data API.
- */
-
-extern unsigned long
-_glthread_GetID(void);
-
-
-extern void
-_glthread_InitTSD(_glthread_TSD *);
-
-
-extern void *
-_glthread_GetTSD(_glthread_TSD *);
-
-
-extern void
-_glthread_SetTSD(_glthread_TSD *, void *);
-
-#if defined(GLX_USE_TLS)
-
-extern __thread struct _glapi_table * _glapi_tls_Dispatch
-    __attribute__((tls_model("initial-exec")));
-
-#define GET_DISPATCH() _glapi_tls_Dispatch
-
-#elif !defined(GL_CALL)
-# if defined(THREADS)
-#  define GET_DISPATCH() \
-   ((__builtin_expect( _glapi_Dispatch != NULL, 1 )) \
-       ? _glapi_Dispatch : _glapi_get_dispatch())
-# else
-#  define GET_DISPATCH() _glapi_Dispatch
-# endif /* defined(THREADS) */
-#endif  /* ndef GL_CALL */
-
-
-#endif /* THREADS_H */
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c
index b6d901f85e4..517a97b3ee5 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_bufmgr.c
@@ -33,7 +33,7 @@
 #include <xf86drm.h>
 #include <stdlib.h>
 #include <stdio.h>
-#include "glthread.h"
+#include "pipe/p_thread.h"
 #include "errno.h"
 #include "ws_dri_bufmgr.h"
 #include "string.h"
@@ -51,8 +51,8 @@
  * driBufferObject mutex - > this rw lock.
  */
 
-_glthread_DECLARE_STATIC_MUTEX(bmMutex);
-_glthread_DECLARE_STATIC_COND(bmCond);
+pipe_static_mutex(bmMutex);
+pipe_static_condvar(bmCond);
 
 static int kernelReaders = 0;
 static int num_buffers = 0;
@@ -241,29 +241,29 @@ static int drmBOResetList(drmBOList *list)
 
 void driWriteLockKernelBO(void)
 {
-    _glthread_LOCK_MUTEX(bmMutex);
+    pipe_mutex_lock(bmMutex);
     while(kernelReaders != 0)
-	_glthread_COND_WAIT(bmCond, bmMutex);
+	pipe_condvar_wait(bmCond, bmMutex);
 }
 
 void driWriteUnlockKernelBO(void)
 {
-    _glthread_UNLOCK_MUTEX(bmMutex);
+    pipe_mutex_unlock(bmMutex);
 }
 
 void driReadLockKernelBO(void)
 {
-    _glthread_LOCK_MUTEX(bmMutex);
+    pipe_mutex_lock(bmMutex);
     kernelReaders++;
-    _glthread_UNLOCK_MUTEX(bmMutex);
+    pipe_mutex_unlock(bmMutex);
 }
 
 void driReadUnlockKernelBO(void)
 {
-    _glthread_LOCK_MUTEX(bmMutex);
+    pipe_mutex_lock(bmMutex);
     if (--kernelReaders == 0)
-        _glthread_COND_BROADCAST(bmCond);
-    _glthread_UNLOCK_MUTEX(bmMutex);
+       pipe_condvar_broadcast(bmCond);
+    pipe_mutex_unlock(bmMutex);
 }
 
 
@@ -277,7 +277,7 @@ void driReadUnlockKernelBO(void)
 typedef struct _DriBufferObject
 {
    DriBufferPool *pool;
-   _glthread_Mutex mutex;
+   pipe_mutex mutex;
    int refCount;
    const char *name;
    uint64_t flags;
@@ -318,12 +318,12 @@ driBOKernel(struct _DriBufferObject *buf)
    drmBO *ret;
 
    driReadLockKernelBO();
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    assert(buf->private != NULL);
    ret = buf->pool->kernel(buf->pool, buf->private);
    if (!ret)
       BM_CKFATAL(-EINVAL);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    driReadUnlockKernelBO();
 
    return ret;
@@ -338,9 +338,9 @@ driBOWaitIdle(struct _DriBufferObject *buf, int lazy)
    * that time??
    */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    BM_CKFATAL(buf->pool->waitIdle(buf->pool, buf->private, &buf->mutex, lazy));
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 void *
@@ -353,11 +353,11 @@ driBOMap(struct _DriBufferObject *buf, unsigned flags, unsigned hint)
       return buf->userData;
    }
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    assert(buf->private != NULL);
    retval = buf->pool->map(buf->pool, buf->private, flags, hint,
 			   &buf->mutex, &virtual);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
    return retval == 0 ? virtual : NULL;
 }
@@ -369,9 +369,9 @@ driBOUnmap(struct _DriBufferObject *buf)
       return;
 
    assert(buf->private != NULL);
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private));
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 unsigned long
@@ -381,9 +381,9 @@ driBOOffset(struct _DriBufferObject *buf)
 
    assert(buf->private != NULL);
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    ret = buf->pool->offset(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    return ret;
 }
 
@@ -394,9 +394,9 @@ driBOPoolOffset(struct _DriBufferObject *buf)
 
    assert(buf->private != NULL);
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    ret = buf->pool->poolOffset(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    return ret;
 }
 
@@ -408,9 +408,9 @@ driBOFlags(struct _DriBufferObject *buf)
    assert(buf->private != NULL);
 
    driReadLockKernelBO();
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    ret = buf->pool->flags(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    driReadUnlockKernelBO();
    return ret;
 }
@@ -418,12 +418,12 @@ driBOFlags(struct _DriBufferObject *buf)
 struct _DriBufferObject *
 driBOReference(struct _DriBufferObject *buf)
 {
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (++buf->refCount == 1) {
-      _glthread_UNLOCK_MUTEX(buf->mutex);
+      pipe_mutex_unlock(buf->mutex);
       BM_CKFATAL(-EINVAL);
    }
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
    return buf;
 }
 
@@ -435,10 +435,10 @@ driBOUnReference(struct _DriBufferObject *buf)
    if (!buf)
       return;
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    tmp = --buf->refCount;
    if (!tmp) {
-      _glthread_UNLOCK_MUTEX(buf->mutex);
+      pipe_mutex_unlock(buf->mutex);
       if (buf->private) {
 	 if (buf->createdByReference)
 	    buf->pool->unreference(buf->pool, buf->private);
@@ -451,7 +451,7 @@ driBOUnReference(struct _DriBufferObject *buf)
 	 num_buffers--;
       free(buf);
    } else
-     _glthread_UNLOCK_MUTEX(buf->mutex);
+     pipe_mutex_unlock(buf->mutex);
 
 }
 
@@ -469,7 +469,7 @@ driBOData(struct _DriBufferObject *buf,
 
    assert(!buf->userBuffer); /* XXX just do a memcpy? */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    pool = buf->pool;
 
    if (pool == NULL && newPool != NULL) {
@@ -556,7 +556,7 @@ driBOData(struct _DriBufferObject *buf,
    }
 
  out:
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
    return retval;
 }
@@ -569,7 +569,7 @@ driBOSubData(struct _DriBufferObject *buf,
 
    assert(!buf->userBuffer); /* XXX just do a memcpy? */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (size && data) {
       BM_CKFATAL(buf->pool->map(buf->pool, buf->private,
                                 DRM_BO_FLAG_WRITE, 0, &buf->mutex,
@@ -577,7 +577,7 @@ driBOSubData(struct _DriBufferObject *buf,
       memcpy((unsigned char *) virtual + offset, data, size);
       BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private));
    }
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 void
@@ -588,21 +588,21 @@ driBOGetSubData(struct _DriBufferObject *buf,
 
    assert(!buf->userBuffer); /* XXX just do a memcpy? */
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (size && data) {
       BM_CKFATAL(buf->pool->map(buf->pool, buf->private,
                                 DRM_BO_FLAG_READ, 0, &buf->mutex, &virtual));
       memcpy(data, (unsigned char *) virtual + offset, size);
       BM_CKFATAL(buf->pool->unmap(buf->pool, buf->private));
    }
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 void
 driBOSetReferenced(struct _DriBufferObject *buf,
 		   unsigned long handle)
 {
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (buf->private != NULL) {
       assert((size_t)"Invalid buffer for setReferenced\n" & 0);
       BM_CKFATAL(-EINVAL);
@@ -619,7 +619,7 @@ driBOSetReferenced(struct _DriBufferObject *buf,
    }
    buf->createdByReference = TRUE;
    buf->flags = buf->pool->kernel(buf->pool, buf->private)->flags;
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 int
@@ -644,8 +644,8 @@ driGenBuffers(struct _DriBufferPool *pool,
       if (!buf)
 	 return -ENOMEM;
 
-      _glthread_INIT_MUTEX(buf->mutex);
-      _glthread_LOCK_MUTEX(buf->mutex);
+      pipe_mutex_init(buf->mutex);
+      pipe_mutex_lock(buf->mutex);
       buf->refCount = 1;
       buf->flags = flags;
       buf->hint = hint;
@@ -653,7 +653,7 @@ driGenBuffers(struct _DriBufferPool *pool,
       buf->alignment = alignment;
       buf->pool = pool;
       buf->createdByReference = 0;
-      _glthread_UNLOCK_MUTEX(buf->mutex);
+      pipe_mutex_unlock(buf->mutex);
       buffers[i] = buf;
    }
    return 0;
@@ -818,7 +818,7 @@ driBOAddListItem(struct _DriBufferList * list, struct _DriBufferObject *buf,
 {
    int newItem;
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    BM_CKFATAL(driAddValidateItem(&list->drmBuffers,
 				 buf->pool->kernel(buf->pool, buf->private),
                                  flags, mask, itemLoc, node));
@@ -827,7 +827,7 @@ driBOAddListItem(struct _DriBufferList * list, struct _DriBufferObject *buf,
    if (newItem)
      buf->refCount++;
 
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 }
 
 drmBOList *driGetdrmBOList(struct _DriBufferList *list)
@@ -845,10 +845,10 @@ void driPutdrmBOList(struct _DriBufferList *list)
 void
 driBOFence(struct _DriBufferObject *buf, struct _DriFenceObject *fence)
 {
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    if (buf->pool->fence)
        BM_CKFATAL(buf->pool->fence(buf->pool, buf->private, fence));
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
 }
 
@@ -908,10 +908,10 @@ driBOValidateUserList(struct _DriBufferList * list)
 
     while (curBuf) {
 	buf = (struct _DriBufferObject *) drmBOListBuf(curBuf);
-	_glthread_LOCK_MUTEX(buf->mutex);
+	pipe_mutex_lock(buf->mutex);
 	if (buf->pool->validate)
 	    BM_CKFATAL(buf->pool->validate(buf->pool, buf->private, &buf->mutex));
-	_glthread_UNLOCK_MUTEX(buf->mutex);
+	pipe_mutex_unlock(buf->mutex);
 	curBuf = drmBOListNext(&list->driBuffers, curBuf);
     }
 }
@@ -929,9 +929,9 @@ driBOSize(struct _DriBufferObject *buf)
 {
   unsigned long size;
 
-   _glthread_LOCK_MUTEX(buf->mutex);
+   pipe_mutex_lock(buf->mutex);
    size = buf->pool->size(buf->pool, buf->private);
-   _glthread_UNLOCK_MUTEX(buf->mutex);
+   pipe_mutex_unlock(buf->mutex);
 
   return size;
 
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h b/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h
index bf607989241..ad3b6f3931c 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_bufpool.h
@@ -33,14 +33,14 @@
 #define _PSB_BUFPOOL_H_
 
 #include <xf86drm.h>
-#include <glthread.h>
+#include "pipe/p_thread.h"
 struct _DriFenceObject;
 
 typedef struct _DriBufferPool
 {
    int fd;
    int (*map) (struct _DriBufferPool * pool, void *private,
-               unsigned flags, int hint, _glthread_Mutex *mutex,
+               unsigned flags, int hint, pipe_mutex *mutex,
 	       void **virtual);
    int (*unmap) (struct _DriBufferPool * pool, void *private);
    int (*destroy) (struct _DriBufferPool * pool, void *private);
@@ -55,8 +55,8 @@ typedef struct _DriBufferPool
    int (*fence) (struct _DriBufferPool * pool, void *private,
                  struct _DriFenceObject * fence);
    drmBO *(*kernel) (struct _DriBufferPool * pool, void *private);
-   int (*validate) (struct _DriBufferPool * pool, void *private, _glthread_Mutex *mutex);
-   int (*waitIdle) (struct _DriBufferPool *pool, void *private, _glthread_Mutex *mutex,
+   int (*validate) (struct _DriBufferPool * pool, void *private, pipe_mutex *mutex);
+   int (*waitIdle) (struct _DriBufferPool *pool, void *private, pipe_mutex *mutex,
 		    int lazy);
    int (*setStatus)  (struct _DriBufferPool *pool, void *private,
 		      uint64_t flag_diff, uint64_t old_flags);
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c
index 40929efa2f9..54618b1c82a 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_drmpool.c
@@ -113,7 +113,7 @@ pool_unreference(struct _DriBufferPool *pool, void *private)
 
 static int
 pool_map(struct _DriBufferPool *pool, void *private, unsigned flags,
-         int hint, _glthread_Mutex *mutex, void **virtual)
+         int hint, pipe_mutex *mutex, void **virtual)
 {
    drmBO *buf = (drmBO *) private;
    int ret;
@@ -202,7 +202,7 @@ pool_kernel(struct _DriBufferPool *pool, void *private)
 }
 
 static int
-pool_waitIdle(struct _DriBufferPool *pool, void *private, _glthread_Mutex *mutex,
+pool_waitIdle(struct _DriBufferPool *pool, void *private, pipe_mutex *mutex,
 	      int lazy)
 {
    drmBO *buf = (drmBO *) private;
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c
index b56bc269da5..831c75d30cc 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_fencemgr.c
@@ -1,5 +1,5 @@
 #include "ws_dri_fencemgr.h"
-#include "glthread.h"
+#include "pipe/p_thread.h"
 #include <xf86mm.h>
 #include <string.h>
 #include <unistd.h>
@@ -20,7 +20,7 @@ struct _DriFenceMgr {
     /*
      * These members are protected by this->mutex
      */
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
     int refCount;
     drmMMListHead *heads;
     int num_fences;
@@ -44,7 +44,7 @@ struct _DriFenceObject {
     /*
      * These members are protected by this->mutex.
      */
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
     uint32_t signaled_type;
     void *private;
 };
@@ -65,8 +65,8 @@ driFenceMgrCreate(const struct _DriFenceMgrCreateInfo *info)
   if (!tmp)
       return NULL;
 
-  _glthread_INIT_MUTEX(tmp->mutex);
-  _glthread_LOCK_MUTEX(tmp->mutex);
+  pipe_mutex_init(tmp->mutex);
+  pipe_mutex_lock(tmp->mutex);
   tmp->refCount = 1;
   tmp->info = *info;
   tmp->num_fences = 0;
@@ -77,7 +77,7 @@ driFenceMgrCreate(const struct _DriFenceMgrCreateInfo *info)
   for (i=0; i<tmp->info.num_classes; ++i) {
       DRMINITLISTHEAD(&tmp->heads[i]);
   }
-  _glthread_UNLOCK_MUTEX(tmp->mutex);
+  pipe_mutex_unlock(tmp->mutex);
   return tmp;
 
   out_err:
@@ -95,13 +95,13 @@ driFenceMgrUnrefUnlock(struct _DriFenceMgr **pMgr)
     if (--mgr->refCount == 0)
 	free(mgr);
     else
-	_glthread_UNLOCK_MUTEX(mgr->mutex);
+	pipe_mutex_unlock(mgr->mutex);
 }
 
 void
 driFenceMgrUnReference(struct _DriFenceMgr **pMgr)
 {
-    _glthread_LOCK_MUTEX((*pMgr)->mutex);
+    pipe_mutex_lock((*pMgr)->mutex);
     driFenceMgrUnrefUnlock(pMgr);
 }
 
@@ -143,9 +143,9 @@ driSignalPreviousFencesLocked(struct _DriFenceMgr *mgr,
 	 */
 
 	++entry->refCount;
-	_glthread_UNLOCK_MUTEX(mgr->mutex);
-	_glthread_LOCK_MUTEX(entry->mutex);
-	_glthread_LOCK_MUTEX(mgr->mutex);
+	pipe_mutex_unlock(mgr->mutex);
+	pipe_mutex_lock(entry->mutex);
+	pipe_mutex_lock(mgr->mutex);
 
 	prev = list->prev;
 
@@ -157,7 +157,7 @@ driSignalPreviousFencesLocked(struct _DriFenceMgr *mgr,
 		 * Somebody else removed the entry from the list.
 		 */
 
-		_glthread_UNLOCK_MUTEX(entry->mutex);
+		pipe_mutex_unlock(entry->mutex);
 		driFenceUnReferenceLocked(&entry);
 		return;
 	}
@@ -167,7 +167,7 @@ driSignalPreviousFencesLocked(struct _DriFenceMgr *mgr,
 	    DRMLISTDELINIT(list);
 	    mgr->info.unreference(mgr, &entry->private);
 	}
-	_glthread_UNLOCK_MUTEX(entry->mutex);
+	pipe_mutex_unlock(entry->mutex);
 	driFenceUnReferenceLocked(&entry);
 	list = prev;
     }
@@ -181,7 +181,7 @@ driFenceFinish(struct _DriFenceObject *fence, uint32_t fence_type,
     struct _DriFenceMgr *mgr = fence->mgr;
     int ret = 0;
 
-    _glthread_LOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
 
     if ((fence->signaled_type & fence_type) == fence_type)
 	goto out0;
@@ -190,16 +190,16 @@ driFenceFinish(struct _DriFenceObject *fence, uint32_t fence_type,
     if (ret)
 	goto out0;
 
-    _glthread_LOCK_MUTEX(mgr->mutex);
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(mgr->mutex);
+    pipe_mutex_unlock(fence->mutex);
 
     driSignalPreviousFencesLocked(mgr, &fence->head, fence->fence_class,
 				  fence_type);
-    _glthread_UNLOCK_MUTEX(mgr->mutex);
+    pipe_mutex_unlock(mgr->mutex);
     return 0;
 
   out0:
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
     return ret;
 }
 
@@ -207,9 +207,9 @@ uint32_t driFenceSignaledTypeCached(struct _DriFenceObject *fence)
 {
     uint32_t ret;
 
-    _glthread_LOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
     ret = fence->signaled_type;
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
 
     return ret;
 }
@@ -221,7 +221,7 @@ driFenceSignaledType(struct _DriFenceObject *fence, uint32_t flush_type,
     int ret = 0;
     struct _DriFenceMgr *mgr;
 
-    _glthread_LOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
     mgr = fence->mgr;
     *signaled = fence->signaled_type;
     if ((fence->signaled_type & flush_type) == flush_type)
@@ -236,25 +236,25 @@ driFenceSignaledType(struct _DriFenceObject *fence, uint32_t flush_type,
     if ((fence->signaled_type | *signaled) == fence->signaled_type)
 	goto out0;
 
-    _glthread_LOCK_MUTEX(mgr->mutex);
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_lock(mgr->mutex);
+    pipe_mutex_unlock(fence->mutex);
 
     driSignalPreviousFencesLocked(mgr, &fence->head, fence->fence_class,
 				  *signaled);
 
-    _glthread_UNLOCK_MUTEX(mgr->mutex);
+    pipe_mutex_unlock(mgr->mutex);
     return 0;
   out0:
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
     return ret;
 }
 
 struct _DriFenceObject *
 driFenceReference(struct _DriFenceObject *fence)
 {
-    _glthread_LOCK_MUTEX(fence->mgr->mutex);
+    pipe_mutex_lock(fence->mgr->mutex);
     ++fence->refCount;
-    _glthread_UNLOCK_MUTEX(fence->mgr->mutex);
+    pipe_mutex_unlock(fence->mgr->mutex);
     return fence;
 }
 
@@ -267,7 +267,7 @@ driFenceUnReference(struct _DriFenceObject **pFence)
 	return;
 
     mgr = (*pFence)->mgr;
-    _glthread_LOCK_MUTEX(mgr->mutex);
+    pipe_mutex_lock(mgr->mutex);
     ++mgr->refCount;
     driFenceUnReferenceLocked(pFence);
     driFenceMgrUnrefUnlock(&mgr);
@@ -294,15 +294,15 @@ struct _DriFenceObject
 	return NULL;
     }
 
-    _glthread_INIT_MUTEX(fence->mutex);
-    _glthread_LOCK_MUTEX(fence->mutex);
-    _glthread_LOCK_MUTEX(mgr->mutex);
+    pipe_mutex_init(fence->mutex);
+    pipe_mutex_lock(fence->mutex);
+    pipe_mutex_lock(mgr->mutex);
     fence->refCount = 1;
     DRMLISTADDTAIL(&fence->head, &mgr->heads[fence_class]);
     fence->mgr = mgr;
     ++mgr->refCount;
     ++mgr->num_fences;
-    _glthread_UNLOCK_MUTEX(mgr->mutex);
+    pipe_mutex_unlock(mgr->mutex);
     fence->fence_class = fence_class;
     fence->fence_type = fence_type;
     fence->signaled_type = 0;
@@ -312,7 +312,7 @@ struct _DriFenceObject
 	memcpy(fence->private, private, private_size);
     }
 
-    _glthread_UNLOCK_MUTEX(fence->mutex);
+    pipe_mutex_unlock(fence->mutex);
     return fence;
 }
 
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c
index a80555c9c71..60924eac9ee 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_mallocpool.c
@@ -33,7 +33,7 @@
 #include <stdlib.h>
 #include <errno.h>
 #include "pipe/p_debug.h"
-#include "glthread.h"
+#include "pipe/p_thread.h"
 #include "ws_dri_bufpool.h"
 #include "ws_dri_bufmgr.h"
 
@@ -60,14 +60,14 @@ pool_destroy(struct _DriBufferPool *pool, void *private)
 
 static int
 pool_waitIdle(struct _DriBufferPool *pool, void *private,
-	      _glthread_Mutex *mutex, int lazy)
+	      pipe_mutex *mutex, int lazy)
 {
     return 0;
 }
 
 static int
 pool_map(struct _DriBufferPool *pool, void *private, unsigned flags,
-         int hint, _glthread_Mutex *mutex, void **virtual)
+         int hint, pipe_mutex *mutex, void **virtual)
 {
     *virtual = (void *)((unsigned long *)private + 2);
     return 0;
diff --git a/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c b/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c
index dfcf6d6b19a..391cea50a7f 100644
--- a/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c
+++ b/src/gallium/winsys/drm/intel/common/ws_dri_slabpool.c
@@ -37,7 +37,7 @@
 #include "ws_dri_bufpool.h"
 #include "ws_dri_fencemgr.h"
 #include "ws_dri_bufmgr.h"
-#include "glthread.h"
+#include "pipe/p_thread.h"
 
 #define DRI_SLABPOOL_ALLOC_RETRIES 100
 
@@ -53,7 +53,7 @@ struct _DriSlabBuffer {
     uint32_t start;
     uint32_t fenceType;
     int unFenced;
-    _glthread_Cond event;
+    pipe_condvar event;
 };
 
 struct _DriKernelBO {
@@ -84,7 +84,7 @@ struct _DriSlabSizeHeader {
     uint32_t numDelayed;
     struct _DriSlabPool *slabPool;
     uint32_t bufSize;
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
 };
 
 struct _DriFreeSlabManager {
@@ -94,7 +94,7 @@ struct _DriFreeSlabManager {
     drmMMListHead timeoutList;
     drmMMListHead unCached;
     drmMMListHead cached;
-    _glthread_Mutex mutex;
+    pipe_mutex mutex;
 };
 
 
@@ -196,7 +196,7 @@ driSetKernelBOFree(struct _DriFreeSlabManager *fMan,
 {
     struct timeval time;
 
-    _glthread_LOCK_MUTEX(fMan->mutex);
+    pipe_mutex_lock(fMan->mutex);
     gettimeofday(&time, NULL);
     driTimeAdd(&time, &fMan->slabTimeout);
 
@@ -210,7 +210,7 @@ driSetKernelBOFree(struct _DriFreeSlabManager *fMan,
     DRMLISTADDTAIL(&kbo->timeoutHead, &fMan->timeoutList);
     driFreeTimeoutKBOsLocked(fMan, &time);
 
-    _glthread_UNLOCK_MUTEX(fMan->mutex);
+    pipe_mutex_unlock(fMan->mutex);
 }
 
 /*
@@ -237,7 +237,7 @@ driAllocKernelBO(struct _DriSlabSizeHeader *header)
 
     size = (size <= slabPool->maxSlabSize) ? size : slabPool->maxSlabSize;
     size = (size + slabPool->pageSize - 1) & ~(slabPool->pageSize - 1);
-    _glthread_LOCK_MUTEX(fMan->mutex);
+    pipe_mutex_lock(fMan->mutex);
 
     kbo = NULL;
 
@@ -269,7 +269,7 @@ driAllocKernelBO(struct _DriSlabSizeHeader *header)
 	DRMLISTDELINIT(&kbo->timeoutHead);
     }
 
-    _glthread_UNLOCK_MUTEX(fMan->mutex);
+    pipe_mutex_unlock(fMan->mutex);
 
     if (kbo) {
         uint64_t new_mask = kbo->bo.proposedFlags ^ slabPool->proposedFlags;
@@ -360,7 +360,7 @@ driAllocSlab(struct _DriSlabSizeHeader *header)
 	buf->start = i* header->bufSize;
 	buf->mapCount = 0;
 	buf->isSlabBuffer = 1;
-	_glthread_INIT_COND(buf->event);
+	pipe_condvar_init(buf->event);
 	DRMLISTADDTAIL(&buf->head, &slab->freeBuffers);
 	slab->numFree++;
 	buf++;
@@ -494,23 +494,23 @@ driSlabAllocBuffer(struct _DriSlabSizeHeader *header)
     drmMMListHead *list;
     int count = DRI_SLABPOOL_ALLOC_RETRIES;
 
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_lock(header->mutex);
     while(header->slabs.next == &header->slabs && count > 0) {
         driSlabCheckFreeLocked(header, 0);
 	if (header->slabs.next != &header->slabs)
 	  break;
 
-	_glthread_UNLOCK_MUTEX(header->mutex);
+	pipe_mutex_unlock(header->mutex);
 	if (count != DRI_SLABPOOL_ALLOC_RETRIES)
 	    usleep(1);
-	_glthread_LOCK_MUTEX(header->mutex);
+	pipe_mutex_lock(header->mutex);
 	(void) driAllocSlab(header);
 	count--;
     }
 
     list = header->slabs.next;
     if (list == &header->slabs) {
-	_glthread_UNLOCK_MUTEX(header->mutex);
+	pipe_mutex_unlock(header->mutex);
 	return NULL;
     }
     slab = DRMLISTENTRY(struct _DriSlab, list, head);
@@ -520,7 +520,7 @@ driSlabAllocBuffer(struct _DriSlabSizeHeader *header)
     list = slab->freeBuffers.next;
     DRMLISTDELINIT(list);
 
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
     buf = DRMLISTENTRY(struct _DriSlabBuffer, list, head);
     return buf;
 }
@@ -618,7 +618,7 @@ pool_destroy(struct _DriBufferPool *driPool, void *private)
     slab = buf->parent;
     header = slab->header;
 
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_lock(header->mutex);
     buf->unFenced = 0;
     buf->mapCount = 0;
 
@@ -631,18 +631,18 @@ pool_destroy(struct _DriBufferPool *driPool, void *private)
 	driSlabFreeBufferLocked(buf);
     }
 
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
     return 0;
 }
 
 static int
 pool_waitIdle(struct _DriBufferPool *driPool, void *private,
-	      _glthread_Mutex *mutex, int lazy)
+	      pipe_mutex *mutex, int lazy)
 {
    struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private;
 
    while(buf->unFenced)
-       _glthread_COND_WAIT(buf->event, *mutex);
+       pipe_condvar_wait(buf->event, *mutex);
 
    if (!buf->fence)
      return 0;
@@ -655,7 +655,7 @@ pool_waitIdle(struct _DriBufferPool *driPool, void *private,
 
 static int
 pool_map(struct _DriBufferPool *pool, void *private, unsigned flags,
-         int hint, _glthread_Mutex *mutex, void **virtual)
+         int hint, pipe_mutex *mutex, void **virtual)
 {
    struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private;
    int busy;
@@ -689,7 +689,7 @@ pool_unmap(struct _DriBufferPool *pool, void *private)
 
    --buf->mapCount;
    if (buf->mapCount == 0 && buf->isSlabBuffer)
-       _glthread_COND_BROADCAST(buf->event);
+      pipe_condvar_broadcast(buf->event);
 
    return 0;
 }
@@ -760,7 +760,7 @@ pool_fence(struct _DriBufferPool *pool, void *private,
    buf->fenceType = bo->fenceFlags;
 
    buf->unFenced = 0;
-   _glthread_COND_BROADCAST(buf->event);
+   pipe_condvar_broadcast(buf->event);
 
    return 0;
 }
@@ -775,7 +775,7 @@ pool_kernel(struct _DriBufferPool *pool, void *private)
 
 static int
 pool_validate(struct _DriBufferPool *pool, void *private,
-	      _glthread_Mutex *mutex)
+	      pipe_mutex *mutex)
 {
    struct _DriSlabBuffer *buf = (struct _DriSlabBuffer *) private;
 
@@ -783,7 +783,7 @@ pool_validate(struct _DriBufferPool *pool, void *private,
        return 0;
 
    while(buf->mapCount != 0)
-       _glthread_COND_WAIT(buf->event, *mutex);
+      pipe_condvar_wait(buf->event, *mutex);
 
    buf->unFenced = 1;
    return 0;
@@ -799,8 +799,8 @@ driInitFreeSlabManager(uint32_t checkIntervalMsec, uint32_t slabTimeoutMsec)
     if (!tmp)
 	return NULL;
 
-    _glthread_INIT_MUTEX(tmp->mutex);
-    _glthread_LOCK_MUTEX(tmp->mutex);
+    pipe_mutex_init(tmp->mutex);
+    pipe_mutex_lock(tmp->mutex);
     tmp->slabTimeout.tv_usec = slabTimeoutMsec*1000;
     tmp->slabTimeout.tv_sec = tmp->slabTimeout.tv_usec / 1000000;
     tmp->slabTimeout.tv_usec -=  tmp->slabTimeout.tv_sec*1000000;
@@ -814,7 +814,7 @@ driInitFreeSlabManager(uint32_t checkIntervalMsec, uint32_t slabTimeoutMsec)
     DRMINITLISTHEAD(&tmp->timeoutList);
     DRMINITLISTHEAD(&tmp->unCached);
     DRMINITLISTHEAD(&tmp->cached);
-    _glthread_UNLOCK_MUTEX(tmp->mutex);
+    pipe_mutex_unlock(tmp->mutex);
 
     return tmp;
 }
@@ -827,9 +827,9 @@ driFinishFreeSlabManager(struct _DriFreeSlabManager *fMan)
     time = fMan->nextCheck;
     driTimeAdd(&time, &fMan->checkInterval);
 
-    _glthread_LOCK_MUTEX(fMan->mutex);
+    pipe_mutex_lock(fMan->mutex);
     driFreeTimeoutKBOsLocked(fMan, &time);
-    _glthread_UNLOCK_MUTEX(fMan->mutex);
+    pipe_mutex_unlock(fMan->mutex);
 
     assert(fMan->timeoutList.next == &fMan->timeoutList);
     assert(fMan->unCached.next == &fMan->unCached);
@@ -842,8 +842,8 @@ static void
 driInitSizeHeader(struct _DriSlabPool *pool, uint32_t size,
 		  struct _DriSlabSizeHeader *header)
 {
-    _glthread_INIT_MUTEX(header->mutex);
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_init(header->mutex);
+    pipe_mutex_lock(header->mutex);
 
     DRMINITLISTHEAD(&header->slabs);
     DRMINITLISTHEAD(&header->freeSlabs);
@@ -853,7 +853,7 @@ driInitSizeHeader(struct _DriSlabPool *pool, uint32_t size,
     header->slabPool = pool;
     header->bufSize = size;
 
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
 }
 
 static void
@@ -862,7 +862,7 @@ driFinishSizeHeader(struct _DriSlabSizeHeader *header)
     drmMMListHead *list, *next;
     struct _DriSlabBuffer *buf;
 
-    _glthread_LOCK_MUTEX(header->mutex);
+    pipe_mutex_lock(header->mutex);
     for (list = header->delayedBuffers.next, next = list->next;
 	 list != &header->delayedBuffers;
 	 list = next, next = list->next) {
@@ -875,7 +875,7 @@ driFinishSizeHeader(struct _DriSlabSizeHeader *header)
 	header->numDelayed--;
 	driSlabFreeBufferLocked(buf);
     }
-    _glthread_UNLOCK_MUTEX(header->mutex);
+    pipe_mutex_unlock(header->mutex);
 }
 
 static void
diff --git a/src/gallium/winsys/drm/intel/dri/intel_lock.c b/src/gallium/winsys/drm/intel/dri/intel_lock.c
index 406284c98fb..ad1c202429e 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_lock.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_lock.c
@@ -27,7 +27,7 @@
 
 
 #include "main/glheader.h"
-#include "glapi/glthread.h"
+#include "pipe/p_thread.h"
 #include <GL/internal/glcore.h>
 #include "state_tracker/st_public.h"
 #include "intel_context.h"
@@ -35,7 +35,7 @@
 
 
 
-_glthread_DECLARE_STATIC_MUTEX( lockMutex );
+pipe_static_mutex( lockMutex );
 
 
 static void
@@ -72,7 +72,7 @@ void LOCK_HARDWARE( struct intel_context *intel )
 {
     char __ret = 0;
 
-    _glthread_LOCK_MUTEX(lockMutex);
+    pipe_mutex_lock(lockMutex);
     assert(!intel->locked);
 
     DRM_CAS(intel->driHwLock, intel->hHWContext,
@@ -96,7 +96,7 @@ void UNLOCK_HARDWARE( struct intel_context *intel )
 
    DRM_UNLOCK(intel->driFd, intel->driHwLock, intel->hHWContext);
 
-   _glthread_UNLOCK_MUTEX(lockMutex);
+   pipe_mutex_unlock(lockMutex);
 
    DBG(LOCK, "%s - unlocked\n", __progname);
 }
diff --git a/src/gallium/winsys/drm/intel/dri/intel_screen.c b/src/gallium/winsys/drm/intel/dri/intel_screen.c
index 46d4861e77c..3a486481f56 100644
--- a/src/gallium/winsys/drm/intel/dri/intel_screen.c
+++ b/src/gallium/winsys/drm/intel/dri/intel_screen.c
@@ -83,7 +83,7 @@ intelCreateSurface(struct intel_screen *intelScreen, struct pipe_winsys *winsys,
                                      buffer);
 
    /* Unref the buffer we don't need it anyways */
-   pipe_buffer_reference(screen->winsys, &buffer, NULL);
+   pipe_buffer_reference(screen, &buffer, NULL);
 
    surface = screen->get_tex_surface(screen,
                                      texture,
diff --git a/src/gallium/winsys/egl_xlib/sw_winsys.c b/src/gallium/winsys/egl_xlib/sw_winsys.c
index ae81d7f801b..2fd190da52e 100644
--- a/src/gallium/winsys/egl_xlib/sw_winsys.c
+++ b/src/gallium/winsys/egl_xlib/sw_winsys.c
@@ -216,7 +216,7 @@ surface_release(struct pipe_winsys *winsys, struct pipe_surface **s)
    surf->refcount--;
    if (surf->refcount == 0) {
       if (surf->buffer)
-         pipe_buffer_reference(winsys, &surf->buffer, NULL);
+         winsys_buffer_reference(winsys, &surf->buffer, NULL);
       free(surf);
    }
    *s = NULL;
diff --git a/src/gallium/winsys/gdi/wmesa.c b/src/gallium/winsys/gdi/wmesa.c
index 730fb1b5417..ed3dd2b9277 100644
--- a/src/gallium/winsys/gdi/wmesa.c
+++ b/src/gallium/winsys/gdi/wmesa.c
@@ -463,7 +463,7 @@ wm_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s)
    surf->refcount--;
    if (surf->refcount == 0) {
       if (surf->buffer)
-	pipe_buffer_reference(winsys, &surf->buffer, NULL);
+	winsys_buffer_reference(winsys, &surf->buffer, NULL);
       free(surf);
    }
    *s = NULL;
diff --git a/src/gallium/winsys/xlib/Makefile b/src/gallium/winsys/xlib/Makefile
index ec92c790685..11c76324113 100644
--- a/src/gallium/winsys/xlib/Makefile
+++ b/src/gallium/winsys/xlib/Makefile
@@ -33,10 +33,7 @@ XLIB_WINSYS_SOURCES = \
 XLIB_WINSYS_OBJECTS = $(XLIB_WINSYS_SOURCES:.c=.o)
 
 
-ifeq ($(CONFIG_NAME), linux-cell)
-# The SPU code is in a separate .a file, unfortunately
-CELL_SPU_LIB = $(TOP)/src/gallium/drivers/cell/spu/g3d_spu.a
-endif
+# Note: CELL_SPU_LIB is only defined for cell configs
 
 LIBS = \
 	$(GALLIUM_DRIVERS) \
diff --git a/src/gallium/winsys/xlib/SConscript b/src/gallium/winsys/xlib/SConscript
index 8650f595a72..324fbef306a 100644
--- a/src/gallium/winsys/xlib/SConscript
+++ b/src/gallium/winsys/xlib/SConscript
@@ -36,8 +36,10 @@ if env['platform'] == 'linux' \
         drivers += [trace]
 
     # TODO: write a wrapper function http://www.scons.org/wiki/WrapperFunctions
-    env.SharedLibrary(
+    libgl = env.SharedLibrary(
         target ='GL',
         source = sources,
         LIBS = glapi + mesa + drivers + auxiliaries + env['LIBS'],
     )
+
+    env.InstallSharedLibrary(libgl, version=(1, 5))
diff --git a/src/gallium/winsys/xlib/glxapi.c b/src/gallium/winsys/xlib/glxapi.c
index c2ccce6f520..c059fc3edb5 100644
--- a/src/gallium/winsys/xlib/glxapi.c
+++ b/src/gallium/winsys/xlib/glxapi.c
@@ -37,6 +37,7 @@
 #include "main/glheader.h"
 #include "glapi/glapi.h"
 #include "glxapi.h"
+#include "pipe/p_thread.h"
 
 
 extern struct _glxapi_table *_real_GetGLXDispatchTable(void);
@@ -127,26 +128,13 @@ get_dispatch(Display *dpy)
 /**
  * GLX API current context.
  */
-#if defined(GLX_USE_TLS)
-PUBLIC __thread void * CurrentContext
-    __attribute__((tls_model("initial-exec")));
-#elif defined(THREADS)
-static _glthread_TSD ContextTSD;         /**< Per-thread context pointer */
-#else
-static GLXContext CurrentContext = 0;
-#endif
+pipe_tsd ContextTSD;
 
 
 static void
 SetCurrentContext(GLXContext c)
 {
-#if defined(GLX_USE_TLS)
-   CurrentContext = c;
-#elif defined(THREADS)
-   _glthread_SetTSD(&ContextTSD, c);
-#else
-   CurrentContext = c;
-#endif
+   pipe_tsd_set(&ContextTSD, c);
 }
 
 
@@ -238,13 +226,7 @@ glXGetConfig(Display *dpy, XVisualInfo *visinfo, int attrib, int *value)
 GLXContext PUBLIC
 glXGetCurrentContext(void)
 {
-#if defined(GLX_USE_TLS)
-   return CurrentContext;
-#elif defined(THREADS)
-   return (GLXContext) _glthread_GetTSD(&ContextTSD);
-#else
-   return CurrentContext;
-#endif
+   return (GLXContext) pipe_tsd_get(&ContextTSD);
 }
 
 
diff --git a/src/gallium/winsys/xlib/xm_api.c b/src/gallium/winsys/xlib/xm_api.c
index 72563404202..28bd6ceab42 100644
--- a/src/gallium/winsys/xlib/xm_api.c
+++ b/src/gallium/winsys/xlib/xm_api.c
@@ -62,7 +62,6 @@
 #include "xmesaP.h"
 #include "main/context.h"
 #include "main/framebuffer.h"
-#include "glapi/glthread.h"
 
 #include "state_tracker/st_public.h"
 #include "state_tracker/st_context.h"
@@ -75,7 +74,7 @@
 /**
  * Global X driver lock
  */
-_glthread_Mutex _xmesa_lock;
+pipe_mutex _xmesa_lock;
 
 
 int xmesa_mode;
@@ -245,10 +244,10 @@ xmesa_get_window_size(XMesaDisplay *dpy, XMesaBuffer b,
 #else
    Status stat;
 
-   _glthread_LOCK_MUTEX(_xmesa_lock);
+   pipe_mutex_lock(_xmesa_lock);
    XSync(b->xm_visual->display, 0); /* added for Chromium */
    stat = get_drawable_size(dpy, b->drawable, width, height);
-   _glthread_UNLOCK_MUTEX(_xmesa_lock);
+   pipe_mutex_unlock(_xmesa_lock);
 
    if (!stat) {
       /* probably querying a window that's recently been destroyed */
@@ -350,12 +349,17 @@ create_xmesa_buffer(XMesaDrawable d, BufferType type,
 
    if (vis->mesa_visual.depthBits == 0)
       depthFormat = PIPE_FORMAT_NONE;
+#ifdef GALLIUM_CELL /* XXX temporary for Cell! */
+   else
+      depthFormat = PIPE_FORMAT_S8Z24_UNORM;
+#else
    else if (vis->mesa_visual.depthBits <= 16)
-      depthFormat = PIPE_FORMAT_Z16_UNORM;
+      depthFormat = PIPE_FORMAT_Z16UNORM;
    else if (vis->mesa_visual.depthBits <= 24)
       depthFormat = PIPE_FORMAT_S8Z24_UNORM;
    else
       depthFormat = PIPE_FORMAT_Z32_UNORM;
+#endif
 
    if (vis->mesa_visual.stencilBits == 8) {
       if (depthFormat == PIPE_FORMAT_S8Z24_UNORM)
@@ -779,7 +783,7 @@ XMesaContext XMesaCreateContext( XMesaVisual v, XMesaContext share_list )
    uint pf;
 
    if (firstTime) {
-      _glthread_INIT_MUTEX(_xmesa_lock);
+      pipe_mutex_init(_xmesa_lock);
       firstTime = GL_FALSE;
    }
 
@@ -1299,6 +1303,7 @@ void XMesaFlush( XMesaContext c )
 #ifdef XFree86Server
       /* NOT_NEEDED */
 #else
+      st_finish(c->st);
       XSync( c->xm_visual->display, False );
 #endif
    }
diff --git a/src/gallium/winsys/xlib/xm_winsys.c b/src/gallium/winsys/xlib/xm_winsys.c
index 68ead7f528e..c4a30d37023 100644
--- a/src/gallium/winsys/xlib/xm_winsys.c
+++ b/src/gallium/winsys/xlib/xm_winsys.c
@@ -276,6 +276,39 @@ xm_buffer_destroy(struct pipe_winsys *pws,
 
 
 /**
+ * For Cell.  Basically, rearrange the pixels/quads from this layout:
+ *  +--+--+--+--+
+ *  |p0|p1|p2|p3|....
+ *  +--+--+--+--+
+ *
+ * to this layout:
+ *  +--+--+
+ *  |p0|p1|....
+ *  +--+--+
+ *  |p2|p3|
+ *  +--+--+
+ */
+static void
+twiddle_tile(uint *tile)
+{
+   uint tile2[TILE_SIZE * TILE_SIZE];
+   int y, x;
+
+   for (y = 0; y < TILE_SIZE; y+=2) {
+      for (x = 0; x < TILE_SIZE; x+=2) {
+         int k = 4 * (y/2 * TILE_SIZE/2 + x/2);
+         tile2[y * TILE_SIZE + (x + 0)] = tile[k];
+         tile2[y * TILE_SIZE + (x + 1)] = tile[k+1];
+         tile2[(y + 1) * TILE_SIZE + (x + 0)] = tile[k+2];
+         tile2[(y + 1) * TILE_SIZE + (x + 1)] = tile[k+3];
+      }
+   }
+   memcpy(tile, tile2, sizeof(tile2));
+}
+
+
+
+/**
  * Display a surface that's in a tiled configuration.  That is, all the
  * pixels for a TILE_SIZExTILE_SIZE block are contiguous in memory.
  */
@@ -287,16 +320,16 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
    const uint tilesPerRow = (surf->width + TILE_SIZE - 1) / TILE_SIZE;
    uint x, y;
 
-   /* check that the XImage has been previously initialized */
-   assert(ximage->format);
-   assert(ximage->bitmap_unit);
-
    if (XSHM_ENABLED(xm_buf) && (xm_buf->tempImage == NULL)) {
       alloc_shm_ximage(xm_buf, b, TILE_SIZE, TILE_SIZE);
    }
 
    ximage = (XSHM_ENABLED(xm_buf)) ? xm_buf->tempImage : b->tempImage;
 
+   /* check that the XImage has been previously initialized */
+   assert(ximage->format);
+   assert(ximage->bitmap_unit);
+
    if (!XSHM_ENABLED(xm_buf)) {
       /* update XImage's fields */
       ximage->width = TILE_SIZE;
@@ -306,24 +339,32 @@ xmesa_display_surface_tiled(XMesaBuffer b, const struct pipe_surface *surf)
 
    for (y = 0; y < surf->height; y += TILE_SIZE) {
       for (x = 0; x < surf->width; x += TILE_SIZE) {
-         int dx = x;
-         int dy = y;
          int tx = x / TILE_SIZE;
          int ty = y / TILE_SIZE;
          int offset = ty * tilesPerRow + tx;
+         int w = TILE_SIZE;
+         int h = TILE_SIZE;
+
+         if (y + h > surf->height)
+            h = surf->height - y;
+         if (x + w > surf->width)
+            w = surf->width - x;
 
          offset *= 4 * TILE_SIZE * TILE_SIZE;
 
          ximage->data = (char *) xm_buf->data + offset;
 
+         twiddle_tile((uint *) ximage->data);
+
          if (XSHM_ENABLED(xm_buf)) {
 #if defined(USE_XSHM) && !defined(XFree86Server)
             XShmPutImage(b->xm_visual->display, b->drawable, b->gc,
-                         ximage, 0, 0, x, y, TILE_SIZE, TILE_SIZE, False);
+                         ximage, 0, 0, x, y, w, h, False);
 #endif
-         } else {
+         }
+         else {
             XPutImage(b->xm_visual->display, b->drawable, b->gc,
-                      ximage, 0, 0, dx, dy, TILE_SIZE, TILE_SIZE);
+                      ximage, 0, 0, x, y, w, h);
          }
       }
    }
@@ -543,7 +584,7 @@ xm_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s)
    surf->refcount--;
    if (surf->refcount == 0) {
       if (surf->buffer)
-	pipe_buffer_reference(winsys, &surf->buffer, NULL);
+	winsys_buffer_reference(winsys, &surf->buffer, NULL);
       free(surf);
    }
    *s = NULL;
diff --git a/src/gallium/winsys/xlib/xm_winsys_aub.c b/src/gallium/winsys/xlib/xm_winsys_aub.c
index 35c4ebc4ba3..b7c10b6bcae 100644
--- a/src/gallium/winsys/xlib/xm_winsys_aub.c
+++ b/src/gallium/winsys/xlib/xm_winsys_aub.c
@@ -308,7 +308,7 @@ aub_i915_surface_release(struct pipe_winsys *winsys, struct pipe_surface **s)
    surf->refcount--;
    if (surf->refcount == 0) {
       if (surf->buffer)
-         pipe_buffer_reference(winsys, &surf->buffer, NULL);
+         winsys_buffer_reference(winsys, &surf->buffer, NULL);
       free(surf);
    }
    *s = NULL;
diff --git a/src/gallium/winsys/xlib/xmesaP.h b/src/gallium/winsys/xlib/xmesaP.h
index 9b15b2ddf99..fcaeee52bcf 100644
--- a/src/gallium/winsys/xlib/xmesaP.h
+++ b/src/gallium/winsys/xlib/xmesaP.h
@@ -35,9 +35,10 @@
 
 #include "state_tracker/st_context.h"
 #include "state_tracker/st_public.h"
+#include "pipe/p_thread.h"
 
 
-extern _glthread_Mutex _xmesa_lock;
+extern pipe_mutex _xmesa_lock;
 
 extern XMesaBuffer XMesaBufferList;