21 files changed, 919 insertions, 592 deletions
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 98554d7f521..1f6860da119 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -49,6 +49,15 @@
    }
 
 
+
+#define JOIN(x, y) JOIN_AGAIN(x, y)
+#define JOIN_AGAIN(x, y) x ## y
+
+#define STATIC_ASSERT(e) \
+{typedef char JOIN(assertion_failed_at_line_, __LINE__) [(e) ? 1 : -1];}
+
+
+
 /** for sanity checking */
 #define ASSERT_ALIGN16(ptr) \
   ASSERT((((unsigned long) (ptr)) & 0xf) == 0);
@@ -134,6 +143,11 @@ struct cell_fence
    volatile uint status[CELL_MAX_SPUS][4];
 };
 
+#ifdef __SPU__
+typedef vector unsigned int opcode_t;
+#else
+typedef unsigned int opcode_t[4];
+#endif
 
 /**
  * Fence command sent to SPUs.  In response, the SPUs will write
@@ -141,8 +155,9 @@ struct cell_fence
  */
 struct cell_command_fence
 {
-   uint64_t opcode;      /**< CELL_CMD_FENCE */
+   opcode_t opcode;      /**< CELL_CMD_FENCE */
    struct cell_fence *fence;
+   uint32_t pad_[3];
 };
 
 
@@ -163,7 +178,7 @@ struct cell_command_fence
  */
 struct cell_command_fragment_ops
 {
-   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_OPS */
 
    /* Fields for the fallback case */
    struct pipe_depth_stencil_alpha_state dsa;
@@ -189,8 +204,9 @@ struct cell_command_fragment_ops
  */
 struct cell_command_fragment_program
 {
-   uint64_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
+   opcode_t opcode;      /**< CELL_CMD_STATE_FRAGMENT_PROGRAM */
    uint num_inst;        /**< Number of instructions */
+   uint32_t pad[3];
    unsigned code[SPU_MAX_FRAGMENT_PROGRAM_INSTS];
 };
 
@@ -200,10 +216,11 @@ struct cell_command_fragment_program
  */
 struct cell_command_framebuffer
 {
-   uint64_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
+   opcode_t opcode;     /**< CELL_CMD_STATE_FRAMEBUFFER */
    int width, height;
    void *color_start, *depth_start;
    enum pipe_format color_format, depth_format;
+   uint32_t pad_[2];
 };
 
 
@@ -212,7 +229,7 @@ struct cell_command_framebuffer
  */
 struct cell_command_rasterizer
 {
-   uint64_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
+   opcode_t opcode;    /**< CELL_CMD_STATE_RASTERIZER */
    struct pipe_rasterizer_state rasterizer;
 };
 
@@ -222,9 +239,10 @@ struct cell_command_rasterizer
  */
 struct cell_command_clear_surface
 {
-   uint64_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
+   opcode_t opcode;     /**< CELL_CMD_CLEAR_SURFACE */
    uint surface; /**< Temporary: 0=color, 1=Z */
    uint value;
+   uint32_t pad[2];
 };
 
 
@@ -271,7 +289,7 @@ struct cell_shader_info
 #define SPU_VERTS_PER_BATCH 64
 struct cell_command_vs
 {
-   uint64_t opcode;       /**< CELL_CMD_VS_EXECUTE */
+   opcode_t opcode;       /**< CELL_CMD_VS_EXECUTE */
    uint64_t vOut[SPU_VERTS_PER_BATCH];
    unsigned num_elts;
    unsigned elts[SPU_VERTS_PER_BATCH];
@@ -283,7 +301,7 @@ struct cell_command_vs
 
 struct cell_command_render
 {
-   uint64_t opcode;   /**< CELL_CMD_RENDER */
+   opcode_t opcode;   /**< CELL_CMD_RENDER */
    uint prim_type;    /**< PIPE_PRIM_x */
    uint num_verts;
    uint vertex_size;  /**< bytes per vertex */
@@ -292,27 +310,30 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint32_t pad_[1];
 };
 
 
 struct cell_command_release_verts
 {
-   uint64_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
+   opcode_t opcode;         /**< CELL_CMD_RELEASE_VERTS */
    uint vertex_buf;    /**< in [0, CELL_NUM_BUFFERS-1] */
+   uint32_t pad_[3];
 };
 
 
 struct cell_command_sampler
 {
-   uint64_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
+   opcode_t opcode;         /**< CELL_CMD_STATE_SAMPLER */
    uint unit;
    struct pipe_sampler_state state;
+   uint32_t pad_[1];
 };
 
 
 struct cell_command_texture
 {
-   uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   opcode_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
    uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
    void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.c b/src/gallium/drivers/cell/ppu/cell_batch.c
index 962775cd335..fe144f8b849 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.c
+++ b/src/gallium/drivers/cell/ppu/cell_batch.c
@@ -108,15 +108,16 @@ emit_fence(struct cell_context *cell)
       fence->status[i][0] = CELL_FENCE_EMITTED;
    }
 
+   STATIC_ASSERT(sizeof(struct cell_command_fence) % 16 == 0);
+   ASSERT(size % 16 == 0);
    ASSERT(size + sizeof(struct cell_command_fence) <= CELL_BUFFER_SIZE);
 
    fence_cmd = (struct cell_command_fence *) (cell->buffer[batch] + size);
-   fence_cmd->opcode = CELL_CMD_FENCE;
+   fence_cmd->opcode[0] = CELL_CMD_FENCE;
    fence_cmd->fence = fence;
 
    /* update batch buffer size */
    cell->buffer_size[batch] = size + sizeof(struct cell_command_fence);
-   assert(sizeof(struct cell_command_fence) % 8 == 0);
 }
 
 
@@ -192,69 +193,18 @@ cell_batch_free_space(const struct cell_context *cell)
 
 
 /**
- * Append data to the current batch buffer.
- * \param data  address of block of bytes to append
- * \param bytes  size of block of bytes
- */
-void
-cell_batch_append(struct cell_context *cell, const void *data, uint bytes)
-{
-   uint size;
-
-   ASSERT(bytes % 8 == 0);
-   ASSERT(bytes <= CELL_BUFFER_SIZE);
-   ASSERT(cell->cur_batch >= 0);
-
-#ifdef ASSERT
-   {
-      uint spu;
-      for (spu = 0; spu < cell->num_spus; spu++) {
-         ASSERT(cell->buffer_status[spu][cell->cur_batch][0]
-                 == CELL_BUFFER_STATUS_USED);
-      }
-   }
-#endif
-
-   size = cell->buffer_size[cell->cur_batch];
-
-   if (bytes > cell_batch_free_space(cell)) {
-      cell_batch_flush(cell);
-      size = 0;
-   }
-
-   ASSERT(size + bytes <= CELL_BUFFER_SIZE);
-
-   memcpy(cell->buffer[cell->cur_batch] + size, data, bytes);
-
-   cell->buffer_size[cell->cur_batch] = size + bytes;
-}
-
-
-/**
  * Allocate space in the current batch buffer for 'bytes' space.
+ * Bytes must be a multiple of 16 bytes.  Allocation will be 16 byte aligned.
  * \return address in batch buffer to put data
  */
 void *
-cell_batch_alloc(struct cell_context *cell, uint bytes)
-{
-   return cell_batch_alloc_aligned(cell, bytes, 1);
-}
-
-
-/**
- * Same as \sa cell_batch_alloc, but return an address at a particular
- * alignment.
- */
-void *
-cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
-                         uint alignment)
+cell_batch_alloc16(struct cell_context *cell, uint bytes)
 {
    void *pos;
-   uint size, padbytes;
+   uint size;
 
-   ASSERT(bytes % 8 == 0);
+   ASSERT(bytes % 16 == 0);
    ASSERT(bytes <= CELL_BUFFER_SIZE);
-   ASSERT(alignment > 0);
    ASSERT(cell->cur_batch >= 0);
 
 #ifdef ASSERT
@@ -269,17 +219,12 @@ cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
 
    size = cell->buffer_size[cell->cur_batch];
 
-   padbytes = (alignment - (size % alignment)) % alignment;
-
-   if (padbytes + bytes > cell_batch_free_space(cell)) {
+   if (bytes > cell_batch_free_space(cell)) {
       cell_batch_flush(cell);
       size = 0;
    }
-   else {
-      size += padbytes;
-   }
 
-   ASSERT(size % alignment == 0);
+   ASSERT(size % 16 == 0);
    ASSERT(size + bytes <= CELL_BUFFER_SIZE);
 
    pos = (void *) (cell->buffer[cell->cur_batch] + size);
diff --git a/src/gallium/drivers/cell/ppu/cell_batch.h b/src/gallium/drivers/cell/ppu/cell_batch.h
index f74dd600791..290136031a1 100644
--- a/src/gallium/drivers/cell/ppu/cell_batch.h
+++ b/src/gallium/drivers/cell/ppu/cell_batch.h
@@ -44,15 +44,8 @@ cell_batch_flush(struct cell_context *cell);
 extern uint
 cell_batch_free_space(const struct cell_context *cell);
 
-extern void
-cell_batch_append(struct cell_context *cell, const void *data, uint bytes);
-
-extern void *
-cell_batch_alloc(struct cell_context *cell, uint bytes);
-
 extern void *
-cell_batch_alloc_aligned(struct cell_context *cell, uint bytes,
-                         uint alignment);
+cell_batch_alloc16(struct cell_context *cell, uint bytes);
 
 extern void
 cell_init_batch_buffers(struct cell_context *cell);
diff --git a/src/gallium/drivers/cell/ppu/cell_clear.c b/src/gallium/drivers/cell/ppu/cell_clear.c
index 037635e4660..c2e276988ca 100644
--- a/src/gallium/drivers/cell/ppu/cell_clear.c
+++ b/src/gallium/drivers/cell/ppu/cell_clear.c
@@ -99,10 +99,11 @@ cell_clear_surface(struct pipe_context *pipe, struct pipe_surface *ps,
 
    /* Build a CLEAR command and place it in the current batch buffer */
    {
+      STATIC_ASSERT(sizeof(struct cell_command_clear_surface) % 16 == 0);
       struct cell_command_clear_surface *clr
          = (struct cell_command_clear_surface *)
-         cell_batch_alloc(cell, sizeof(*clr));
-      clr->opcode = CELL_CMD_CLEAR_SURFACE;
+         cell_batch_alloc16(cell, sizeof(*clr));
+      clr->opcode[0] = CELL_CMD_CLEAR_SURFACE;
       clr->surface = surfIndex;
       clr->value = clearValue;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_flush.c b/src/gallium/drivers/cell/ppu/cell_flush.c
index a64967b4b9e..8275c9dc9c7 100644
--- a/src/gallium/drivers/cell/ppu/cell_flush.c
+++ b/src/gallium/drivers/cell/ppu/cell_flush.c
@@ -72,8 +72,9 @@ cell_flush_int(struct cell_context *cell, unsigned flags)
    flushing = TRUE;
 
    if (flags & CELL_FLUSH_WAIT) {
-      uint64_t *cmd = (uint64_t *) cell_batch_alloc(cell, sizeof(uint64_t));
-      *cmd = CELL_CMD_FINISH;
+      STATIC_ASSERT(sizeof(opcode_t) % 16 == 0);
+      opcode_t *cmd = (opcode_t*) cell_batch_alloc16(cell, sizeof(opcode_t));
+      *cmd[0] = CELL_CMD_FINISH;
    }
 
    cell_batch_flush(cell);
@@ -101,11 +102,11 @@ void
 cell_flush_buffer_range(struct cell_context *cell, void *ptr,
 			unsigned size)
 {
-   uint64_t batch[1 + (ROUNDUP8(sizeof(struct cell_buffer_range)) / 8)];
-   struct cell_buffer_range *br = (struct cell_buffer_range *) & batch[1];
-
+   STATIC_ASSERT((sizeof(opcode_t) + sizeof(struct cell_buffer_range)) % 16 == 0);
+   uint32_t *batch = (uint32_t*)cell_batch_alloc16(cell, 
+      sizeof(opcode_t) + sizeof(struct cell_buffer_range));
+   struct cell_buffer_range *br = (struct cell_buffer_range *) &batch[4];
    batch[0] = CELL_CMD_FLUSH_BUFFER_RANGE;
    br->base = (uintptr_t) ptr;
    br->size = size;
-   cell_batch_append(cell, batch, sizeof(batch));
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 2c64eb1bccd..0ea8f017ef9 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -2,6 +2,7 @@
  * 
  * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
  * All Rights Reserved.
+ * Copyright 2009 VMware, Inc.  All Rights Reserved.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the
@@ -25,11 +26,10 @@
  * 
  **************************************************************************/
 
-
-
 /**
  * Generate SPU per-fragment code (actually per-quad code).
  * \author Brian Paul
+ * \author Bob Ellison
  */
 
 
@@ -55,7 +55,7 @@
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
  *
- * Returns true if the Z-buffer needs to be updated.
+ * Returns TRUE if the Z-buffer needs to be updated.
  */
 static boolean
 gen_depth_test(struct spe_function *f,
@@ -134,10 +134,10 @@ gen_depth_test(struct spe_function *f,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
-      return true;
+      return TRUE;
    }
 
-   return false;
+   return FALSE;
 }
 
 
@@ -237,41 +237,136 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
    spe_release_register(f, amask_reg);
 }
 
-/* This pair of functions is used inline to allocate and deallocate
+
+/**
+ * This pair of functions is used inline to allocate and deallocate
  * optional constant registers.  Once a constant is discovered to be 
  * needed, we will likely need it again, so we don't want to deallocate
  * it and have to allocate and load it again unnecessarily.
  */
-static inline void
-setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
+static INLINE void
+setup_optional_register(struct spe_function *f,
+                        int *r)
 {
-   if (*is_already_set) return;
-   *r = spe_allocate_available_register(f);
-   *is_already_set = true;
+   if (*r < 0)
+      *r = spe_allocate_available_register(f);
 }
 
-static inline void
-release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+static INLINE void
+release_optional_register(struct spe_function *f,
+                          int r)
 {
-    if (!*is_already_set) return;
-    spe_release_register(f, r);
-    *is_already_set = false;
+   if (r >= 0)
+      spe_release_register(f, r);
 }
 
-static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+static INLINE void
+setup_const_register(struct spe_function *f,
+                     int *r,
+                     float value)
 {
-   if (*is_already_set) return;
-   setup_optional_register(f, is_already_set, r);
+   if (*r >= 0)
+      return;
+   setup_optional_register(f, r);
    spe_load_float(f, *r, value);
 }
 
-static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+static INLINE void
+release_const_register(struct spe_function *f,
+                       int r)
 {
-    release_optional_register(f, is_already_set, r);
+   release_optional_register(f, r);
 }
 
+
+
+/**
+ * Unpack/convert framebuffer colors from four 32-bit packed colors
+ * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
+ * Each 8-bit color component is expanded into a float in [0.0, 1.0].
+ */
+static void
+unpack_colors(struct spe_function *f,
+              enum pipe_format color_format,
+              int fbRGBA_reg,
+              int fbR_reg, int fbG_reg, int fbB_reg, int fbA_reg)
+{
+   int mask0_reg = spe_allocate_available_register(f);
+   int mask1_reg = spe_allocate_available_register(f);
+   int mask2_reg = spe_allocate_available_register(f);
+   int mask3_reg = spe_allocate_available_register(f);
+
+   spe_load_int(f, mask0_reg, 0xff);
+   spe_load_int(f, mask1_reg, 0xff00);
+   spe_load_int(f, mask2_reg, 0xff0000);
+   spe_load_int(f, mask3_reg, 0xff000000);
+
+   spe_comment(f, 0, "Unpack framebuffer colors, convert to floats");
+
+   switch (color_format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      /* fbB = fbRGBA & mask */
+      spe_and(f, fbB_reg, fbRGBA_reg, mask0_reg);
+
+      /* fbG = fbRGBA & mask */
+      spe_and(f, fbG_reg, fbRGBA_reg, mask1_reg);
+
+      /* fbR = fbRGBA & mask */
+      spe_and(f, fbR_reg, fbRGBA_reg, mask2_reg);
+
+      /* fbA = fbRGBA & mask */
+      spe_and(f, fbA_reg, fbRGBA_reg, mask3_reg);
+
+      /* fbG = fbG >> 8 */
+      spe_roti(f, fbG_reg, fbG_reg, -8);
+
+      /* fbR = fbR >> 16 */
+      spe_roti(f, fbR_reg, fbR_reg, -16);
+
+      /* fbA = fbA >> 24 */
+      spe_roti(f, fbA_reg, fbA_reg, -24);
+      break;
+
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      /* fbA = fbRGBA & mask */
+      spe_and(f, fbA_reg, fbRGBA_reg, mask0_reg);
+
+      /* fbR = fbRGBA & mask */
+      spe_and(f, fbR_reg, fbRGBA_reg, mask1_reg);
+
+      /* fbG = fbRGBA & mask */
+      spe_and(f, fbG_reg, fbRGBA_reg, mask2_reg);
+
+      /* fbB = fbRGBA & mask */
+      spe_and(f, fbB_reg, fbRGBA_reg, mask3_reg);
+
+      /* fbR = fbR >> 8 */
+      spe_roti(f, fbR_reg, fbR_reg, -8);
+
+      /* fbG = fbG >> 16 */
+      spe_roti(f, fbG_reg, fbG_reg, -16);
+
+      /* fbB = fbB >> 24 */
+      spe_roti(f, fbB_reg, fbB_reg, -24);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+
+   /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
+   spe_cuflt(f, fbR_reg, fbR_reg, 8);
+   spe_cuflt(f, fbG_reg, fbG_reg, 8);
+   spe_cuflt(f, fbB_reg, fbB_reg, 8);
+   spe_cuflt(f, fbA_reg, fbA_reg, 8);
+
+   spe_release_register(f, mask0_reg);
+   spe_release_register(f, mask1_reg);
+   spe_release_register(f, mask2_reg);
+   spe_release_register(f, mask3_reg);
+}
+
+
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -310,90 +405,14 @@ gen_blend(const struct pipe_blend_state *blend,
     * if we do use them, make sure we only allocate them once by
     * keeping a flag on each one.
     */
-   boolean one_reg_set = false;
-   unsigned int one_reg;
-   boolean constR_reg_set = false, constG_reg_set = false, 
-      constB_reg_set = false, constA_reg_set = false;
-   unsigned int constR_reg, constG_reg, constB_reg, constA_reg;
+   int one_reg = -1;
+   int constR_reg = -1, constG_reg = -1, constB_reg = -1, constA_reg = -1;
 
    ASSERT(blend->blend_enable);
 
-   /* Unpack/convert framebuffer colors from four 32-bit packed colors
-    * (fbRGBA) to four float RGBA vectors (fbR, fbG, fbB, fbA).
-    * Each 8-bit color component is expanded into a float in [0.0, 1.0].
-    */
-   {
-      int mask_reg = spe_allocate_available_register(f);
-
-      /* mask = {0x000000ff, 0x000000ff, 0x000000ff, 0x000000ff} */
-      spe_load_int(f, mask_reg, 0xff);
-
-      /* XXX there may be more clever ways to implement the following code */
-      switch (color_format) {
-      case PIPE_FORMAT_A8R8G8B8_UNORM:
-         /* fbB = fbB & mask */
-         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
-         /* mask = mask << 8 */
-         spe_roti(f, mask_reg, mask_reg, 8);
-
-         /* fbG = fbRGBA & mask */
-         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
-         /* fbG = fbG >> 8 */
-         spe_roti(f, fbG_reg, fbG_reg, -8);
-         /* mask = mask << 8 */
-         spe_roti(f, mask_reg, mask_reg, 8);
-
-         /* fbR = fbRGBA & mask */
-         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
-         /* fbR = fbR >> 16 */
-         spe_roti(f, fbR_reg, fbR_reg, -16);
-         /* mask = mask << 8 */
-         spe_roti(f, mask_reg, mask_reg, 8);
-
-         /* fbA = fbRGBA & mask */
-         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
-         /* fbA = fbA >> 24 */
-         spe_roti(f, fbA_reg, fbA_reg, -24);
-         break;
-
-      case PIPE_FORMAT_B8G8R8A8_UNORM:
-         /* fbA = fbA & mask */
-         spe_and(f, fbA_reg, fbRGBA_reg, mask_reg);
-         /* mask = mask << 8 */
-         spe_roti(f, mask_reg, mask_reg, 8);
-
-         /* fbR = fbRGBA & mask */
-         spe_and(f, fbR_reg, fbRGBA_reg, mask_reg);
-         /* fbR = fbR >> 8 */
-         spe_roti(f, fbR_reg, fbR_reg, -8);
-         /* mask = mask << 8 */
-         spe_roti(f, mask_reg, mask_reg, 8);
-
-         /* fbG = fbRGBA & mask */
-         spe_and(f, fbG_reg, fbRGBA_reg, mask_reg);
-         /* fbG = fbG >> 16 */
-         spe_roti(f, fbG_reg, fbG_reg, -16);
-         /* mask = mask << 8 */
-         spe_roti(f, mask_reg, mask_reg, 8);
-
-         /* fbB = fbRGBA & mask */
-         spe_and(f, fbB_reg, fbRGBA_reg, mask_reg);
-         /* fbB = fbB >> 24 */
-         spe_roti(f, fbB_reg, fbB_reg, -24);
-         break;
-
-      default:
-         ASSERT(0);
-      }
-
-      /* convert int[4] in [0,255] to float[4] in [0.0, 1.0] */
-      spe_cuflt(f, fbR_reg, fbR_reg, 8);
-      spe_cuflt(f, fbG_reg, fbG_reg, 8);
-      spe_cuflt(f, fbB_reg, fbB_reg, 8);
-      spe_cuflt(f, fbA_reg, fbA_reg, 8);
-
-      spe_release_register(f, mask_reg);
-   }
+   /* packed RGBA -> float colors */
+   unpack_colors(f, color_format, fbRGBA_reg,
+                 fbR_reg, fbG_reg, fbB_reg, fbA_reg);
 
    /*
     * Compute Src RGB terms.  We're actually looking for the value
@@ -476,9 +495,9 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
       /* We need the optional constant color registers */
-      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
-      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
-      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
       /* now, factor = (Rc,Gc,Bc), so term = (R*Rc,G*Gc,B*Bc) */
       spe_fm(f, term1R_reg, fragR_reg, constR_reg);
       spe_fm(f, term1G_reg, fragG_reg, constG_reg);
@@ -486,7 +505,7 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_CONST_ALPHA:
       /* we'll need the optional constant alpha register */
-      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
       /* factor = (Ac,Ac,Ac), so term = (R*Ac,G*Ac,B*Ac) */
       spe_fm(f, term1R_reg, fragR_reg, constA_reg);
       spe_fm(f, term1G_reg, fragG_reg, constA_reg);
@@ -494,9 +513,9 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
       /* We need the optional constant color registers */
-      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
-      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
-      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
       /* factor = (1-Rc,1-Gc,1-Bc), so term = (R*(1-Rc),G*(1-Gc),B*(1-Bc)) 
        * or term = (R-R*Rc, G-G*Gc, B-B*Bc)
        * fnms(a,b,c,d) computes a = d - b*c
@@ -507,9 +526,9 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
       /* We need the optional constant color registers */
-      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
-      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
-      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
       /* factor = (1-Ac,1-Ac,1-Ac), so term = (R*(1-Ac),G*(1-Ac),B*(1-Ac))
        * or term = (R-R*Ac,G-G*Ac,B-B*Ac)
        * fnms(a,b,c,d) computes a = d - b*c
@@ -520,7 +539,7 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_SRC_ALPHA_SATURATE:
       /* We'll need the optional {1,1,1,1} register */
-      setup_const_register(f, &one_reg_set, &one_reg, 1.0f);
+      setup_const_register(f, &one_reg, 1.0f);
       /* factor = (min(A,1-Afb),min(A,1-Afb),min(A,1-Afb)), so 
        * term = (R*min(A,1-Afb), G*min(A,1-Afb), B*min(A,1-Afb))
        * We could expand the term (as a*min(b,c) == min(a*b,a*c)
@@ -598,7 +617,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
    case PIPE_BLENDFACTOR_CONST_COLOR:
       /* We need the optional constA_reg register */
-      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
       /* factor = Ac, so term = A*Ac */
       spe_fm(f, term1A_reg, fragA_reg, constA_reg);
       break;
@@ -606,7 +625,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
       /* We need the optional constA_reg register */
-      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
       /* factor = 1-Ac, so term = A*(1-Ac) = A-A*Ac */
       /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term1A_reg, fragA_reg, constA_reg, fragA_reg);
@@ -703,9 +722,9 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_CONST_COLOR:
       /* We need the optional constant color registers */
-      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
-      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
-      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
       /* now, factor = (Rc,Gc,Bc), so term = (Rfb*Rc,Gfb*Gc,Bfb*Bc) */
       spe_fm(f, term2R_reg, fbR_reg, constR_reg);
       spe_fm(f, term2G_reg, fbG_reg, constG_reg);
@@ -713,7 +732,7 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_CONST_ALPHA:
       /* we'll need the optional constant alpha register */
-      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
       /* factor = (Ac,Ac,Ac), so term = (Rfb*Ac,Gfb*Ac,Bfb*Ac) */
       spe_fm(f, term2R_reg, fbR_reg, constA_reg);
       spe_fm(f, term2G_reg, fbG_reg, constA_reg);
@@ -721,9 +740,9 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
       /* We need the optional constant color registers */
-      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
-      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
-      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
       /* factor = (1-Rc,1-Gc,1-Bc), so term = (Rfb*(1-Rc),Gfb*(1-Gc),Bfb*(1-Bc)) 
        * or term = (Rfb-Rfb*Rc, Gfb-Gfb*Gc, Bfb-Bfb*Bc)
        * fnms(a,b,c,d) computes a = d - b*c
@@ -734,9 +753,9 @@ gen_blend(const struct pipe_blend_state *blend,
       break;
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA:
       /* We need the optional constant color registers */
-      setup_const_register(f, &constR_reg_set, &constR_reg, blend_color->color[0]);
-      setup_const_register(f, &constG_reg_set, &constG_reg, blend_color->color[1]);
-      setup_const_register(f, &constB_reg_set, &constB_reg, blend_color->color[2]);
+      setup_const_register(f, &constR_reg, blend_color->color[0]);
+      setup_const_register(f, &constG_reg, blend_color->color[1]);
+      setup_const_register(f, &constB_reg, blend_color->color[2]);
       /* factor = (1-Ac,1-Ac,1-Ac), so term = (Rfb*(1-Ac),Gfb*(1-Ac),Bfb*(1-Ac))
        * or term = (Rfb-Rfb*Ac,Gfb-Gfb*Ac,Bfb-Bfb*Ac)
        * fnms(a,b,c,d) computes a = d - b*c
@@ -806,7 +825,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_CONST_ALPHA: /* fall through */
    case PIPE_BLENDFACTOR_CONST_COLOR:
       /* We need the optional constA_reg register */
-      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
       /* factor = Ac, so term = Afb*Ac */
       spe_fm(f, term2A_reg, fbA_reg, constA_reg);
       break;
@@ -814,7 +833,7 @@ gen_blend(const struct pipe_blend_state *blend,
    case PIPE_BLENDFACTOR_INV_CONST_ALPHA: /* fall through */
    case PIPE_BLENDFACTOR_INV_CONST_COLOR:
       /* We need the optional constA_reg register */
-      setup_const_register(f, &constA_reg_set, &constA_reg, blend_color->color[3]);
+      setup_const_register(f, &constA_reg, blend_color->color[3]);
       /* factor = 1-Ac, so term = Afb*(1-Ac) = Afb-Afb*Ac */
       /* fnms(a,b,c,d) computes a = d - b*c */
       spe_fnms(f, term2A_reg, fbA_reg, constA_reg, fbA_reg);
@@ -910,11 +929,11 @@ gen_blend(const struct pipe_blend_state *blend,
    spe_release_register(f, tmp_reg);
 
    /* Free any optional registers that actually got used */
-   release_const_register(f, &one_reg_set, one_reg);
-   release_const_register(f, &constR_reg_set, constR_reg);
-   release_const_register(f, &constG_reg_set, constG_reg);
-   release_const_register(f, &constB_reg_set, constB_reg);
-   release_const_register(f, &constA_reg_set, constA_reg);
+   release_const_register(f, one_reg);
+   release_const_register(f, constR_reg);
+   release_const_register(f, constG_reg);
+   release_const_register(f, constB_reg);
+   release_const_register(f, constA_reg);
 }
 
 
@@ -1055,6 +1074,7 @@ gen_pack_colors(struct spe_function *f,
    spe_release_register(f, ba_reg);
 }
 
+
 static void
 gen_colormask(struct spe_function *f,
               uint colormask,
@@ -1067,10 +1087,10 @@ gen_colormask(struct spe_function *f,
     * are packed according to the given color format, not
     * necessarily RGBA...
     */
-   unsigned int r_mask;
-   unsigned int g_mask;
-   unsigned int b_mask;
-   unsigned int a_mask;
+   uint r_mask;
+   uint g_mask;
+   uint b_mask;
+   uint a_mask;
 
    /* Calculate exactly where the bits for any particular color
     * end up, so we can mask them correctly.
@@ -1111,11 +1131,13 @@ gen_colormask(struct spe_function *f,
       a_mask = 0;
    }
 
-   /* Get a temporary register to hold the mask that will be applied to the fragment */
+   /* Get a temporary register to hold the mask that will be applied
+    * to the fragment
+    */
    int colormask_reg = spe_allocate_available_register(f);
 
-   /* The actual mask we're going to use is an OR of the remaining R, G, B, and A
-    * masks.  Load the result value into our temporary register.
+   /* The actual mask we're going to use is an OR of the remaining R, G, B,
+    * and A masks.  Load the result value into our temporary register.
     */
    spe_load_uint(f, colormask_reg, r_mask | g_mask | b_mask | a_mask);
 
@@ -1135,7 +1157,9 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
-/* This function is annoyingly similar to gen_depth_test(), above, except
+
+/**
+ * This function is annoyingly similar to gen_depth_test(), above, except
  * that instead of comparing two varying values (i.e. fragment and buffer),
  * we're comparing a varying value with a static value.  As such, we have
  * access to the Compare Immediate instructions where we don't in 
@@ -1146,16 +1170,20 @@ gen_colormask(struct spe_function *f,
  *
  * The return value in the stencil_pass_reg is a bitmask of valid
  * fragments that also passed the stencil test.  The bitmask of valid
- * fragments that failed would be found in (fragment_mask_reg & ~stencil_pass_reg).
+ * fragments that failed would be found in
+ * (fragment_mask_reg & ~stencil_pass_reg).
  */
 static void
-gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state, 
-                 unsigned int stencil_max_value,
-                 unsigned int fragment_mask_reg, unsigned int fbS_reg, 
-                 unsigned int stencil_pass_reg)
+gen_stencil_test(struct spe_function *f,
+                 const struct pipe_stencil_state *state, 
+                 uint stencil_max_value,
+                 int fragment_mask_reg,
+                 int fbS_reg, 
+                 int stencil_pass_reg)
 {
-   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
-    * register, taking into account whether each fragment was active to begin with.
+   /* Generate code that puts the set of passing fragments into the
+    * stencil_pass_reg register, taking into account whether each fragment
+    * was active to begin with.
     */
    switch (state->func) {
    case PIPE_FUNC_EQUAL:
@@ -1166,9 +1194,10 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       }
       else {
          /* stencil_pass = fragment_mask & ((s&mask) == (reference&mask)) */
-         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         uint tmp_masked_stencil = spe_allocate_available_register(f);
          spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
-         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                state->value_mask & state->ref_value);
          spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
          spe_release_register(f, tmp_masked_stencil);
       }
@@ -1182,9 +1211,10 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       }
       else {
          /* stencil_pass = fragment_mask & ~((s&mask) == (reference&mask)) */
-         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         int tmp_masked_stencil = spe_allocate_available_register(f);
          spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
-         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_compare_equal_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                state->value_mask & state->ref_value);
          spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
          spe_release_register(f, tmp_masked_stencil);
       }
@@ -1198,9 +1228,10 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       }
       else {
          /* stencil_pass = fragment_mask & ((reference&mask) < (s & mask)) */
-         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         int tmp_masked_stencil = spe_allocate_available_register(f);
          spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
-         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                  state->value_mask & state->ref_value);
          spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
          spe_release_register(f, tmp_masked_stencil);
       }
@@ -1214,7 +1245,7 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
           * comparing directly.  Compare Logical Greater Than Word (clgt) 
           * treats its operands as unsigned - no sign extension.
           */
-         unsigned int tmp_reg = spe_allocate_available_register(f);
+         int tmp_reg = spe_allocate_available_register(f);
          spe_load_uint(f, tmp_reg, state->ref_value);
          spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
          spe_and(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
@@ -1222,8 +1253,8 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       }
       else {
          /* stencil_pass = fragment_mask & ((reference&mask) > (s&mask)) */
-         unsigned int tmp_reg = spe_allocate_available_register(f);
-         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         int tmp_reg = spe_allocate_available_register(f);
+         int tmp_masked_stencil = spe_allocate_available_register(f);
          spe_load_uint(f, tmp_reg, state->value_mask & state->ref_value);
          spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
          spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
@@ -1237,14 +1268,16 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       if (state->value_mask == stencil_max_value) {
          /* stencil_pass = fragment_mask & (reference >= s) 
           *              = fragment_mask & ~(s > reference) */
-         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+         spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg,
+                                  state->ref_value);
          spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
       }
       else {
          /* stencil_pass = fragment_mask & ~((s&mask) > (reference&mask)) */
-         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         int tmp_masked_stencil = spe_allocate_available_register(f);
          spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
-         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil, state->value_mask & state->ref_value);
+         spe_compare_greater_uint(f, stencil_pass_reg, tmp_masked_stencil,
+                                  state->value_mask & state->ref_value);
          spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
          spe_release_register(f, tmp_masked_stencil);
       }
@@ -1255,7 +1288,7 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
          /* stencil_pass = fragment_mask & (reference <= s) ]
           *               = fragment_mask & ~(reference > s) */
          /* As above, we have to do this by loading a register */
-         unsigned int tmp_reg = spe_allocate_available_register(f);
+         int tmp_reg = spe_allocate_available_register(f);
          spe_load_uint(f, tmp_reg, state->ref_value);
          spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
          spe_andc(f, stencil_pass_reg, fragment_mask_reg, stencil_pass_reg);
@@ -1263,8 +1296,8 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
       }
       else {
          /* stencil_pass = fragment_mask & ~((reference&mask) > (s&mask)) */
-         unsigned int tmp_reg = spe_allocate_available_register(f);
-         unsigned int tmp_masked_stencil = spe_allocate_available_register(f);
+         int tmp_reg = spe_allocate_available_register(f);
+         int tmp_masked_stencil = spe_allocate_available_register(f);
          spe_load_uint(f, tmp_reg, state->ref_value & state->value_mask);
          spe_and_uint(f, tmp_masked_stencil, fbS_reg, state->value_mask);
          spe_clgt(f, stencil_pass_reg, tmp_reg, tmp_masked_stencil);
@@ -1290,7 +1323,9 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
     */
 }
 
-/* This function generates code that calculates a set of new stencil values
+
+/**
+ * This function generates code that calculates a set of new stencil values
  * given the earlier values and the operation to apply.  It does not
  * apply any tests.  It is intended to be called up to 3 times
  * (for the stencil fail operation, for the stencil pass-z fail operation,
@@ -1302,9 +1337,12 @@ gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
  * in the stencil buffer - in other words, it should be usable as a mask.
  */
 static void
-gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
-                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
-                   unsigned int fbS_reg, unsigned int newS_reg)
+gen_stencil_values(struct spe_function *f,
+                   uint stencil_op,
+                   uint stencil_ref_value,
+                   uint stencil_max_value,
+                   int fbS_reg,
+                   int newS_reg)
 {
    /* The code below assumes that newS_reg and fbS_reg are not the same
     * register; if they can be, the calculations below will have to use
@@ -1346,7 +1384,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
 
    case PIPE_STENCIL_OP_INCR: {
       /* newS = (s == max ? max : s + 1) */
-      unsigned int equals_reg = spe_allocate_available_register(f);
+      int equals_reg = spe_allocate_available_register(f);
 
       spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
       /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
@@ -1359,7 +1397,7 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
    }
    case PIPE_STENCIL_OP_DECR: {
       /* newS = (s == 0 ? 0 : s - 1) */
-      unsigned int equals_reg = spe_allocate_available_register(f);
+      int equals_reg = spe_allocate_available_register(f);
 
       spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
       /* Add Word Immediate with a (-1) value works */
@@ -1397,7 +1435,8 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
 }
 
 
-/* This function generates code to get all the necessary possible
+/**
+ * This function generates code to get all the necessary possible
  * stencil values.  For each of the output registers (fail_reg,
  * zfail_reg, and zpass_reg), it either allocates a new register
  * and calculates a new set of values based on the stencil operation,
@@ -1412,13 +1451,15 @@ gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
  * and released by the corresponding spe_release_register_set() call.
  */
 static void
-gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *stencil,
-                       const unsigned int depth_enabled,
-                       unsigned int fbS_reg, 
-                       unsigned int *fail_reg, unsigned int *zfail_reg, 
-                       unsigned int *zpass_reg)
+gen_get_stencil_values(struct spe_function *f,
+                       const struct pipe_stencil_state *stencil,
+                       const uint depth_enabled,
+                       int fbS_reg, 
+                       int *fail_reg,
+                       int *zfail_reg, 
+                       int *zpass_reg)
 {
-   unsigned zfail_op;
+   uint zfail_op;
 
    /* Stenciling had better be enabled here */
    ASSERT(stencil->enabled);
@@ -1480,7 +1521,8 @@ gen_get_stencil_values(struct spe_function *f, const struct pipe_stencil_state *
    }
 }
 
-/* Note that fbZ_reg may *not* be set on entry, if in fact
+/**
+ * Note that fbZ_reg may *not* be set on entry, if in fact
  * the depth test is not enabled.  This function must not use
  * the register if depth is not enabled.
  */
@@ -1494,7 +1536,7 @@ gen_stencil_depth_test(struct spe_function *f,
    /* True if we've generated code that could require writeback to the
     * depth and/or stencil buffers
     */
-   boolean modified_buffers = false;
+   boolean modified_buffers = FALSE;
 
    boolean need_to_calculate_stencil_values;
    boolean need_to_writemask_stencil_values;
@@ -1504,11 +1546,11 @@ gen_stencil_depth_test(struct spe_function *f,
    /* Registers.  We may or may not actually allocate these, depending
     * on whether the state values indicate that we need them.
     */
-   unsigned int stencil_pass_reg, stencil_fail_reg;
-   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
-   unsigned int stencil_writemask_reg;
-   unsigned int zmask_reg;
-   unsigned int newS_reg;
+   int stencil_pass_reg, stencil_fail_reg;
+   int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   int stencil_writemask_reg;
+   int zmask_reg;
+   int newS_reg;
 
    /* Stenciling is quite complex: up to six different configurable stencil 
     * operations/calculations can be required (three each for front-facing
@@ -1555,27 +1597,27 @@ gen_stencil_depth_test(struct spe_function *f,
    if (stencil->fail_op == PIPE_STENCIL_OP_KEEP &&
        stencil->zfail_op == PIPE_STENCIL_OP_KEEP &&
        stencil->zpass_op == PIPE_STENCIL_OP_KEEP) {
-       need_to_calculate_stencil_values = false;
-       need_to_writemask_stencil_values = false;
+       need_to_calculate_stencil_values = FALSE;
+       need_to_writemask_stencil_values = FALSE;
     }
     else if (stencil->write_mask == 0x0) {
       /* All changes are writemasked out, so no need to calculate
        * what those changes might be, and no need to write anything back.
        */
-      need_to_calculate_stencil_values = false;
-      need_to_writemask_stencil_values = false;
+      need_to_calculate_stencil_values = FALSE;
+      need_to_writemask_stencil_values = FALSE;
    }
    else if (stencil->write_mask == 0xff) {
       /* Still trivial, but a little less so.  We need to write the stencil
        * values, but we don't need to mask them.
        */
-      need_to_calculate_stencil_values = true;
-      need_to_writemask_stencil_values = false;
+      need_to_calculate_stencil_values = TRUE;
+      need_to_writemask_stencil_values = FALSE;
    }
    else {
       /* The general case: calculate, mask, and write */
-      need_to_calculate_stencil_values = true;
-      need_to_writemask_stencil_values = true;
+      need_to_calculate_stencil_values = TRUE;
+      need_to_writemask_stencil_values = TRUE;
 
       /* While we're here, generate code that calculates what the
        * writemask should be.  If backface stenciling is enabled,
@@ -1633,7 +1675,9 @@ gen_stencil_depth_test(struct spe_function *f,
        * This function will allocate a variant number of registers that
        * will be released as part of the register set.
        */
-      spe_comment(f, 0, facing == CELL_FACING_FRONT ? "Computing front-facing stencil values" : "Computing back-facing stencil values");
+      spe_comment(f, 0, facing == CELL_FACING_FRONT
+                  ? "Computing front-facing stencil values"
+                  : "Computing back-facing stencil values");
       gen_get_stencil_values(f, stencil, dsa->depth.enabled, fbS_reg, 
          &stencil_fail_values, &stencil_pass_depth_fail_values, 
          &stencil_pass_depth_pass_values);
@@ -1652,7 +1696,8 @@ gen_stencil_depth_test(struct spe_function *f,
    if (dsa->depth.enabled) {
       spe_comment(f, 0, "Running stencil depth test");
       zmask_reg = spe_allocate_available_register(f);
-      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg,
+                                         fbZ_reg, zmask_reg);
    }
 
    if (need_to_calculate_stencil_values) {
@@ -1675,7 +1720,7 @@ gen_stencil_depth_test(struct spe_function *f,
       if (stencil_fail_values != fbS_reg) {
          spe_comment(f, 0, "Loading stencil fail values");
          spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
-         modified_buffers = true;
+         modified_buffers = TRUE;
       }
 
       /* Same for the stencil pass/depth fail values.  If this calculation
@@ -1689,14 +1734,17 @@ gen_stencil_depth_test(struct spe_function *f,
           * depth passing mask.  Note that zmask_reg *must* have been
           * set above if we're here.
           */
-         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         uint stencil_pass_depth_fail_mask =
+            spe_allocate_available_register(f);
+
          spe_comment(f, 0, "Loading stencil pass/depth fail values");
          spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
 
-         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values,
+                  stencil_pass_depth_fail_mask);
 
          spe_release_register(f, stencil_pass_depth_fail_mask);
-         modified_buffers = true;
+         modified_buffers = TRUE;
       }
 
       /* Same for the stencil pass/depth pass mask.  Note that we
@@ -1707,7 +1755,7 @@ gen_stencil_depth_test(struct spe_function *f,
        */
       if (stencil_pass_depth_pass_values != fbS_reg) {
          if (dsa->depth.enabled) {
-            unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            uint stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
             /* We'll need a separate register */
             spe_comment(f, 0, "Loading stencil pass/depth pass values");
             spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
@@ -1719,7 +1767,7 @@ gen_stencil_depth_test(struct spe_function *f,
             spe_comment(f, 0, "Loading stencil pass values");
             spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
          }
-         modified_buffers = true;
+         modified_buffers = TRUE;
       }
 
       /* Almost done.  If we need to writemask, do it now, leaving the
@@ -1749,7 +1797,7 @@ gen_stencil_depth_test(struct spe_function *f,
    spe_comment(f, 0, "Releasing stencil register set");
    spe_release_register_set(f);
 
-   /* Return true if we could have modified the stencil and/or
+   /* Return TRUE if we could have modified the stencil and/or
     * depth buffers.
     */
    return modified_buffers;
@@ -1757,6 +1805,200 @@ gen_stencil_depth_test(struct spe_function *f,
 
 
 /**
+ * Generate depth and/or stencil test code.
+ * \param cell  context
+ * \param dsa  depth/stencil/alpha state
+ * \param f  spe function to emit
+ * \param facing  either CELL_FACING_FRONT or CELL_FACING_BACK
+ * \param mask_reg  register containing the pixel alive/dead mask
+ * \param depth_tile_reg  register containing address of z/stencil tile
+ * \param quad_offset_reg  offset to quad from start of tile
+ * \param fragZ_reg  register containg fragment Z values
+ */
+static void
+gen_depth_stencil(struct cell_context *cell,
+                  const struct pipe_depth_stencil_alpha_state *dsa,
+                  struct spe_function *f,
+                  uint facing,
+                  int mask_reg,
+                  int depth_tile_reg,
+                  int quad_offset_reg,
+                  int fragZ_reg)
+
+{
+   const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
+   boolean write_depth_stencil;
+
+   /* framebuffer's combined z/stencil values register */
+   int fbZS_reg = spe_allocate_available_register(f);
+
+   /* Framebufer Z values register */
+   int fbZ_reg = spe_allocate_available_register(f);
+
+   /* Framebuffer stencil values register (may not be used) */
+   int fbS_reg = spe_allocate_available_register(f);
+
+   /* 24-bit mask register (may not be used) */
+   int zmask_reg = spe_allocate_available_register(f);
+
+   /**
+    * The following code:
+    * 1. fetch quad of packed Z/S values from the framebuffer tile.
+    * 2. extract the separate the Z and S values from packed values
+    * 3. convert fragment Z values from float in [0,1] to 32/24/16-bit ints
+    *
+    * The instructions for doing this are interleaved for better performance.
+    */
+   spe_comment(f, 0, "Fetch Z/stencil quad from tile");
+
+   switch(zs_format) {
+   case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+   case PIPE_FORMAT_X8Z24_UNORM:
+      /* prepare mask to extract Z vals from ZS vals */
+      spe_load_uint(f, zmask_reg, 0x00ffffff);
+
+      /* convert fragment Z from [0,1] to 32-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      /* right shift 32-bit fragment Z to 24 bits */
+      spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+      /* extract 24-bit Z values from ZS values by masking */
+      spe_and(f, fbZ_reg, fbZS_reg, zmask_reg);
+
+      /* extract 8-bit stencil values by shifting */
+      spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+      break;
+
+   case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+   case PIPE_FORMAT_Z24X8_UNORM:
+      /* convert fragment Z from [0,1] to 32-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      /* right shift 32-bit fragment Z to 24 bits */
+      spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+
+      /* extract 24-bit Z values from ZS values by shifting */
+      spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+      /* extract 8-bit stencil values by masking */
+      spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+      break;
+
+   case PIPE_FORMAT_Z32_UNORM:
+      /* Load: fbZ_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZ_reg, depth_tile_reg, quad_offset_reg);
+
+      /* convert fragment Z from [0,1] to 32-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+
+      /* No stencil, so can't do anything there */
+      break;
+
+   case PIPE_FORMAT_Z16_UNORM:
+      /* XXX This code for 16bpp Z is broken! */
+
+      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+
+      /* Copy over 4 32-bit values */
+      spe_move(f, fbZ_reg, fbZS_reg);
+
+      /* convert Z from [0,1] to 16-bit ints */
+      spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+      spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+      /* No stencil */
+      break;
+
+   default:
+      ASSERT(0); /* invalid format */
+   }
+
+   /* If stencil is enabled, use the stencil-specific code
+    * generator to generate both the stencil and depth (if needed)
+    * tests.  Otherwise, if only depth is enabled, generate
+    * a quick depth test.  The test generators themselves will
+    * report back whether the depth/stencil buffer has to be
+    * written back.
+    */
+   if (dsa->stencil[0].enabled) {
+      /* This will perform the stencil and depth tests, and update
+       * the mask_reg, fbZ_reg, and fbS_reg as required by the
+       * tests.
+       */
+      ASSERT(fbS_reg >= 0);
+      spe_comment(f, 0, "Perform stencil test");
+
+      /* Note that fbZ_reg may not be set on entry, if stenciling
+       * is enabled but there's no Z-buffer.  The 
+       * gen_stencil_depth_test() function must ignore the
+       * fbZ_reg register if depth is not enabled.
+       */
+      write_depth_stencil = gen_stencil_depth_test(f, dsa, facing,
+                                                   mask_reg, fragZ_reg,
+                                                   fbZ_reg, fbS_reg);
+   }
+   else if (dsa->depth.enabled) {
+      int zmask_reg = spe_allocate_available_register(f);
+      ASSERT(fbZ_reg >= 0);
+      spe_comment(f, 0, "Perform depth test");
+      write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg,
+                                           fbZ_reg, zmask_reg);
+      spe_release_register(f, zmask_reg);
+   }
+   else {
+      write_depth_stencil = FALSE;
+   }
+
+   if (write_depth_stencil) {
+      /* Merge latest Z and Stencil values into fbZS_reg.
+       * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
+       * fbS_reg has four 8-bit Z values in bits [7..0].
+       */
+      spe_comment(f, 0, "Store quad's depth/stencil values in tile");
+      if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
+          zs_format == PIPE_FORMAT_X8Z24_UNORM) {
+         spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+         spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
+               zs_format == PIPE_FORMAT_Z24X8_UNORM) {
+         spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+         spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
+         spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
+         spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+      }
+      else if (zs_format == PIPE_FORMAT_S8_UNORM) {
+         ASSERT(0);   /* XXX to do */
+      }
+      else {
+         ASSERT(0); /* bad zs_format */
+      }
+
+      /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
+      spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
+   }
+
+   /* Don't need these any more */
+   spe_release_register(f, fbZS_reg);
+   spe_release_register(f, fbZ_reg);
+   spe_release_register(f, fbS_reg);
+   spe_release_register(f, zmask_reg);
+}
+
+
+
+/**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
  * framebuffer write) as specified by the current context state.
@@ -1782,7 +2024,9 @@ gen_stencil_depth_test(struct spe_function *f,
  *              the fragment ops appended.
  */
 void
-cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct spe_function *f)
+cell_gen_fragment_function(struct cell_context *cell,
+                           const uint facing,
+                           struct spe_function *f)
 {
    const struct pipe_depth_stencil_alpha_state *dsa = cell->depth_stencil;
    const struct pipe_blend_state *blend = cell->blend;
@@ -1809,12 +2053,13 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
    int quad_offset_reg;
 
    int fbRGBA_reg;  /**< framebuffer's RGBA colors for quad */
-   int fbZS_reg;    /**< framebuffer's combined z/stencil values for quad */
 
    if (cell->debug_flags & CELL_DEBUG_ASM) {
-      spe_print_code(f, true);
+      spe_print_code(f, TRUE);
       spe_indent(f, 8);
-      spe_comment(f, -4, facing == CELL_FACING_FRONT ? "Begin front-facing per-fragment ops": "Begin back-facing per-fragment ops");
+      spe_comment(f, -4, facing == CELL_FACING_FRONT
+                  ? "Begin front-facing per-fragment ops"
+                  : "Begin back-facing per-fragment ops");
    }
 
    spe_allocate_register(f, x_reg);
@@ -1830,7 +2075,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
-   fbZS_reg = spe_allocate_available_register(f);
 
    /* compute offset of quad from start of tile, in bytes */
    {
@@ -1855,177 +2099,14 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
-   /* If we need the stencil buffers (because one- or two-sided stencil is
-    * enabled) or the depth buffer (because the depth test is enabled),
-    * go grab them.  Note that if either one- or two-sided stencil is
-    * enabled, dsa->stencil[0].enabled will be true.
-    */
+   /* generate depth and/or stencil test code */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
-      const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
-      boolean write_depth_stencil;
-
-      /* We may or may not need to allocate a register for Z or stencil values */
-      boolean fbS_reg_set = false, fbZ_reg_set = false;
-      unsigned int fbS_reg, fbZ_reg = 0;
-
-      spe_comment(f, 0, "Fetching Z/stencil quad from tile");
-
-      /* fetch quad of depth/stencil values from tile at (x,y) */
-      /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
-      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
-      spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
-
-      /* From the Z/stencil buffer format, pull out the bits we need for
-       * Z and/or stencil.  We'll also convert the incoming fragment Z
-       * value in fragZ_reg from a floating point value in [0.0..1.0] to
-       * an unsigned integer value with the appropriate resolution.
-       * Note that even if depth or stencil is *not* enabled, if it's
-       * present in the buffer, we pull it out and put it back later;
-       * otherwise, we can inadvertently destroy the contents of
-       * buffers we're not supposed to touch (e.g., if the user is
-       * clearing the depth buffer but not the stencil buffer, a
-       * quad of constant depth is drawn over the surface; the stencil
-       * buffer must be maintained).
-       */
-      switch(zs_format) {
-
-         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
-         case PIPE_FORMAT_X8Z24_UNORM:
-            /* Pull out both Z and stencil */
-            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-            setup_optional_register(f, &fbS_reg_set, &fbS_reg);
-
-            /* four 24-bit Z values in the low-order bits */
-            spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
-
-            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-             * to a 24-bit unsigned integer
-             */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-
-            /* four 8-bit stencil values in the high-order bits */
-            spe_rotmi(f, fbS_reg, fbZS_reg, -24);
-         break;
-
-         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
-         case PIPE_FORMAT_Z24X8_UNORM:
-            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-            setup_optional_register(f, &fbS_reg_set, &fbS_reg);
-
-            /* shift by 8 to get the upper 24-bit values */
-            spe_rotmi(f, fbS_reg, fbZS_reg, -8);
-
-            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-             * to a 24-bit unsigned integer
-             */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-
-            /* 8-bit stencil in the low-order bits - mask them out */
-            spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
-         break;
-
-         case PIPE_FORMAT_Z32_UNORM:
-            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-            /* Copy over 4 32-bit values */
-            spe_move(f, fbZ_reg, fbZS_reg);
-
-            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-             * to a 32-bit unsigned integer
-             */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* No stencil, so can't do anything there */
-         break;
-
-         case PIPE_FORMAT_Z16_UNORM:
-            /* XXX Not sure this is correct, but it was here before, so we're
-             * going with it for now
-             */
-            setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
-            /* Copy over 4 32-bit values */
-            spe_move(f, fbZ_reg, fbZS_reg);
-
-            /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
-             * to a 16-bit unsigned integer
-             */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-            /* No stencil */
-
-         default:
-            ASSERT(0); /* invalid format */
-      }
-
-      /* If stencil is enabled, use the stencil-specific code
-       * generator to generate both the stencil and depth (if needed)
-       * tests.  Otherwise, if only depth is enabled, generate
-       * a quick depth test.  The test generators themselves will
-       * report back whether the depth/stencil buffer has to be
-       * written back.
-       */
-      if (dsa->stencil[0].enabled) {
-         /* This will perform the stencil and depth tests, and update
-          * the mask_reg, fbZ_reg, and fbS_reg as required by the
-          * tests.
-          */
-         ASSERT(fbS_reg_set);
-         spe_comment(f, 0, "Perform stencil test");
-
-         /* Note that fbZ_reg may not be set on entry, if stenciling
-          * is enabled but there's no Z-buffer.  The 
-          * gen_stencil_depth_test() function must ignore the
-          * fbZ_reg register if depth is not enabled.
-          */
-         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
-      }
-      else if (dsa->depth.enabled) {
-         int zmask_reg = spe_allocate_available_register(f);
-         ASSERT(fbZ_reg_set);
-         spe_comment(f, 0, "Perform depth test");
-         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
-         spe_release_register(f, zmask_reg);
-      }
-      else {
-         write_depth_stencil = false;
-      }
-
-      if (write_depth_stencil) {
-         /* Merge latest Z and Stencil values into fbZS_reg.
-          * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
-          * fbS_reg has four 8-bit Z values in bits [7..0].
-          */
-         spe_comment(f, 0, "Store quad's depth/stencil values in tile");
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
-         }
-         else if (zs_format == PIPE_FORMAT_S8_UNORM) {
-            ASSERT(0);   /* XXX to do */
-         }
-         else {
-            ASSERT(0); /* bad zs_format */
-         }
-
-         /* Store: memory[depth_tile_reg + quad_offset_reg] = fbZS */
-         spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
-      }
-
-      /* Don't need these any more */
-      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
-      release_optional_register(f, &fbS_reg_set, fbS_reg);
+      gen_depth_stencil(cell, dsa, f,
+                        facing,
+                        mask_reg,
+                        depth_tile_reg,
+                        quad_offset_reg,
+                        fragZ_reg);
    }
 
    /* Get framebuffer quad/colors.  We'll need these for blending,
@@ -2089,7 +2170,6 @@ cell_gen_fragment_function(struct cell_context *cell, const uint facing, struct
    spe_bi(f, SPE_REG_RA, 0, 0);  /* return from function call */
 
    spe_release_register(f, fbRGBA_reg);
-   spe_release_register(f, fbZS_reg);
    spe_release_register(f, quad_offset_reg);
 
    if (cell->debug_flags & CELL_DEBUG_ASM) {
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index 0a0af81f53f..39b85faeb86 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -133,7 +133,7 @@ lookup_fragment_ops(struct cell_context *cell)
        */
       ops = CALLOC_VARIANT_LENGTH_STRUCT(cell_command_fragment_ops, total_code_size);
       /* populate the new cell_command_fragment_ops object */
-      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      ops->opcode[0] = CELL_CMD_STATE_FRAGMENT_OPS;
       ops->total_code_size = total_code_size;
       ops->front_code_index = 0;
       memcpy(ops->code, spe_code_front.store, front_code_size);
@@ -178,10 +178,10 @@ static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
 {
-   uint64_t *dst = (uint64_t *) 
-       cell_batch_alloc(cell, ROUNDUP8(sizeof(uint64_t) + state_size));
+   uint32_t *dst = (uint32_t *) 
+       cell_batch_alloc16(cell, ROUNDUP16(sizeof(opcode_t) + state_size));
    *dst = cmd;
-   memcpy(dst + 1, state, state_size);
+   memcpy(dst + 4, state, state_size);
 }
 
 
@@ -195,9 +195,10 @@ cell_emit_state(struct cell_context *cell)
    if (cell->dirty & CELL_NEW_FRAMEBUFFER) {
       struct pipe_surface *cbuf = cell->framebuffer.cbufs[0];
       struct pipe_surface *zbuf = cell->framebuffer.zsbuf;
+      STATIC_ASSERT(sizeof(struct cell_command_framebuffer) % 16 == 0);
       struct cell_command_framebuffer *fb
-         = cell_batch_alloc(cell, sizeof(*fb));
-      fb->opcode = CELL_CMD_STATE_FRAMEBUFFER;
+         = cell_batch_alloc16(cell, sizeof(*fb));
+      fb->opcode[0] = CELL_CMD_STATE_FRAMEBUFFER;
       fb->color_start = cell->cbuf_map[0];
       fb->color_format = cbuf->format;
       fb->depth_start = cell->zsbuf_map;
@@ -211,17 +212,19 @@ cell_emit_state(struct cell_context *cell)
    }
 
    if (cell->dirty & (CELL_NEW_RASTERIZER)) {
+      STATIC_ASSERT(sizeof(struct cell_command_rasterizer) % 16 == 0);
       struct cell_command_rasterizer *rast =
-         cell_batch_alloc(cell, sizeof(*rast));
-      rast->opcode = CELL_CMD_STATE_RASTERIZER;
+         cell_batch_alloc16(cell, sizeof(*rast));
+      rast->opcode[0] = CELL_CMD_STATE_RASTERIZER;
       rast->rasterizer = *cell->rasterizer;
    }
 
    if (cell->dirty & (CELL_NEW_FS)) {
       /* Send new fragment program to SPUs */
+      STATIC_ASSERT(sizeof(struct cell_command_fragment_program) % 16 == 0);
       struct cell_command_fragment_program *fp
-            = cell_batch_alloc(cell, sizeof(*fp));
-      fp->opcode = CELL_CMD_STATE_FRAGMENT_PROGRAM;
+            = cell_batch_alloc16(cell, sizeof(*fp));
+      fp->opcode[0] = CELL_CMD_STATE_FRAGMENT_PROGRAM;
       fp->num_inst = cell->fs->code.num_inst;
       memcpy(&fp->code, cell->fs->code.store,
              SPU_MAX_FRAGMENT_PROGRAM_INSTS * SPE_INST_SIZE);
@@ -238,14 +241,14 @@ cell_emit_state(struct cell_context *cell)
       const uint shader = PIPE_SHADER_FRAGMENT;
       const uint num_const = cell->constants[shader].size / sizeof(float);
       uint i, j;
-      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
-      uint64_t *ibuf = (uint64_t *) buf;
+      float *buf = cell_batch_alloc16(cell, ROUNDUP16(32 + num_const * sizeof(float)));
+      uint32_t *ibuf = (uint32_t *) buf;
       const float *constants = pipe_buffer_map(cell->pipe.screen,
                                                cell->constants[shader].buffer,
                                                PIPE_BUFFER_USAGE_CPU_READ);
       ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
-      ibuf[1] = num_const;
-      j = 4;
+      ibuf[4] = num_const;
+      j = 8;
       for (i = 0; i < num_const; i++) {
          buf[j++] = constants[i];
       }
@@ -258,7 +261,7 @@ cell_emit_state(struct cell_context *cell)
       struct cell_command_fragment_ops *fops, *fops_cmd;
       /* Note that cell_command_fragment_ops is a variant-sized record */
       fops = lookup_fragment_ops(cell);
-      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd) + fops->total_code_size);
+      fops_cmd = cell_batch_alloc16(cell, ROUNDUP16(sizeof(*fops_cmd) + fops->total_code_size));
       memcpy(fops_cmd, fops, sizeof(*fops) + fops->total_code_size);
    }
 
@@ -267,9 +270,10 @@ cell_emit_state(struct cell_context *cell)
       for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
          if (cell->dirty_samplers & (1 << i)) {
             if (cell->sampler[i]) {
+               STATIC_ASSERT(sizeof(struct cell_command_sampler) % 16 == 0);
                struct cell_command_sampler *sampler
-                  = cell_batch_alloc(cell, sizeof(*sampler));
-               sampler->opcode = CELL_CMD_STATE_SAMPLER;
+                  = cell_batch_alloc16(cell, sizeof(*sampler));
+               sampler->opcode[0] = CELL_CMD_STATE_SAMPLER;
                sampler->unit = i;
                sampler->state = *cell->sampler[i];
             }
@@ -282,9 +286,10 @@ cell_emit_state(struct cell_context *cell)
       uint i;
       for (i = 0;i < CELL_MAX_SAMPLERS; i++) {
          if (cell->dirty_textures & (1 << i)) {
+            STATIC_ASSERT(sizeof(struct cell_command_texture) % 16 == 0);
             struct cell_command_texture *texture
-               =  cell_batch_alloc(cell, sizeof(*texture));
-            texture->opcode = CELL_CMD_STATE_TEXTURE;
+               =  (struct cell_command_texture *)cell_batch_alloc16(cell, sizeof(*texture));
+            texture->opcode[0] = CELL_CMD_STATE_TEXTURE;
             texture->unit = i;
             if (cell->texture[i]) {
                uint level;
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index 65ba51b6bb2..ab54e796895 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -116,10 +116,11 @@ cell_vbuf_release_vertices(struct vbuf_render *vbr, void *vertices,
 
    /* Tell SPUs they can release the vert buf */
    if (cvbr->vertex_buf != ~0U) {
+      STATIC_ASSERT(sizeof(struct cell_command_release_verts) % 16 == 0);
       struct cell_command_release_verts *release
          = (struct cell_command_release_verts *)
-         cell_batch_alloc(cell, sizeof(struct cell_command_release_verts));
-      release->opcode = CELL_CMD_RELEASE_VERTS;
+         cell_batch_alloc16(cell, sizeof(struct cell_command_release_verts));
+      release->opcode[0] = CELL_CMD_RELEASE_VERTS;
       release->vertex_buf = cvbr->vertex_buf;
    }
 
@@ -210,15 +211,16 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
    /* build/insert batch RENDER command */
    {
-      const uint index_bytes = ROUNDUP8(nr_indices * 2);
-      const uint vertex_bytes = nr_vertices * 4 * cell->vertex_info.size;
+      const uint index_bytes = ROUNDUP16(nr_indices * 2);
+      const uint vertex_bytes = ROUNDUP16(nr_vertices * 4 * cell->vertex_info.size);
+      STATIC_ASSERT(sizeof(struct cell_command_render) % 16 == 0);
       const uint batch_size = sizeof(struct cell_command_render) + index_bytes;
 
       struct cell_command_render *render
          = (struct cell_command_render *)
-         cell_batch_alloc(cell, batch_size);
+         cell_batch_alloc16(cell, batch_size);
 
-      render->opcode = CELL_CMD_RENDER;
+      render->opcode[0] = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
 
       render->num_indexes = nr_indices;
@@ -236,7 +238,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
           min_index == 0 &&
           vertex_bytes + 16 <= cell_batch_free_space(cell)) {
          /* vertex data inlined, after indices, at 16-byte boundary */
-         void *dst = cell_batch_alloc_aligned(cell, vertex_bytes, 16);
+         void *dst = cell_batch_alloc16(cell, vertex_bytes);
          memcpy(dst, vertices, vertex_bytes);
          render->inline_verts = TRUE;
          render->vertex_buf = ~0;
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index 8500d19754e..5c0179d9546 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -292,10 +292,10 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 
 
 static uint
-cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+cmd_state_fs_constants(const qword *buffer, uint pos)
 {
-   const uint num_const = buffer[pos + 1];
-   const float *constants = (const float *) &buffer[pos + 2];
+   const uint num_const = spu_extract((vector unsigned int)buffer[pos+1], 0);
+   const float *constants = (const float *) &buffer[pos+2];
    uint i;
 
    D_PRINTF(CELL_DEBUG_CMD, "CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
@@ -306,8 +306,8 @@ cmd_state_fs_constants(const uint64_t *buffer, uint pos)
       spu.constants[i] = spu_splats(constants[i]);
    }
 
-   /* return new buffer pos (in 8-byte words) */
-   return pos + 2 + num_const / 2;
+   /* return new buffer pos (in 16-byte words) */
+   return pos + 2 + (ROUNDUP16(num_const * sizeof(float)) / 16);
 }
 
 
@@ -547,8 +547,8 @@ cmd_batch(uint opcode)
 {
    const uint buf = (opcode >> 8) & 0xff;
    uint size = (opcode >> 16);
-   uint64_t buffer[CELL_BUFFER_SIZE / 8] ALIGN16_ATTRIB;
-   const unsigned usize = size / sizeof(buffer[0]);
+   qword buffer[CELL_BUFFER_SIZE / 16] ALIGN16_ATTRIB;
+   const unsigned usize = ROUNDUP16(size) / sizeof(buffer[0]);
    uint pos;
 
    D_PRINTF(CELL_DEBUG_CMD, "BATCH buffer %u, len %u, from %p\n",
@@ -578,7 +578,7 @@ cmd_batch(uint opcode)
     * Loop over commands in the batch buffer
     */
    for (pos = 0; pos < usize; /* no incr */) {
-      switch (buffer[pos]) {
+      switch (si_to_uint(buffer[pos])) {
       /*
        * rendering commands
        */
@@ -587,7 +587,7 @@ cmd_batch(uint opcode)
             struct cell_command_clear_surface *clr
                = (struct cell_command_clear_surface *) &buffer[pos];
             cmd_clear_surface(clr);
-            pos += sizeof(*clr) / 8;
+            pos += sizeof(*clr) / 16;
          }
          break;
       case CELL_CMD_RENDER:
@@ -596,7 +596,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_render *) &buffer[pos];
             uint pos_incr;
             cmd_render(render, &pos_incr);
-            pos += pos_incr;
+            pos += ((pos_incr+1)&~1) / 2; // should 'fix' cmd_render return
          }
          break;
       /*
@@ -607,7 +607,7 @@ cmd_batch(uint opcode)
             struct cell_command_framebuffer *fb
                = (struct cell_command_framebuffer *) &buffer[pos];
             cmd_state_framebuffer(fb);
-            pos += sizeof(*fb) / 8;
+            pos += sizeof(*fb) / 16;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_OPS:
@@ -616,7 +616,7 @@ cmd_batch(uint opcode)
                = (struct cell_command_fragment_ops *) &buffer[pos];
             cmd_state_fragment_ops(fops);
             /* This is a variant-sized command */
-            pos += (sizeof(*fops) + fops->total_code_size)/ 8;
+            pos += ROUNDUP16(sizeof(*fops) + fops->total_code_size) / 16;
          }
          break;
       case CELL_CMD_STATE_FRAGMENT_PROGRAM:
@@ -624,7 +624,7 @@ cmd_batch(uint opcode)
             struct cell_command_fragment_program *fp
                = (struct cell_command_fragment_program *) &buffer[pos];
             cmd_state_fragment_program(fp);
-            pos += sizeof(*fp) / 8;
+            pos += sizeof(*fp) / 16;
          }
          break;
       case CELL_CMD_STATE_FS_CONSTANTS:
@@ -635,7 +635,7 @@ cmd_batch(uint opcode)
             struct cell_command_rasterizer *rast =
                (struct cell_command_rasterizer *) &buffer[pos];
             spu.rasterizer = rast->rasterizer;
-            pos += sizeof(*rast) / 8;
+            pos += sizeof(*rast) / 16;
          }
          break;
       case CELL_CMD_STATE_SAMPLER:
@@ -643,7 +643,7 @@ cmd_batch(uint opcode)
             struct cell_command_sampler *sampler
                = (struct cell_command_sampler *) &buffer[pos];
             cmd_state_sampler(sampler);
-            pos += sizeof(*sampler) / 8;
+            pos += sizeof(*sampler) / 16;
          }
          break;
       case CELL_CMD_STATE_TEXTURE:
@@ -651,37 +651,37 @@ cmd_batch(uint opcode)
             struct cell_command_texture *texture
                = (struct cell_command_texture *) &buffer[pos];
             cmd_state_texture(texture);
-            pos += sizeof(*texture) / 8;
+            pos += sizeof(*texture) / 16;
          }
          break;
       case CELL_CMD_STATE_VERTEX_INFO:
          cmd_state_vertex_info((struct vertex_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct vertex_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct vertex_info)) / 16;
          break;
       case CELL_CMD_STATE_VIEWPORT:
          (void) memcpy(& draw.viewport, &buffer[pos+1],
                        sizeof(struct pipe_viewport_state));
-         pos += (1 + ROUNDUP8(sizeof(struct pipe_viewport_state)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct pipe_viewport_state)) / 16;
          break;
       case CELL_CMD_STATE_UNIFORMS:
-         draw.constants = (const float (*)[4]) (uintptr_t) buffer[pos + 1];
+         draw.constants = (const float (*)[4]) (uintptr_t)spu_extract((vector unsigned int)buffer[pos+1],0);
          pos += 2;
          break;
       case CELL_CMD_STATE_VS_ARRAY_INFO:
          cmd_state_vs_array_info((struct cell_array_info *) &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_array_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_array_info)) / 16;
          break;
       case CELL_CMD_STATE_BIND_VS:
 #if 0
          spu_bind_vertex_shader(&draw,
                                 (struct cell_shader_info *) &buffer[pos+1]);
 #endif
-         pos += (1 + ROUNDUP8(sizeof(struct cell_shader_info)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_shader_info)) / 16;
          break;
       case CELL_CMD_STATE_ATTRIB_FETCH:
          cmd_state_attrib_fetch((struct cell_attribute_fetch_code *)
                                 &buffer[pos+1]);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_attribute_fetch_code)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_attribute_fetch_code)) / 16;
          break;
       /*
        * misc commands
@@ -695,7 +695,7 @@ cmd_batch(uint opcode)
             struct cell_command_fence *fence_cmd =
                (struct cell_command_fence *) &buffer[pos];
             cmd_fence(fence_cmd);
-            pos += sizeof(*fence_cmd) / 8;
+            pos += sizeof(*fence_cmd) / 16;
          }
          break;
       case CELL_CMD_RELEASE_VERTS:
@@ -703,7 +703,7 @@ cmd_batch(uint opcode)
             struct cell_command_release_verts *release
                = (struct cell_command_release_verts *) &buffer[pos];
             cmd_release_verts(release);
-            pos += sizeof(*release) / 8;
+            pos += sizeof(*release) / 16;
          }
          break;
       case CELL_CMD_FLUSH_BUFFER_RANGE: {
@@ -711,11 +711,11 @@ cmd_batch(uint opcode)
 	     &buffer[pos+1];
 
 	 spu_dcache_mark_dirty((unsigned) br->base, br->size);
-         pos += (1 + ROUNDUP8(sizeof(struct cell_buffer_range)) / 8);
+         pos += 1 + ROUNDUP16(sizeof(struct cell_buffer_range)) / 16;
 	 break;
       }
       default:
-         printf("SPU %u: bad opcode: 0x%llx\n", spu.init.id, buffer[pos]);
+         printf("SPU %u: bad opcode: 0x%x\n", spu.init.id, si_to_uint(buffer[pos]));
          ASSERT(0);
          break;
       }
diff --git a/src/gallium/drivers/cell/spu/spu_shuffle.h b/src/gallium/drivers/cell/spu/spu_shuffle.h
index 7cbdb814d28..74f2a0b6d2e 100644
--- a/src/gallium/drivers/cell/spu/spu_shuffle.h
+++ b/src/gallium/drivers/cell/spu/spu_shuffle.h
@@ -171,7 +171,7 @@
    SHUFFLE_PATTERN_16_##M##__, \
    SHUFFLE_PATTERN_16_##N##__, \
    SHUFFLE_PATTERN_16_##O##__, \
-   SHUFFLE_PATTERN_16_##P
+   SHUFFLE_PATTERN_16_##P##__
 
 #define SHUFFLE16(A, B, C, D, E, F, G, H, I, J, K, L, M, N, O, P) \
    ((const vector unsigned char){ \
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 322be1252e9..0d9fcb99970 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -57,7 +57,7 @@ struct vertex_header {
 
 /* XXX fix this */
 #undef CEILF
-#define CEILF(X) ((float) (int) ((X) + 0.99999))
+#define CEILF(X) ((float) (int) ((X) + 0.99999f))
 
 
 #define QUAD_TOP_LEFT     0
diff --git a/src/gallium/drivers/nv50/nv50_context.h b/src/gallium/drivers/nv50/nv50_context.h
index 0958bba334a..061a4c064b6 100644
--- a/src/gallium/drivers/nv50/nv50_context.h
+++ b/src/gallium/drivers/nv50/nv50_context.h
@@ -67,11 +67,18 @@ struct nv50_rasterizer_stateobj {
 	struct nouveau_stateobj *so;
 };
 
+struct nv50_miptree_level {
+	struct pipe_buffer **image;
+	int *image_offset;
+	unsigned image_dirty_cpu[512/32];
+	unsigned image_dirty_gpu[512/32];
+};
+
 struct nv50_miptree {
 	struct pipe_texture base;
 	struct pipe_buffer *buffer;
 
-	int *image_offset;
+	struct nv50_miptree_level level[PIPE_MAX_TEXTURE_LEVELS];
 	int image_nr;
 	int total_size;
 };
@@ -84,7 +91,6 @@ nv50_miptree(struct pipe_texture *pt)
 
 struct nv50_surface {
 	struct pipe_surface base;
-	struct pipe_buffer *untiled;
 };
 
 static INLINE struct nv50_surface *
@@ -186,4 +192,8 @@ extern boolean nv50_state_validate(struct nv50_context *nv50);
 /* nv50_tex.c */
 extern void nv50_tex_validate(struct nv50_context *);
 
+/* nv50_miptree.c */
+extern void nv50_miptree_sync(struct pipe_screen *, struct nv50_miptree *,
+			      unsigned level, unsigned image);
+
 #endif
diff --git a/src/gallium/drivers/nv50/nv50_miptree.c b/src/gallium/drivers/nv50/nv50_miptree.c
index 24973712324..63a23d06b89 100644
--- a/src/gallium/drivers/nv50/nv50_miptree.c
+++ b/src/gallium/drivers/nv50/nv50_miptree.c
@@ -27,14 +27,16 @@
 #include "nv50_context.h"
 
 static struct pipe_texture *
-nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
+nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *tmp)
 {
 	struct pipe_winsys *ws = pscreen->winsys;
 	struct nv50_miptree *mt = CALLOC_STRUCT(nv50_miptree);
-	unsigned usage, width = pt->width[0], height = pt->height[0];
-	int i;
+	struct pipe_texture *pt = &mt->base;
+	unsigned usage, width = tmp->width[0], height = tmp->height[0];
+	unsigned depth = tmp->depth[0];
+	int i, l;
 
-	mt->base = *pt;
+	mt->base = *tmp;
 	mt->base.refcount = 1;
 	mt->base.screen = pscreen;
 
@@ -59,17 +61,38 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
 		mt->image_nr = 1;
 		break;
 	}
-	mt->image_offset = CALLOC(mt->image_nr, sizeof(int));
+
+	for (l = 0; l <= pt->last_level; l++) {
+		struct nv50_miptree_level *lvl = &mt->level[l];
+
+		pt->width[l] = width;
+		pt->height[l] = height;
+		pt->depth[l] = depth;
+		pt->nblocksx[l] = pf_get_nblocksx(&pt->block, width);
+		pt->nblocksy[l] = pf_get_nblocksy(&pt->block, height);
+
+		lvl->image_offset = CALLOC(mt->image_nr, sizeof(int));
+		lvl->image = CALLOC(mt->image_nr, sizeof(struct pipe_buffer *));
+
+		width = MAX2(1, width >> 1);
+		height = MAX2(1, height >> 1);
+		depth = MAX2(1, depth >> 1);
+	}
 
 	for (i = 0; i < mt->image_nr; i++) {
-		int image_size;
+		for (l = 0; l <= pt->last_level; l++) {
+			struct nv50_miptree_level *lvl = &mt->level[l];
+			int size;
+
+			size  = align(pt->width[l], 8) * pt->block.size;
+			size  = align(size, 64);
+			size *= align(pt->height[l], 8) * pt->block.size;
 
-		image_size  = align(width, 8) * pt->block.size;
-		image_size  = align(image_size, 64);
-		image_size *= align(height, 8) * pt->block.size;
+			lvl->image[i] = ws->buffer_create(ws, 256, 0, size);
+			lvl->image_offset[i] = mt->total_size;
 
-		mt->image_offset[i] = mt->total_size;
-		mt->total_size += image_size;
+			mt->total_size += size;
+		}
 	}
 
 	mt->buffer = ws->buffer_create(ws, 256, usage, mt->total_size);
@@ -81,6 +104,24 @@ nv50_miptree_create(struct pipe_screen *pscreen, const struct pipe_texture *pt)
 	return &mt->base;
 }
 
+static INLINE void
+mark_dirty(uint32_t *flags, unsigned image)
+{
+	flags[image / 32] |= (1 << (image % 32));
+}
+
+static INLINE void
+mark_clean(uint32_t *flags, unsigned image)
+{
+	flags[image / 32] &= ~(1 << (image % 32));
+}
+
+static INLINE int
+is_dirty(uint32_t *flags, unsigned image)
+{
+	return !!(flags[image / 32] & (1 << (image % 32)));
+}
+
 static void
 nv50_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt)
 {
@@ -96,12 +137,86 @@ nv50_miptree_release(struct pipe_screen *pscreen, struct pipe_texture **ppt)
 	}
 }
 
+void
+nv50_miptree_sync(struct pipe_screen *pscreen, struct nv50_miptree *mt,
+		  unsigned level, unsigned image)
+{
+	struct nouveau_winsys *nvws = nv50_screen(pscreen)->nvws;
+	struct nv50_miptree_level *lvl = &mt->level[level];
+	struct pipe_surface *dst, *src;
+	unsigned face = 0, zslice = 0;
+
+	if (!is_dirty(lvl->image_dirty_cpu, image))
+		return;
+
+	if (mt->base.target == PIPE_TEXTURE_CUBE)
+		face = image;
+	else
+	if (mt->base.target == PIPE_TEXTURE_3D)
+		zslice = image;
+
+	/* Mark as clean already - so we don't continually call this function
+	 * trying to get a GPU_WRITE pipe_surface!
+	 */
+	mark_clean(lvl->image_dirty_cpu, image);
+
+	/* Pretend we're doing CPU access so we get the backing pipe_surface
+	 * and not a view into the larger miptree.
+	 */
+	src = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice,
+				       PIPE_BUFFER_USAGE_CPU_READ);
+
+	/* Pretend we're only reading with the GPU so surface doesn't get marked
+	 * as dirtied by the GPU.
+	 */
+	dst = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice,
+				       PIPE_BUFFER_USAGE_GPU_READ);
+
+	nvws->surface_copy(nvws, dst, 0, 0, src, 0, 0, dst->width, dst->height);
+
+	pscreen->tex_surface_release(pscreen, &dst);
+	pscreen->tex_surface_release(pscreen, &src);
+}
+
+/* The reverse of the above */
+void
+nv50_miptree_sync_cpu(struct pipe_screen *pscreen, struct nv50_miptree *mt,
+		      unsigned level, unsigned image)
+{
+	struct nouveau_winsys *nvws = nv50_screen(pscreen)->nvws;
+	struct nv50_miptree_level *lvl = &mt->level[level];
+	struct pipe_surface *dst, *src;
+	unsigned face = 0, zslice = 0;
+
+	if (!is_dirty(lvl->image_dirty_gpu, image))
+		return;
+
+	if (mt->base.target == PIPE_TEXTURE_CUBE)
+		face = image;
+	else
+	if (mt->base.target == PIPE_TEXTURE_3D)
+		zslice = image;
+
+	mark_clean(lvl->image_dirty_gpu, image);
+
+	src = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice,
+				       PIPE_BUFFER_USAGE_GPU_READ);
+	dst = pscreen->get_tex_surface(pscreen, &mt->base, face, level, zslice,
+				       PIPE_BUFFER_USAGE_CPU_READ);
+
+	nvws->surface_copy(nvws, dst, 0, 0, src, 0, 0, dst->width, dst->height);
+
+	pscreen->tex_surface_release(pscreen, &dst);
+	pscreen->tex_surface_release(pscreen, &src);
+}
+
 static struct pipe_surface *
 nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 			 unsigned face, unsigned level, unsigned zslice,
 			 unsigned flags)
 {
 	struct nv50_miptree *mt = nv50_miptree(pt);
+	struct nv50_miptree_level *lvl = &mt->level[level];
 	struct nv50_surface *s;
 	struct pipe_surface *ps;
 	int img;
@@ -128,12 +243,29 @@ nv50_miptree_surface_new(struct pipe_screen *pscreen, struct pipe_texture *pt,
 	ps->nblocksx = pt->nblocksx[level];
 	ps->nblocksy = pt->nblocksy[level];
 	ps->stride = ps->width * ps->block.size;
-	ps->offset = mt->image_offset[img];
 	ps->usage = flags;
 	ps->status = PIPE_SURFACE_STATUS_DEFINED;
 
-	pipe_texture_reference(&ps->texture, pt);
-	pipe_buffer_reference(pscreen, &ps->buffer, mt->buffer);
+	if (flags & PIPE_BUFFER_USAGE_CPU_READ_WRITE) {
+		assert(!(flags & PIPE_BUFFER_USAGE_GPU_READ_WRITE));
+		nv50_miptree_sync_cpu(pscreen, mt, level, img);
+
+		ps->offset = 0;
+		pipe_texture_reference(&ps->texture, pt);
+		pipe_buffer_reference(pscreen, &ps->buffer, lvl->image[img]);
+
+		if (flags & PIPE_BUFFER_USAGE_CPU_WRITE)
+			mark_dirty(lvl->image_dirty_cpu, img);
+	} else {
+		nv50_miptree_sync(pscreen, mt, level, img);
+
+		ps->offset = lvl->image_offset[img];
+		pipe_texture_reference(&ps->texture, pt);
+		pipe_buffer_reference(pscreen, &ps->buffer, mt->buffer);
+
+		if (flags & PIPE_BUFFER_USAGE_GPU_WRITE)
+			mark_dirty(lvl->image_dirty_gpu, img);
+	}
 
 	return ps;
 }
diff --git a/src/gallium/drivers/nv50/nv50_program.c b/src/gallium/drivers/nv50/nv50_program.c
index d66e1d0949d..7686f746eb2 100644
--- a/src/gallium/drivers/nv50/nv50_program.c
+++ b/src/gallium/drivers/nv50/nv50_program.c
@@ -32,7 +32,7 @@
 #include "nv50_context.h"
 
 #define NV50_SU_MAX_TEMP 64
-#define NV50_PROGRAM_DUMP
+//#define NV50_PROGRAM_DUMP
 
 /* ARL - gallium craps itself on progs/vp/arl.txt
  *
@@ -841,6 +841,28 @@ emit_neg(struct nv50_pc *pc, struct nv50_reg *dst, struct nv50_reg *src)
 	emit(pc, e);
 }
 
+static void
+emit_kil(struct nv50_pc *pc, struct nv50_reg *src)
+{
+	struct nv50_program_exec *e;
+	const int r_pred = 1;
+
+	/* Sets predicate reg ? */
+	e = exec(pc);
+	e->inst[0] = 0xa00001fd;
+	e->inst[1] = 0xc4014788;
+	set_src_0(pc, src, e);
+	set_pred_wr(pc, 1, r_pred, e);
+	emit(pc, e);
+
+	/* This is probably KILP */
+	e = exec(pc);
+	e->inst[0] = 0x000001fe;
+	set_long(pc, e);
+	set_pred(pc, 1 /* LT? */, r_pred, e);
+	emit(pc, e);
+}
+
 static struct nv50_reg *
 tgsi_dst(struct nv50_pc *pc, int c, const struct tgsi_full_dst_register *dst)
 {
@@ -1069,6 +1091,12 @@ nv50_program_tx_insn(struct nv50_pc *pc, const union tgsi_full_token *tok)
 		}
 		free_temp(pc, temp);
 		break;
+	case TGSI_OPCODE_KIL:
+		emit_kil(pc, src[0][0]);
+		emit_kil(pc, src[0][1]);
+		emit_kil(pc, src[0][2]);
+		emit_kil(pc, src[0][3]);
+		break;
 	case TGSI_OPCODE_LIT:
 		emit_lit(pc, &dst[0], mask, &src[0][0]);
 		break;
@@ -1602,13 +1630,19 @@ nv50_program_validate_code(struct nv50_context *nv50, struct nv50_program *p)
 	if (!upload)
 		return;
 
+#ifdef NV50_PROGRAM_DUMP
 	NOUVEAU_ERR("-------\n");
 	up = ptr = MALLOC(p->exec_size * 4);
 	for (e = p->exec_head; e; e = e->next) {
 		NOUVEAU_ERR("0x%08x\n", e->inst[0]);
 		if (is_long(e))
 			NOUVEAU_ERR("0x%08x\n", e->inst[1]);
+	}
 
+#endif
+
+	up = ptr = MALLOC(p->exec_size * 4);
+	for (e = p->exec_head; e; e = e->next) {
 		*(ptr++) = e->inst[0];
 		if (is_long(e))
 			*(ptr++) = e->inst[1];
@@ -1705,7 +1739,7 @@ nv50_fragprog_validate(struct nv50_context *nv50)
 	so_reloc (so, p->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_RD |
 		  NOUVEAU_BO_LOW, 0, 0);
 	so_method(so, tesla, 0x1904, 4);
-	so_data  (so, 0x01040404); /* p: 0x01000404 */
+	so_data  (so, 0x00040404); /* p: 0x01000404 */
 	so_data  (so, 0x00000004);
 	so_data  (so, 0x00000000);
 	so_data  (so, 0x00000000);
diff --git a/src/gallium/drivers/nv50/nv50_query.c b/src/gallium/drivers/nv50/nv50_query.c
index 777e77906d5..b923c820eba 100644
--- a/src/gallium/drivers/nv50/nv50_query.c
+++ b/src/gallium/drivers/nv50/nv50_query.c
@@ -21,41 +21,101 @@
  */
 
 #include "pipe/p_context.h"
+#include "pipe/p_inlines.h"
 
 #include "nv50_context.h"
 
+struct nv50_query {
+	struct pipe_buffer *buffer;
+	unsigned type;
+	boolean ready;
+	uint64_t result;
+};
+
+static INLINE struct nv50_query *
+nv50_query(struct pipe_query *pipe)
+{
+	return (struct nv50_query *)pipe;
+}
+
 static struct pipe_query *
 nv50_query_create(struct pipe_context *pipe, unsigned type)
 {
-	NOUVEAU_ERR("unimplemented\n");
-	return NULL;
+	struct pipe_winsys *ws = pipe->winsys;
+	struct nv50_query *q = CALLOC_STRUCT(nv50_query);
+
+	assert (q->type == PIPE_QUERY_OCCLUSION_COUNTER);
+	q->type = type;
+
+	q->buffer = ws->buffer_create(ws, 256, 0, 16);
+	if (!q->buffer) {
+		FREE(q);
+		return NULL;
+	}
+
+	return (struct pipe_query *)q;
 }
 
 static void
-nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *q)
+nv50_query_destroy(struct pipe_context *pipe, struct pipe_query *pq)
 {
-	NOUVEAU_ERR("unimplemented\n");
+	struct nv50_query *q = nv50_query(pq);
+
+	if (q) {
+		pipe_buffer_reference(pipe, &q->buffer, NULL);
+		FREE(q);
+	}
 }
 
 static void
-nv50_query_begin(struct pipe_context *pipe, struct pipe_query *q)
+nv50_query_begin(struct pipe_context *pipe, struct pipe_query *pq)
 {
-	NOUVEAU_ERR("unimplemented\n");
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nv50_query *q = nv50_query(pq);
+
+	BEGIN_RING(tesla, 0x1530, 1);
+	OUT_RING  (1);
+	BEGIN_RING(tesla, 0x1514, 1);
+	OUT_RING  (1);
+
+	q->ready = FALSE;
 }
 
 static void
-nv50_query_end(struct pipe_context *pipe, struct pipe_query *q)
+nv50_query_end(struct pipe_context *pipe, struct pipe_query *pq)
 {
-	NOUVEAU_ERR("unimplemented\n");
+	struct nv50_context *nv50 = nv50_context(pipe);
+	struct nv50_query *q = nv50_query(pq);
+
+	BEGIN_RING(tesla, 0x1b00, 4);
+	OUT_RELOCh(q->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RELOCl(q->buffer, 0, NOUVEAU_BO_VRAM | NOUVEAU_BO_WR);
+	OUT_RING  (0x00000000);
+	OUT_RING  (0x0100f002);
+	FIRE_RING (NULL);
 }
 
 static boolean
-nv50_query_result(struct pipe_context *pipe, struct pipe_query *q,
+nv50_query_result(struct pipe_context *pipe, struct pipe_query *pq,
 		  boolean wait, uint64_t *result)
 {
-	NOUVEAU_ERR("unimplemented\n");
-	*result = 0xdeadcafe;
-	return TRUE;
+	struct pipe_winsys *ws = pipe->winsys;
+	struct nv50_query *q = nv50_query(pq);
+
+	/*XXX: Want to be able to return FALSE here instead of blocking
+	 *     until the result is available..
+	 */
+
+	if (!q->ready) {
+		uint32_t *map = ws->buffer_map(ws, q->buffer,
+					       PIPE_BUFFER_USAGE_CPU_READ);
+		q->result = map[1];
+		q->ready = TRUE;
+		ws->buffer_unmap(ws, q->buffer);
+	}
+
+	*result = q->result;
+	return q->ready;
 }
 
 void
diff --git a/src/gallium/drivers/nv50/nv50_screen.c b/src/gallium/drivers/nv50/nv50_screen.c
index 52f6a406882..ef46233f839 100644
--- a/src/gallium/drivers/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nv50/nv50_screen.c
@@ -57,6 +57,10 @@ nv50_screen_is_format_supported(struct pipe_screen *pscreen,
 		case PIPE_FORMAT_A8_UNORM:
 		case PIPE_FORMAT_I8_UNORM:
 		case PIPE_FORMAT_A8L8_UNORM:
+		case PIPE_FORMAT_DXT1_RGB:
+		case PIPE_FORMAT_DXT1_RGBA:
+		case PIPE_FORMAT_DXT3_RGBA:
+		case PIPE_FORMAT_DXT5_RGBA:
 			return TRUE;
 		default:
 			break;
@@ -90,23 +94,23 @@ nv50_screen_get_param(struct pipe_screen *pscreen, int param)
 	case PIPE_CAP_MAX_TEXTURE_IMAGE_UNITS:
 		return 32;
 	case PIPE_CAP_NPOT_TEXTURES:
-		return 0;
+		return 1;
 	case PIPE_CAP_TWO_SIDED_STENCIL:
 		return 1;
 	case PIPE_CAP_GLSL:
 		return 0;
 	case PIPE_CAP_S3TC:
-		return 0;
+		return 1;
 	case PIPE_CAP_ANISOTROPIC_FILTER:
-		return 0;
+		return 1;
 	case PIPE_CAP_POINT_SPRITE:
 		return 0;
 	case PIPE_CAP_MAX_RENDER_TARGETS:
 		return 8;
 	case PIPE_CAP_OCCLUSION_QUERY:
-		return 0;
+		return 1;
 	case PIPE_CAP_TEXTURE_SHADOW_MAP:
-		return 0;
+		return 1;
 	case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
 		return 13;
 	case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
diff --git a/src/gallium/drivers/nv50/nv50_state.c b/src/gallium/drivers/nv50/nv50_state.c
index 95f9d408b5e..38c1d938b8e 100644
--- a/src/gallium/drivers/nv50/nv50_state.c
+++ b/src/gallium/drivers/nv50/nv50_state.c
@@ -175,6 +175,32 @@ nv50_sampler_state_create(struct pipe_context *pipe,
 		break;
 	}
 
+	if (cso->max_anisotropy >= 16.0)
+		tsc[0] |= (7 << 20);
+	else
+	if (cso->max_anisotropy >= 12.0)
+		tsc[0] |= (6 << 20);
+	else
+	if (cso->max_anisotropy >= 10.0)
+		tsc[0] |= (5 << 20);
+	else
+	if (cso->max_anisotropy >= 8.0)
+		tsc[0] |= (4 << 20);
+	else
+	if (cso->max_anisotropy >= 6.0)
+		tsc[0] |= (3 << 20);
+	else
+	if (cso->max_anisotropy >= 4.0)
+		tsc[0] |= (2 << 20);
+	else
+	if (cso->max_anisotropy >= 2.0)
+		tsc[0] |= (1 << 20);
+
+	if (cso->compare_mode == PIPE_TEX_COMPARE_R_TO_TEXTURE) {
+		tsc[0] |= (1 << 8);
+		tsc[0] |= (nvgl_comparison_op(cso->compare_func) & 0x7);
+	}
+
 	return (void *)tsc;
 }
 
diff --git a/src/gallium/drivers/nv50/nv50_surface.c b/src/gallium/drivers/nv50/nv50_surface.c
index 5bf97d3a6bf..3f45a2fe186 100644
--- a/src/gallium/drivers/nv50/nv50_surface.c
+++ b/src/gallium/drivers/nv50/nv50_surface.c
@@ -63,48 +63,17 @@ static void *
 nv50_surface_map(struct pipe_screen *screen, struct pipe_surface *ps,
 		 unsigned flags )
 {
-	struct nouveau_winsys *nvws = nv50_screen(screen)->nvws;
 	struct pipe_winsys *ws = screen->winsys;
-	struct nv50_surface *s = nv50_surface(ps);
-	struct nv50_surface m = *s;
-	void *map;
 
-	if (!s->untiled) {
-		s->untiled = ws->buffer_create(ws, 0, 0, ps->buffer->size);
-
-		m.base.buffer = s->untiled;
-		nvws->surface_copy(nvws, &m.base, 0, 0, &s->base, 0, 0,
-					 ps->width, ps->height);
-	}
-
-	/* Map original tiled surface to disallow it being validated while
-	 * untiled mirror is mapped.
-	 */
-	ws->buffer_map(ws, ps->buffer, flags);
-
-	map = ws->buffer_map(ws, s->untiled, flags);
-	if (!map)
-		return NULL;
-
-	return map;
+	return ws->buffer_map(ws, ps->buffer, flags);
 }
 
 static void
 nv50_surface_unmap(struct pipe_screen *pscreen, struct pipe_surface *ps)
 {
-	struct nouveau_winsys *nvws = nv50_screen(pscreen)->nvws;
 	struct pipe_winsys *ws = pscreen->winsys;
-	struct nv50_surface *s = nv50_surface(ps);
-	struct nv50_surface m = *s;
 
-	ws->buffer_unmap(ws, s->untiled);
 	ws->buffer_unmap(ws, ps->buffer);
-
-	m.base.buffer = s->untiled;
-	nvws->surface_copy(nvws, &s->base, 0, 0, &m.base, 0, 0,
-				 ps->width, ps->height);
-
-	pipe_buffer_reference(pscreen, &s->untiled, NULL);
 }
 
 void
diff --git a/src/gallium/drivers/nv50/nv50_tex.c b/src/gallium/drivers/nv50/nv50_tex.c
index fde3c97c059..239407c92bb 100644
--- a/src/gallium/drivers/nv50/nv50_tex.c
+++ b/src/gallium/drivers/nv50/nv50_tex.c
@@ -85,6 +85,34 @@ nv50_tex_construct(struct nouveau_stateobj *so, struct nv50_miptree *mt)
 			    NV50TIC_0_0_MAPB_C0 | NV50TIC_0_0_TYPEB_UNORM |
 			    NV50TIC_0_0_FMT_8_8);
 		break;
+	case PIPE_FORMAT_DXT1_RGB:
+		so_data(so, NV50TIC_0_0_MAPA_ONE | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_DXT1);
+		break;
+	case PIPE_FORMAT_DXT1_RGBA:
+		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_DXT1);
+		break;
+	case PIPE_FORMAT_DXT3_RGBA:
+		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_DXT3);
+		break;
+	case PIPE_FORMAT_DXT5_RGBA:
+		so_data(so, NV50TIC_0_0_MAPA_C3 | NV50TIC_0_0_TYPEA_UNORM |
+			    NV50TIC_0_0_MAPR_C0 | NV50TIC_0_0_TYPER_UNORM |
+			    NV50TIC_0_0_MAPG_C1 | NV50TIC_0_0_TYPEG_UNORM |
+			    NV50TIC_0_0_MAPB_C2 | NV50TIC_0_0_TYPEB_UNORM |
+			    NV50TIC_0_0_FMT_DXT5);
+		break;
 	default:
 		return 1;
 	}
@@ -105,14 +133,23 @@ nv50_tex_validate(struct nv50_context *nv50)
 {
 	struct nouveau_grobj *tesla = nv50->screen->tesla;
 	struct nouveau_stateobj *so;
-	int i;
+	int unit, level, image;
 
 	so = so_new(nv50->miptree_nr * 8 + 3, nv50->miptree_nr * 2);
 	so_method(so, tesla, 0x0f00, 1);
 	so_data  (so, NV50_CB_TIC);
 	so_method(so, tesla, 0x40000f04, nv50->miptree_nr * 8);
-	for (i = 0; i < nv50->miptree_nr; i++) {
-		if (nv50_tex_construct(so, nv50->miptree[i])) {
+	for (unit = 0; unit < nv50->miptree_nr; unit++) {
+		struct nv50_miptree *mt = nv50->miptree[unit];
+
+		for (level = 0; level <= mt->base.last_level; level++) {
+			for (image = 0; image < mt->image_nr; image++) {
+				nv50_miptree_sync(&nv50->screen->pipe, mt,
+						  level, image);
+			}
+		}
+
+		if (nv50_tex_construct(so, mt)) {
 			NOUVEAU_ERR("failed tex validate\n");
 			so_ref(NULL, &so);
 			return;
diff --git a/src/gallium/drivers/nv50/nv50_texture.h b/src/gallium/drivers/nv50/nv50_texture.h
index 6861d67b4d5..aca622c73b1 100644
--- a/src/gallium/drivers/nv50/nv50_texture.h
+++ b/src/gallium/drivers/nv50/nv50_texture.h
@@ -50,6 +50,9 @@
 #define NV50TIC_0_0_FMT_5_6_5                                     0x00000015
 #define NV50TIC_0_0_FMT_8_8                                       0x00000018
 #define NV50TIC_0_0_FMT_8                                         0x0000001d
+#define NV50TIC_0_0_FMT_DXT1                                      0x00000024
+#define NV50TIC_0_0_FMT_DXT3                                      0x00000025
+#define NV50TIC_0_0_FMT_DXT5                                      0x00000026
 
 #define NV50TIC_0_1_OFFSET_LOW_MASK                               0xffffffff
 #define NV50TIC_0_1_OFFSET_LOW_SHIFT                                       0
diff --git a/src/gallium/drivers/nv50/nv50_vbo.c b/src/gallium/drivers/nv50/nv50_vbo.c
index 584336682e4..435dc9777da 100644
--- a/src/gallium/drivers/nv50/nv50_vbo.c
+++ b/src/gallium/drivers/nv50/nv50_vbo.c
@@ -60,6 +60,10 @@ nv50_draw_arrays(struct pipe_context *pipe, unsigned mode, unsigned start,
 	OUT_RING  (0);
 	BEGIN_RING(tesla, 0x142c, 1);
 	OUT_RING  (0);
+	BEGIN_RING(tesla, 0x1440, 1);
+	OUT_RING  (0);
+	BEGIN_RING(tesla, 0x1334, 1);
+	OUT_RING  (0);
 
 	BEGIN_RING(tesla, NV50TCL_VERTEX_BEGIN, 1);
 	OUT_RING  (nv50_prim(mode));