155 files changed, 6383 insertions, 4225 deletions
diff --git a/src/egl/drivers/xdri/Makefile b/src/egl/drivers/xdri/Makefile
index afd551dea5c..a721b997e69 100644
--- a/src/egl/drivers/xdri/Makefile
+++ b/src/egl/drivers/xdri/Makefile
@@ -48,7 +48,7 @@ $(TOP)/$(LIB_DIR)/$(DRIVER_NAME): $(OBJECTS)
 	$(TOP)/bin/mklib -o $(DRIVER_NAME) \
 		-noprefix \
 		-major 1 -minor 0 \
-		-L $(TOP)/$(LIB_DIR) \
+		-L$(TOP)/$(LIB_DIR) \
 		-install $(TOP)/$(LIB_DIR) \
 		$(OBJECTS) $(DRM_LIB) $(MISC_LIBS)
 
diff --git a/src/egl/drivers/xdri/egl_xdri.c b/src/egl/drivers/xdri/egl_xdri.c
index 83d4b86d98f..3b3e312746e 100644
--- a/src/egl/drivers/xdri/egl_xdri.c
+++ b/src/egl/drivers/xdri/egl_xdri.c
@@ -654,7 +654,10 @@ xdri_eglInitialize(_EGLDriver *drv, EGLDisplay dpy,
 
    xdri_drv->Base.Initialized = EGL_TRUE;
 
-   snprintf(name, sizeof(name), "X/DRI:%s", xdri_drv->dri_driver_name);
+   if (xdri_drv->dri_driver_name)
+      snprintf(name, sizeof(name), "X/DRI:%s", xdri_drv->dri_driver_name);
+   else
+      snprintf(name, sizeof(name), "X/DRI");
    xdri_drv->Base.Name = name;
 
    /* we're supporting EGL 1.4 */
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 78249054f2a..b439bc4059f 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -274,6 +274,14 @@ draw_enable_point_sprites(struct draw_context *draw, boolean enable)
 }
 
 
+/** Set draw->force_passthrough ("never clip or shade"), flushing with DRAW_FLUSH_STATE_CHANGE first. */
+void
+draw_set_force_passthrough( struct draw_context *draw, boolean enable )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->force_passthrough = enable;
+}
+
 /**
  * Ask the draw module for the location/slot of the given vertex attribute in
  * a post-transformed vertex.
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
index 0ab3681b647..3eeb4535311 100644
--- a/src/gallium/auxiliary/draw/draw_context.h
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -160,6 +160,9 @@ void draw_set_render( struct draw_context *draw,
 void draw_set_driver_clipping( struct draw_context *draw,
                                boolean bypass_clipping );
 
+void draw_set_force_passthrough( struct draw_context *draw, 
+                                 boolean enable );
+
 /*******************************************************************************
  * Draw pipeline 
  */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
index c0cf4269dbb..9825e116c32 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_vbuf.c
@@ -231,9 +231,9 @@ vbuf_set_prim( struct vbuf_stage *vbuf, uint prim )
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vbuf->vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vbuf->vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
-      switch (vbuf->vinfo->emit[i]) {
+      switch (vbuf->vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index 626a2e3e304..5d531146c5f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -163,12 +163,15 @@ struct draw_context
 
    struct {
       boolean bypass_clipping;
+      boolean bypass_vs;
    } driver;
 
    boolean flushing;         /**< debugging/sanity */
    boolean suspend_flushing; /**< internally set */
    boolean bypass_clipping;  /**< set if either api or driver bypass_clipping true */
 
+   boolean force_passthrough; /**< never clip or shade */
+
    /* pipe state that we need: */
    const struct pipe_rasterizer_state *rasterizer;
    struct pipe_viewport_state viewport;
@@ -193,7 +196,7 @@ struct draw_context
 
       const float (*aligned_constants)[4];
 
-      float (*aligned_constant_storage)[4];
+      const float (*aligned_constant_storage)[4];
       unsigned const_storage_size;
 
 
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index 669c11c993c..87ec6ae20c2 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -69,26 +69,26 @@ draw_pt_arrays(struct draw_context *draw,
          return TRUE;
    }
 
-
-   if (!draw->render) {
-      opt |= PT_PIPELINE;
-   }
-
-   if (draw_need_pipeline(draw,
-                          draw->rasterizer,
-                          prim)) {
-      opt |= PT_PIPELINE;
-   }
-
-   if (!draw->bypass_clipping && !draw->pt.test_fse) {
-      opt |= PT_CLIPTEST;
+   if (!draw->force_passthrough) {
+      if (!draw->render) {
+         opt |= PT_PIPELINE;
+      }
+      
+      if (draw_need_pipeline(draw,
+                             draw->rasterizer,
+                             prim)) {
+         opt |= PT_PIPELINE;
+      }
+
+      if (!draw->bypass_clipping && !draw->pt.test_fse) {
+         opt |= PT_CLIPTEST;
+      }
+      
+      if (!draw->rasterizer->bypass_vs) {
+         opt |= PT_SHADE;
+      }
    }
-
-   if (!draw->rasterizer->bypass_vs) {
-      opt |= PT_SHADE;
-   }
-
-
+      
    if (opt == 0) 
       middle = draw->pt.middle.fetch_emit;
    else if (opt == PT_SHADE && !draw->pt.no_fse)
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index d4eca80588b..d520b05869b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -84,11 +84,11 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
       unsigned emit_sz = 0;
       unsigned src_buffer = 0;
       unsigned output_format;
-      unsigned src_offset = (vinfo->src_index[i] * 4 * sizeof(float) );
+      unsigned src_offset = (vinfo->attrib[i].src_index * 4 * sizeof(float) );
 
 
          
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 5a4db6cfe56..3966ad48ba7 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -121,7 +121,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
    memset(&key, 0, sizeof(key));
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->src_index[i]];
+      const struct pipe_vertex_element *src = &draw->pt.vertex_element[vinfo->attrib[i].src_index];
 
       unsigned emit_sz = 0;
       unsigned input_format = src->src_format;
@@ -129,7 +129,7 @@ static void fetch_emit_prepare( struct draw_pt_middle_end *middle,
       unsigned input_offset = src->src_offset;
       unsigned output_format;
 
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_4F:
 	 output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 	 emit_sz = 4 * sizeof(float);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
index 73fc70c1bc9..f7e6a1a8eeb 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -79,6 +79,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
    unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs;
    const struct vertex_info *vinfo;
    unsigned i;
+   unsigned nr_vbs = 0;
    
 
    if (!draw->render->set_primitive( draw->render, 
@@ -102,7 +103,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
 
    fse->key.viewport = !draw->identity_viewport;
    fse->key.clip = !draw->bypass_clipping;
-   fse->key.pad = 0;
+   fse->key.const_vbuffers = 0;
 
    memset(fse->key.element, 0, 
           fse->key.nr_elements * sizeof(fse->key.element[0]));
@@ -116,16 +117,23 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
        */
       fse->key.element[i].in.buffer = src->vertex_buffer_index;
       fse->key.element[i].in.offset = src->src_offset;
+      nr_vbs = MAX2(nr_vbs, src->vertex_buffer_index + 1);
    }
    
+   for (i = 0; i < 5 && i < nr_vbs; i++) {
+      if (draw->pt.vertex_buffer[i].pitch == 0)
+         fse->key.const_vbuffers |= (1<<i);
+   }
 
+   if (0) debug_printf("%s: lookup const_vbuffers: %x\n", __FUNCTION__, fse->key.const_vbuffers);
+   
    {
       unsigned dst_offset = 0;
 
       for (i = 0; i < vinfo->num_attribs; i++) {
          unsigned emit_sz = 0;
 
-         switch (vinfo->emit[i]) {
+         switch (vinfo->attrib[i].emit) {
          case EMIT_4F:
             emit_sz = 4 * sizeof(float);
             break;
@@ -153,8 +161,8 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
           * numbers, not to positions in the hw vertex description --
           * that's handled by the output_offset field.
           */
-         fse->key.element[i].out.format = vinfo->emit[i];
-         fse->key.element[i].out.vs_output = vinfo->src_index[i];
+         fse->key.element[i].out.format = vinfo->attrib[i].emit;
+         fse->key.element[i].out.vs_output = vinfo->attrib[i].src_index;
          fse->key.element[i].out.offset = dst_offset;
       
          dst_offset += emit_sz;
@@ -162,13 +170,7 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       }
    }
 
-
-   /* Would normally look up a vertex shader and peruse its list of
-    * varients somehow.  We omitted that step and put all the
-    * hardcoded "shaders" into an array.  We're just making the
-    * assumption that this happens to be a matching shader...  ie
-    * you're running isosurf, aren't you?
-    */
+   
    fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, 
                                          &fse->key );
 
@@ -177,18 +179,17 @@ static void fse_prepare( struct draw_pt_middle_end *middle,
       return ;
    }
 
+   if (0) debug_printf("%s: found const_vbuffers: %x\n", __FUNCTION__, 
+                       fse->active->key.const_vbuffers);
+
    /* Now set buffer pointers:
     */
-   for (i = 0; i < num_vs_inputs; i++) {
-      unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index;
-
-      fse->active->set_input( fse->active, 
-                              i, 
-                              
-                              ((const ubyte *) draw->pt.user.vbuffer[buf] + 
-                               draw->pt.vertex_buffer[buf].buffer_offset),
-                              
-                              draw->pt.vertex_buffer[buf].pitch );
+   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+      fse->active->set_buffer( fse->active, 
+                               i, 
+                               ((const ubyte *) draw->pt.user.vbuffer[i] + 
+                                draw->pt.vertex_buffer[i].buffer_offset),
+                              draw->pt.vertex_buffer[i].pitch );
    }
 
    *max_vertices = (draw->render->max_vertex_buffer_bytes / 
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
index 1446f785c51..3214213e445 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.c
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -49,7 +49,7 @@ draw_compute_vertex_size(struct vertex_info *vinfo)
 
    vinfo->size = 0;
    for (i = 0; i < vinfo->num_attribs; i++) {
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_OMIT:
          break;
       case EMIT_4UB:
@@ -81,8 +81,8 @@ draw_dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
    unsigned i, j;
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      j = vinfo->src_index[i];
-      switch (vinfo->emit[i]) {
+      j = vinfo->attrib[i].src_index;
+      switch (vinfo->attrib[i].emit) {
       case EMIT_OMIT:
          debug_printf("EMIT_OMIT:");
          break;
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 16c65c43175..a943607d7ed 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -75,12 +75,41 @@ struct vertex_info
 {
    uint num_attribs;
    uint hwfmt[4];      /**< hardware format info for this format */
-   enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS];
-   enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS];   /**< EMIT_x */
-   uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */
    uint size;          /**< total vertex size in dwords */
+   
+   /* Keep this small and at the end of the struct to allow quick
+    * memcmp() comparisons.
+    */
+   struct {
+      ubyte interp_mode:4;      /**< INTERP_x */
+      ubyte emit:4;             /**< EMIT_x */
+      ubyte src_index;          /**< map to post-xform attribs */
+   } attrib[PIPE_MAX_SHADER_INPUTS];
 };
 
+static INLINE int
+draw_vinfo_size( const struct vertex_info *a )
+{
+   return ((const char *)&a->attrib[a->num_attribs] -
+           (const char *)a);
+}
+
+static INLINE int
+draw_vinfo_compare( const struct vertex_info *a,
+                    const struct vertex_info *b )
+{
+   unsigned sizea = draw_vinfo_size( a );
+   return memcmp( a, b, sizea );
+}
+
+static INLINE void
+draw_vinfo_copy( struct vertex_info *dst,
+                 const struct vertex_info *src )
+{
+   unsigned size = draw_vinfo_size( src );
+   memcpy( dst, src, size );
+}
+
 
 
 /**
@@ -91,14 +120,15 @@ struct vertex_info
  */
 static INLINE uint
 draw_emit_vertex_attr(struct vertex_info *vinfo,
-                      enum attrib_emit emit, enum interp_mode interp,
+                      enum attrib_emit emit, 
+                      enum interp_mode interp, /* only used by softpipe??? */
                       uint src_index)
 {
    const uint n = vinfo->num_attribs;
    assert(n < PIPE_MAX_SHADER_INPUTS);
-   vinfo->emit[n] = emit;
-   vinfo->interp_mode[n] = interp;
-   vinfo->src_index[n] = src_index;
+   vinfo->attrib[n].emit = emit;
+   vinfo->attrib[n].interp_mode = interp;
+   vinfo->attrib[n].src_index = src_index;
    vinfo->num_attribs++;
    return n;
 }
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index 45992d19867..68c24abad3b 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -64,7 +64,7 @@ struct draw_vs_varient_key {
    unsigned nr_outputs:8;
    unsigned viewport:1;
    unsigned clip:1;
-   unsigned pad:5;
+   unsigned const_vbuffers:5;
    struct draw_varient_element element[PIPE_MAX_ATTRIBS];
 };
 
@@ -76,7 +76,7 @@ struct draw_vs_varient {
 
    struct draw_vertex_shader *vs;
 
-   void (*set_input)( struct draw_vs_varient *,
+   void (*set_buffer)( struct draw_vs_varient *,
                       unsigned i,
                       const void *ptr,
                       unsigned stride );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
index a556477a767..87232865e23 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -92,9 +92,9 @@ struct x86_reg aos_get_x86( struct aos_compilation *cp,
          assert(which_reg == 1);
          offset = Offset(struct aos_machine, constants);
          break;
-      case X86_ATTRIBS:
+      case X86_BUFFERS:
          assert(which_reg == 0);
-         offset = Offset(struct aos_machine, attrib);
+         offset = Offset(struct aos_machine, buffer);
          break;
       default:
          assert(0);
@@ -196,6 +196,18 @@ static void spill( struct aos_compilation *cp, unsigned idx )
 }
 
 
+/* Spill each dirty entry of cp->xmm[0..7] and release all eight register slots. */
+void aos_spill_all( struct aos_compilation *cp )
+{
+   unsigned i;
+
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].dirty) 
+         spill(cp, i);
+      aos_release_xmm_reg(cp, i);   /* released whether or not it was dirty */
+   }
+}
+
 static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
                                         struct x86_reg reg )
 {
@@ -1939,6 +1951,11 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
    save_fpu_state( &cp );
    set_fpu_round_nearest( &cp );
 
+   aos_init_inputs( &cp, linear );
+
+   cp.x86_reg[0] = 0;
+   cp.x86_reg[1] = 0;
+   
    /* Note address for loop jump 
     */
    label = x86_get_label(cp.func);
@@ -2018,13 +2035,7 @@ static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
 
       /* Incr index
        */   
-      if (linear) {
-         x86_inc(cp.func, cp.idx_EBX);
-      } 
-      else {
-         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
-      }
-
+      aos_incr_inputs( &cp, linear );
    }
    /* decr count, loop if not zero
     */
@@ -2065,15 +2076,13 @@ static void vaos_set_buffer( struct draw_vs_varient *varient,
                              unsigned stride )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
-   unsigned i;
 
-   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
-      if (vaos->base.key.element[i].in.buffer == buf) {
-         vaos->attrib[i].input_ptr = ((char *)ptr +
-                                      vaos->base.key.element[i].in.offset);
-         vaos->attrib[i].input_stride = stride;
-      }
+   if (buf < vaos->nr_vb) {
+      vaos->buffer[buf].base_ptr = (char *)ptr;
+      vaos->buffer[buf].stride = stride;
    }
+
+   if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
 }
 
 
@@ -2086,10 +2095,12 @@ static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d\n", __FUNCTION__, count);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_elts( machine,
                        elts,
@@ -2105,10 +2116,13 @@ static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
    struct aos_machine *machine = vaos->draw->vs.aos_machine;
 
+   if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, 
+                       vaos->base.key.const_vbuffers);
+
    machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
    machine->constants = vaos->draw->vs.aligned_constants;
    machine->immediates = vaos->base.vs->immediates;
-   machine->attrib = vaos->attrib;
+   machine->buffer = vaos->buffer;
 
    vaos->gen_run_linear( machine,
                          start,
@@ -2127,7 +2141,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 {
    struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
 
-   FREE( vaos->attrib );
+   FREE( vaos->buffer );
 
    x86_release_func( &vaos->func[0] );
    x86_release_func( &vaos->func[1] );
@@ -2140,6 +2154,7 @@ static void vaos_destroy( struct draw_vs_varient *varient )
 static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
                                                  const struct draw_vs_varient_key *key )
 {
+   unsigned i;
    struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
 
    if (!vaos)
@@ -2147,17 +2162,22 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    
    vaos->base.key = *key;
    vaos->base.vs = vs;
-   vaos->base.set_input = vaos_set_buffer;
+   vaos->base.set_buffer = vaos_set_buffer;
    vaos->base.destroy = vaos_destroy;
    vaos->base.run_linear = vaos_run_linear;
    vaos->base.run_elts = vaos_run_elts;
 
    vaos->draw = vs->draw;
 
-   vaos->attrib = MALLOC( key->nr_inputs * sizeof(vaos->attrib[0]) );
-   if (!vaos->attrib)
+   for (i = 0; i < key->nr_inputs; i++) 
+      vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
+
+   vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
+   if (!vaos->buffer)
       goto fail;
 
+   if (0) debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
+
 #if 0
    tgsi_dump(vs->state.tokens, 0);
 #endif
@@ -2179,8 +2199,8 @@ static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
    return &vaos->base;
 
  fail:
-   if (vaos && vaos->attrib)
-      FREE(vaos->attrib);
+   if (vaos && vaos->buffer)
+      FREE(vaos->buffer);
 
    if (vaos)
       x86_release_func( &vaos->func[0] );
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
index 7fe6f79db0d..264387517b0 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -87,9 +87,10 @@ struct lit_info {
 #define MAX_SHINE_TAB    4
 #define MAX_LIT_INFO     16
 
-struct aos_attrib {
-   const void *input_ptr;
-   unsigned input_stride;
+struct aos_buffer {
+   const void *base_ptr;
+   unsigned stride;
+   void *ptr;                   /* updated per vertex */
 };
 
 
@@ -123,7 +124,7 @@ struct aos_machine {
    const float (*immediates)[4];     /* points to shader data */
    const float (*constants)[4];      /* points to draw data */
 
-   const struct aos_attrib *attrib; /* points to ? */
+   const struct aos_buffer *buffer; /* points to ? */
 };
 
 
@@ -175,12 +176,15 @@ void aos_adopt_xmm_reg( struct aos_compilation *cp,
                         unsigned idx,
                         unsigned dirty );
 
+void aos_spill_all( struct aos_compilation *cp );
+
 struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
                                    unsigned file,
                                    unsigned idx );
 
-boolean aos_fetch_inputs( struct aos_compilation *cp,
-                          boolean linear );
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
 
 boolean aos_emit_outputs( struct aos_compilation *cp );
 
@@ -210,7 +214,7 @@ do {                                                                    \
 #define X86_NULL       0
 #define X86_IMMEDIATES 1
 #define X86_CONSTANTS  2
-#define X86_ATTRIBS    3
+#define X86_BUFFERS    3
 
 struct x86_reg aos_get_x86( struct aos_compilation *cp,
                             unsigned which_reg,
@@ -232,7 +236,8 @@ struct draw_vs_varient_aos_sse {
    struct draw_vs_varient base;
    struct draw_context *draw;
 
-   struct aos_attrib *attrib;
+   struct aos_buffer *buffer;
+   unsigned nr_vb;
 
    vaos_run_linear_func gen_run_linear;
    vaos_run_elts_func gen_run_elts;
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
index 26297c74f82..dd79bc799aa 100644
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -54,6 +54,7 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
 				 struct x86_reg data,
 				 struct x86_reg src_ptr )
 {
+#if 1
    sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
    /* data = z ? ? ? */
    sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
@@ -62,6 +63,16 @@ static void emit_load_R32G32B32( struct aos_compilation *cp,
    /* data = ? 0 z 1 */
    sse_movlps(cp->func, data, src_ptr);
    /* data = x y z 1 */
+#else
+   sse_movups(cp->func, data, src_ptr);
+   /* data = x y z ? */
+   sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
+   /* data = ? x y z */
+   sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
+   /* data = 1 x y z */
+   sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
+   /* data = x y z 1 */
+#endif
 }
 
 static void emit_load_R32G32( struct aos_compilation *cp, 
@@ -95,28 +106,6 @@ static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
 
 
 
-static void get_src_ptr( struct aos_compilation *cp,
-                         struct x86_reg src,
-                         struct x86_reg elt,
-                         unsigned a )
-{
-   struct x86_reg attrib = x86_make_disp(aos_get_x86( cp, 0, X86_ATTRIBS ), 
-                                         a * sizeof(struct aos_attrib));
-
-   struct x86_reg input_ptr = x86_make_disp(attrib, 
-                                            Offset(struct aos_attrib, input_ptr));
-
-   struct x86_reg input_stride = x86_make_disp(attrib, 
-                                               Offset(struct aos_attrib, input_stride));
-
-   /* Calculate pointer to current attrib:
-    */
-   x86_mov(cp->func, src, input_stride);
-   x86_imul(cp->func, src, elt);
-   x86_add(cp->func, src, input_ptr);
-}
-
-
 /* Extended swizzles?  Maybe later.
  */  
 static void emit_swizzle( struct aos_compilation *cp,
@@ -128,22 +117,60 @@ static void emit_swizzle( struct aos_compilation *cp,
 }
 
 
+
+/* Emit code that puts the address of buffer buf_idx's data for the current
+ * vertex into ptr.  Increments cp->insn_counter; always returns TRUE. */
+static boolean get_buffer_ptr( struct aos_compilation *cp,
+                               boolean linear,
+                               unsigned buf_idx,
+                               struct x86_reg elt,
+                               struct x86_reg ptr)
+{
+   struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                      buf_idx * sizeof(struct aos_buffer));
+
+   struct x86_reg buf_stride = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, stride));
+   if (linear) {
+      struct x86_reg buf_ptr = x86_make_disp(buf, 
+                                             Offset(struct aos_buffer, ptr));
+
+      /* ptr = buf->ptr; then buf->ptr = ptr + stride.  Overwrites elt.
+       */
+      x86_mov(cp->func, ptr, buf_ptr);
+      x86_mov(cp->func, elt, buf_stride);
+      x86_add(cp->func, elt, ptr);
+      if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
+      x86_mov(cp->func, buf_ptr, elt);
+   }
+   else {
+      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                                  Offset(struct aos_buffer, base_ptr));
+
+      /* ptr = stride * elt + base_ptr
+       */
+      x86_mov(cp->func, ptr, buf_stride);
+      x86_imul(cp->func, ptr, elt);
+      x86_add(cp->func, ptr, buf_base_ptr);
+   }
+
+   cp->insn_counter++;
+
+   return TRUE;
+}
+
+
 static boolean load_input( struct aos_compilation *cp,
                            unsigned idx,
-                           boolean linear )
+                           struct x86_reg bufptr )
 {
    unsigned format = cp->vaos->base.key.element[idx].in.format;
-   struct x86_reg src = cp->tmp_EAX;
+   unsigned offset = cp->vaos->base.key.element[idx].in.offset;
    struct x86_reg dataXMM = aos_get_xmm_reg(cp);
 
    /* Figure out source pointer address:
     */
-   get_src_ptr(cp, 
-               src, 
-               linear ? cp->idx_EBX : x86_deref(cp->idx_EBX),
-               idx);
-
-   src = x86_deref(src);
+   struct x86_reg src = x86_make_disp(bufptr, offset);
 
    aos_adopt_xmm_reg( cp,
                       dataXMM,
@@ -179,20 +206,128 @@ static boolean load_input( struct aos_compilation *cp,
    return TRUE;
 }
 
-
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+static boolean load_inputs( struct aos_compilation *cp,
+                            unsigned buffer,
+                            struct x86_reg ptr )
 {
    unsigned i;
-   
+
    for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
-      if (!load_input( cp, i, linear ))
-         return FALSE;
-      cp->insn_counter++;
+      if (cp->vaos->base.key.element[i].in.buffer == buffer) {
+
+         if (!load_input( cp, i, ptr ))
+            return FALSE;
+
+         cp->insn_counter++;
+      }
+   }
+   
+   return TRUE;
+}
+
+/* Emit the code that runs once, ahead of the vertex loop.  Buffers flagged in
+ * key.const_vbuffers get their inputs loaded and spilled here; for linear runs
+ * every other buffer gets its starting pointer computed.  Always returns TRUE. */
+boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
+{
+   unsigned i;
+   for (i = 0; i < cp->vaos->nr_vb; i++) {
+      struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                         i * sizeof(struct aos_buffer));
+
+      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
+                                                  Offset(struct aos_buffer, base_ptr));
+
+      if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
+         struct x86_reg ptr = cp->tmp_EAX;
+
+         x86_mov(cp->func, ptr, buf_base_ptr);
+
+         /* Load all inputs for this constant vertex buffer.
+          * NOTE(review): the boolean result of load_inputs() is not checked.
+          */
+         load_inputs( cp, i, x86_deref(ptr) );
+         
+         /* Then just force them out to aos_machine.input[]
+          */
+         aos_spill_all( cp );
+      }
+      else if (linear) {
+         struct x86_reg elt = cp->idx_EBX;
+         struct x86_reg ptr = cp->tmp_EAX;
+
+         struct x86_reg buf_stride = x86_make_disp(buf, 
+                                                   Offset(struct aos_buffer, stride));
+
+         struct x86_reg buf_ptr = x86_make_disp(buf, 
+                                                Offset(struct aos_buffer, ptr));
+
+         /* Calculate pointer to current attrib: ptr = stride * elt + base_ptr
+          */
+         x86_mov(cp->func, ptr, buf_stride);
+         x86_imul(cp->func, ptr, elt);
+         x86_add(cp->func, ptr, buf_base_ptr);
+
+         /* In the linear case, keep the buffer pointer instead of the
+          * index number.
+          */
+         if (cp->vaos->nr_vb == 1) 
+            x86_mov( cp->func, elt, ptr );
+         else
+            x86_mov( cp->func, buf_ptr, ptr );
+
+         cp->insn_counter++;
+      }
+   }
+
+   return TRUE;
+}
+
+/* Emit the input loads for each non-constant vertex buffer; returns FALSE if emitting any of them fails. */
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+{
+   unsigned j;
+
+   for (j = 0; j < cp->vaos->nr_vb; j++) {
+      if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
+         /* just retrieve pre-transformed input */
+      }
+      else if (linear && cp->vaos->nr_vb == 1) {
+         if (!load_inputs( cp, 0, cp->idx_EBX ))
+            return FALSE;
+      }
+      else {
+         struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
+         struct x86_reg ptr = cp->tmp_EAX;
+         if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
+            return FALSE;
+         if (!load_inputs( cp, j, ptr ))
+            return FALSE;
+      }
    }
 
    return TRUE;
 }
 
+boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
+{
+   if (linear && cp->vaos->nr_vb == 1) {
+      struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
+                                            (0 * sizeof(struct aos_buffer) + 
+                                             Offset(struct aos_buffer, stride)));
+
+      x86_add(cp->func, cp->idx_EBX, stride);
+      sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
+   }
+   else if (linear) {
+      /* Nothing to do */
+   } 
+   else {
+      x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
+   }
+
+   return TRUE;
+}
 
 
 
@@ -203,7 +338,7 @@ static void emit_store_R32G32B32A32( struct aos_compilation *cp,
 				     struct x86_reg dst_ptr,
 				     struct x86_reg dataXMM )
 {
-   sse_movups(cp->func, dst_ptr, dataXMM);
+   sse_movaps(cp->func, dst_ptr, dataXMM);
 }
 
 static void emit_store_R32G32B32( struct aos_compilation *cp, 
@@ -306,7 +441,7 @@ boolean aos_emit_outputs( struct aos_compilation *cp )
 
       if (data.file != file_XMM) {
          struct x86_reg tmp = aos_get_xmm_reg( cp );
-         sse_movups(cp->func, tmp, data);
+         sse_movaps(cp->func, tmp, data);
          data = tmp;
       }
       
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 2ce30b9a02b..727977bc3af 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -32,6 +32,7 @@
   *   Brian Paul
   */
 
+#include "util/u_memory.h"
 #include "pipe/p_shader_tokens.h"
 #include "draw_private.h"
 #include "draw_context.h"
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index 0efabd9de8b..b11ae316627 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -37,7 +37,7 @@
 
 #include "draw_vs.h"
 
-#if defined(PIPE_ARCH_X86)
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
 
 #include "pipe/p_shader_tokens.h"
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
index 4daf05dae7c..7ee567d4789 100644
--- a/src/gallium/auxiliary/draw/draw_vs_varient.c
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -64,10 +64,10 @@ struct draw_vs_varient_generic {
 
 
 
-static void vsvg_set_input( struct draw_vs_varient *varient,
-                            unsigned buffer,
-                            const void *ptr,
-                            unsigned stride )
+static void vsvg_set_buffer( struct draw_vs_varient *varient,
+                             unsigned buffer,
+                             const void *ptr,
+                             unsigned stride )
 {
    struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
 
@@ -265,7 +265,7 @@ struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
 
    vsvg->base.key = *key;
    vsvg->base.vs = vs;
-   vsvg->base.set_input     = vsvg_set_input;
+   vsvg->base.set_buffer    = vsvg_set_buffer;
    vsvg->base.run_elts      = vsvg_run_elts;
    vsvg->base.run_linear    = vsvg_run_linear;
    vsvg->base.destroy       = vsvg_destroy;
diff --git a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
index 0fc5c4ec5ca..fcc5c05794a 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_builtins.cpp
@@ -1,140 +1,140 @@
 static const unsigned char llvm_builtins_data[] = {
-0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x29,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
+0x42,0x43,0xc0,0xde,0x21,0x0c,0x00,0x00,0x27,0x02,0x00,0x00,0x01,0x10,0x00,0x00,
 0x10,0x00,0x00,0x00,0x07,0x81,0x23,0x91,0x41,0xc8,0x04,0x49,0x06,0x10,0x32,0x39,
 0x92,0x01,0x84,0x0c,0x25,0x05,0x08,0x19,0x1e,0x04,0x8b,0x62,0x80,0x14,0x45,0x02,
 0x42,0x92,0x0b,0x42,0xa4,0x10,0x32,0x14,0x38,0x08,0x18,0x49,0x0a,0x32,0x44,0x24,
 0x48,0x0a,0x90,0x21,0x23,0x44,0x72,0x80,0x8c,0x14,0x21,0x86,0x0a,0x8a,0x0a,0x64,
-0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x02,0x00,0x00,0x00,0x0b,0x04,0x00,0x0c,
-0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x32,0x22,0x48,0x09,
-0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,0xc6,0x05,0x42,0x52,
-0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,0x29,0x80,0x21,0x00,
-0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,0x80,0x50,0x2b,0x03,
-0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,0x14,0x01,0x80,0x11,
-0x80,0x22,0x88,0x00,0x13,0xa2,0x74,0xb0,0x03,0x3c,0xb0,0x83,0x36,0x80,0x87,0x71,
-0x68,0x03,0x76,0x48,0x07,0x77,0xa8,0x07,0x7c,0x68,0x83,0x73,0x70,0x87,0x7a,0xd8,
-0x70,0x0f,0xe5,0xd0,0x06,0xf0,0xa0,0x07,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x71,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,
-0xe9,0x80,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,0x71,0x60,0x07,0x7a,
-0x10,0x07,0x76,0xa0,0x07,0x71,0x60,0x07,0x6d,0x90,0x0e,0x73,0x20,0x07,0x7a,0x30,
-0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x6d,0x90,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,
-0x72,0xa0,0x07,0x76,0x40,0x07,0x6d,0x60,0x0e,0x73,0x20,0x07,0x7a,0x30,0x07,0x72,
-0xa0,0x07,0x73,0x20,0x07,0x6d,0x60,0x0e,0x76,0x40,0x07,0x7a,0x30,0x07,0x72,0xa0,
-0x07,0x76,0x40,0x07,0x6d,0x60,0x0f,0x76,0x40,0x07,0x7a,0x60,0x07,0x74,0xa0,0x07,
-0x76,0x40,0x07,0x6d,0x60,0x0f,0x71,0x20,0x07,0x78,0xa0,0x07,0x71,0x20,0x07,0x78,
-0xa0,0x07,0x71,0x20,0x07,0x78,0xd0,0x06,0xe1,0x00,0x07,0x7a,0x00,0x07,0x7a,0x60,
-0x07,0x74,0xd0,0x06,0xe6,0x80,0x07,0x70,0xa0,0x07,0x71,0x20,0x07,0x78,0xa0,0x07,
-0x71,0x20,0x07,0x78,0xa0,0xf3,0x40,0x88,0x04,0x32,0x32,0x02,0x04,0x20,0x76,0x46,
-0xfc,0x6c,0x48,0x92,0x00,0x40,0x00,0x00,0x00,0x00,0x0c,0x49,0x12,0x20,0x00,0x00,
-0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x01,0x00,0x00,0x00,0x30,0x24,0x59,0x00,0x20,
-0x08,0x00,0x00,0x00,0x86,0x24,0x0a,0x00,0x04,0x00,0x00,0x00,0xc0,0x90,0x84,0x01,
-0x02,0x00,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
-0x05,0x00,0x02,0x00,0x00,0x00,0x60,0x48,0x72,0x00,0x01,0x00,0x00,0x00,0x00,0x0c,
-0x49,0x14,0x00,0x08,0x00,0x00,0x00,0x80,0x21,0x49,0x01,0x00,0x41,0x00,0x00,0x00,
-0x90,0x05,0x02,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
+0x0c,0x1f,0x00,0x00,0x49,0x18,0x00,0x00,0x03,0x00,0x00,0x00,0x0b,0x84,0xff,0xff,
+0xff,0xff,0x1f,0xc0,0x00,0x00,0x00,0x00,0x51,0x20,0x00,0x00,0x12,0x00,0x00,0x00,
+0x32,0x22,0x48,0x09,0x20,0x65,0x82,0x84,0x00,0x26,0x45,0x48,0x05,0x09,0x26,0x45,
+0xc6,0x05,0x42,0x52,0x26,0x08,0xae,0x19,0x80,0x61,0x04,0x02,0x98,0x23,0x00,0x83,
+0x29,0x80,0x21,0x00,0xb2,0x73,0x04,0x01,0x51,0x8a,0xf4,0x08,0x92,0xa4,0x39,0x47,
+0x80,0x50,0x2b,0x03,0x00,0xa0,0x08,0x21,0x5c,0x46,0x2b,0x44,0x08,0x21,0xd4,0x40,
+0x14,0x01,0x80,0x11,0x80,0x22,0x88,0x00,0x13,0x30,0x7c,0xc0,0x03,0x3b,0xf8,0x05,
+0x3b,0xa0,0x83,0x36,0xa8,0x07,0x77,0x58,0x07,0x77,0x78,0x87,0x7b,0x70,0x87,0x36,
+0x60,0x87,0x74,0x70,0x87,0x7a,0xc0,0x87,0x36,0x38,0x07,0x77,0xa8,0x87,0x0d,0xf7,
+0x50,0x0e,0x6d,0x00,0x0f,0x7a,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xe9,0x10,0x07,0x7a,0x80,0x07,0x7a,0x80,0x07,0x6d,0x90,0x0e,
+0x78,0xa0,0x07,0x78,0xa0,0x07,0x78,0xd0,0x06,0xe9,0x10,0x07,0x76,0xa0,0x07,0x71,
+0x60,0x07,0x7a,0x10,0x07,0x76,0xd0,0x06,0xe9,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,
+0x07,0x7a,0x30,0x07,0x72,0xd0,0x06,0xe9,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,
+0x7a,0x60,0x07,0x74,0xd0,0x06,0xe6,0x30,0x07,0x72,0xa0,0x07,0x73,0x20,0x07,0x7a,
+0x30,0x07,0x72,0xd0,0x06,0xe6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,
+0x07,0x74,0xd0,0x06,0xf6,0x60,0x07,0x74,0xa0,0x07,0x76,0x40,0x07,0x7a,0x60,0x07,
+0x74,0xd0,0x06,0xf6,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,
+0x10,0x07,0x72,0x80,0x07,0x6d,0x10,0x0e,0x70,0xa0,0x07,0x70,0xa0,0x07,0x76,0x40,
+0x07,0x6d,0x60,0x0e,0x78,0x00,0x07,0x7a,0x10,0x07,0x72,0x80,0x07,0x7a,0x10,0x07,
+0x72,0x80,0x07,0x3a,0x0f,0x84,0x48,0x20,0x23,0x24,0x40,0x00,0x62,0x67,0x88,0x9f,
+0x19,0x92,0x24,0x00,0x10,0x04,0x00,0x00,0x00,0x43,0x92,0x04,0x08,0x00,0x00,0x00,
+0x00,0x60,0x48,0xa2,0x00,0x40,0x10,0x00,0x00,0x00,0x0c,0x49,0x16,0x00,0x08,0x02,
+0x00,0x00,0x80,0x21,0x89,0x02,0x00,0x41,0x00,0x00,0x00,0x30,0x24,0x61,0x80,0x00,
+0x00,0x00,0x00,0x00,0x86,0x24,0x07,0x10,0x00,0x00,0x00,0x00,0xc0,0x90,0x44,0x01,
+0x80,0x20,0x00,0x00,0x00,0x18,0x92,0x1c,0x40,0x00,0x00,0x00,0x00,0x00,0x43,0x12,
+0x05,0x00,0x82,0x00,0x00,0x00,0x60,0x48,0x52,0x00,0x40,0x10,0x00,0x00,0x00,0x64,
+0x81,0x00,0x00,0x00,0x10,0x00,0x00,0x00,0x32,0x1e,0x98,0x10,0x19,0x11,0x4c,0x90,
 0x8c,0x09,0x26,0x47,0xc6,0x04,0x43,0x8a,0x8a,0x59,0x8b,0x43,0x50,0xd2,0x09,0x02,
 0x81,0xd2,0x73,0x50,0xc9,0x0c,0x2a,0x99,0x41,0x25,0x33,0xa8,0x64,0x56,0x28,0x66,
 0x2d,0x0e,0x41,0xcf,0x2a,0x15,0x04,0x4a,0xcf,0x41,0x25,0x33,0xa8,0x64,0x06,0x95,
 0xcc,0xa0,0x92,0x59,0x01,0x00,0x00,0x00,0x53,0x82,0x26,0x0c,0x04,0x00,0x00,0x00,
 0x22,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x05,0x00,0x00,0x00,
 0x04,0xc6,0x08,0x40,0x10,0x04,0xe1,0x70,0x18,0x23,0x00,0x41,0x10,0x84,0xc3,0x60,
-0x04,0x00,0x00,0x00,0x93,0x0c,0xce,0x43,0x4c,0x31,0x3c,0x8e,0x34,0xc9,0x30,0x41,
-0xc2,0x14,0x03,0x34,0x51,0x93,0x0c,0x4d,0x44,0x4c,0x31,0x44,0x8d,0x35,0x56,0x01,
-0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0x46,0x41,0x08,0xcc,
-0x73,0x9b,0x05,0x21,0x30,0xcf,0x6e,0x18,0x84,0x00,0x2c,0x8b,0x35,0x04,0x80,0x39,
-0x04,0x81,0x5d,0x20,0x80,0x0f,0x0c,0x43,0xe4,0xd3,0x36,0x81,0x04,0x3e,0x30,0x0c,
-0x91,0x4f,0x5b,0x05,0x12,0xf8,0xc0,0x30,0x44,0x7e,0x7d,0x00,0x05,0xd1,0x4c,0x11,
-0x66,0x12,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x2a,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0xc3,0x0d,0xce,0x43,0x4c,0x37,0x3c,0x8e,0x34,0xdc,0x30,0x41,
+0xc2,0x74,0x03,0x34,0x51,0xc3,0x0d,0x4d,0x44,0x4c,0x37,0x44,0x8d,0x35,0x56,0x01,
+0x04,0xc3,0x55,0x21,0x16,0x0e,0x04,0x00,0x0f,0x00,0x00,0x00,0xd6,0x10,0x00,0xe6,
+0x10,0x04,0x76,0x81,0x00,0x3e,0x30,0x0c,0x91,0x4f,0x1b,0x05,0x21,0x30,0x8f,0x6d,
+0x13,0x48,0xe0,0x03,0xc3,0x10,0xf9,0xb4,0x55,0x20,0x81,0x0f,0x0c,0x43,0xe4,0xd7,
+0x66,0x41,0x08,0xcc,0xa3,0x1f,0x40,0x41,0x34,0x53,0x84,0x99,0xc4,0x20,0x30,0x8f,
+0x61,0x10,0x02,0xb0,0x2c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x27,0x00,0x00,0x00,0x13,0x04,0x43,0x2c,0x10,0x00,0x00,0x00,0x08,0x00,0x00,0x00,
 0x24,0x8a,0xa0,0x0c,0x46,0x00,0x4a,0x80,0xc2,0x1c,0x84,0x55,0x55,0xd6,0x1c,0x84,
 0x45,0x51,0x16,0x81,0x19,0x80,0x11,0x80,0x31,0x02,0x10,0x04,0x41,0xfc,0x03,0x00,
-0x63,0x08,0x0d,0x34,0xc9,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
+0x63,0x08,0x0d,0x34,0xdc,0x70,0x55,0xc2,0x2c,0x43,0x20,0x60,0x73,0x0c,0xd3,0x15,
 0x8d,0x21,0x34,0xd1,0x18,0x42,0xf3,0x8c,0x55,0x00,0x81,0xa0,0x6d,0x73,0x0c,0x19,
-0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x13,0x00,0x00,0x00,0x17,0x60,0x20,0xc5,
-0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,0x14,0x13,0xf3,0xd4,0xb8,0x69,
-0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,0xba,0x35,0x0c,0x13,0xf3,0x9c,
-0x80,0xe4,0x36,0x48,0x81,0x10,0xc3,0x4a,0x4c,0x54,0xd4,0x6c,0x8b,0x23,0x28,0x76,
-0x41,0x4c,0xcc,0xa3,0x1b,0x07,0x21,0x00,0xcb,0x72,0x00,0x05,0xd1,0x4c,0x11,0x66,
-0x18,0x83,0xc0,0x3c,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
-0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,0x04,0x00,0x00,0x00,
-0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,0x11,0x00,0x00,0x00,
-0x63,0x08,0x4d,0x64,0x16,0xc1,0x49,0x86,0xab,0x22,0x66,0x19,0x02,0x01,0x1b,0x43,
-0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,0x0a,0x20,0x0b,0x34,
-0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x24,0x83,0x57,0x11,0xb3,
-0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,0x48,0xb3,0x04,0xc6,
-0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,0x63,0x08,0xcd,0x64,
-0x64,0x40,0x70,0x92,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,0x60,0x0c,0xc1,0x99,
-0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,0x33,0x38,0xd0,0x00,
-0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,0xe0,0x24,0x03,0x1b,
-0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,0x30,0x63,0x08,0x0f,
-0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,0xa0,0x06,0x70,0x00,
-0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,0x76,0x52,0x4c,0xcc,
-0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,0xc6,0x50,0x8a,0x89,
-0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,0x79,0x68,0x73,0x20,
-0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,0x9e,0xdb,0x32,0x88,
-0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,0xa7,0xb7,0x95,0x62,
-0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,0x34,0x35,0x56,0x62,
-0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,0x2c,0x82,0xd3,0x0c,
-0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,0x24,0x01,0x63,0xec,
-0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,0xfc,0xc4,0xd0,0x90,
-0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,0xc1,0x71,0x7b,0x29,
-0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,0x32,0xf6,0xe6,0x46,
-0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,0x4e,0x33,0x58,0x47,
-0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,0x33,0xe1,0xbc,0xa5,
-0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,0x3a,0x40,0xc6,0xde,
-0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,0xed,0x82,0x10,0x9c,
-0xa6,0xba,0x81,0x44,0x70,0x9a,0xc1,0x17,0x9c,0x66,0x32,0x93,0x42,0x60,0x1e,0x7b,
-0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,0xa7,0x6d,0xa4,0x98,
-0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,0x9c,0x66,0xc0,0x7b,
-0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,0x8b,0xe0,0x34,0x83,
-0x2f,0x38,0xcd,0x64,0xd3,0xe6,0x61,0x08,0x4e,0x53,0xd5,0xf6,0x01,0x14,0x44,0x33,
-0x45,0x18,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x4a,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
-0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,0x1e,0xe1,0x19,0xc6,
-0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,0x15,0xc1,0x31,0x84,
-0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,0x71,0x6c,0x23,0x38,
-0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,0x35,0xc7,0x20,0x79,
-0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,0x04,0x10,0x20,0xd5,
-0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,0x30,0x06,0x64,0xe0,
-0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,0x63,0x21,0x40,0x70,
-0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,0x25,0x00,0x00,0x00,
-0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,0x05,0x31,0x31,0xcf,
-0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,0x41,0x08,0xc0,0xb2,
-0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,0x9b,0x8a,0x21,0x00,
-0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,0x82,0xd3,0x54,0xb7,
-0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,0x9d,0x9b,0x87,0x21,
-0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,0x10,0x82,0xd3,0x54,
-0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,0xb4,0x8d,0x14,0x13,
-0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,0x00,0x2c,0x8b,0xcd,
-0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,0x00,0x05,0xd1,0x4c,
-0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x19,0x00,0x00,0x00,
-0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0x4a,0x60,0x04,
-0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0x48,
-0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,0x44,0x39,0x58,0x85,
-0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x16,0x41,0x4c,0xcc,0x63,0xdb,0x04,0x31,
-0x31,0x4f,0x6e,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x56,0x41,0x4c,
-0xcc,0xd3,0x1b,0x45,0x21,0x00,0xcb,0xb2,0x9b,0x04,0x21,0x00,0xcb,0x02,0x00,0x00,
-0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,
-0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,
-0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,0x34,0xc7,0x20,0x51,
-0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,0x94,0x83,0x58,0x38,
-0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x76,0x51,
-0x4c,0xcc,0x53,0xdb,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,0x31,0x8f,0x6e,0x0d,
-0x43,0x05,0x2c,0x66,0x41,0x4c,0xcc,0xd3,0x1f,0x40,0x41,0x34,0x53,0x84,0x19,0x05,
-0x21,0x00,0xcb,0x02,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x2f,0x00,0x00,0x00,
-0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0xa0,0x04,
-0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,0xc9,0x30,0x49,0xc4,
-0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xc9,0x50,0x49,0xc4,0x2c,0x03,0x21,0x58,
-0x63,0x08,0x4d,0x34,0xc9,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,0x63,0x08,0x8d,0x33,
-0xc9,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,0x1a,0x00,0x00,0x00,
-0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,0x04,0x31,0x31,0x4f,
-0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x47,0x20,0xb9,0x0d,0x52,
-0x20,0xc4,0xb0,0x12,0x13,0x15,0x35,0xdb,0xe2,0x08,0x8a,0x5d,0x10,0x13,0xf3,0xec,
-0x37,0x90,0x2c,0x4e,0xf4,0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,
-0x74,0x02,0xc8,0xe2,0x44,0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,
-0x18,0x11,0x31,0x55,0xc0,0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,
-0x46,0x31,0x08,0xcc,0x03,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,
-0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,0x23,0x59,0xc2,0x20,0x09,0x92,0x1d,0x18,
-0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,
-0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,
-0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,0x84,0x34,0x85,0x31,0x10,0x0a,0xb2,0x3c,
-0x56,0x30,0x08,0xcc,0x63,0x0b,0x44,0x25,0x21,0x0d,0x00,0x00,0x00,0x00,0x00,0x00};
+0xe7,0x60,0x87,0x52,0x38,0x10,0x00,0x00,0x10,0x00,0x00,0x00,0x27,0x50,0x20,0x05,
+0xd1,0x0c,0x17,0x60,0x20,0xc5,0x74,0x10,0x8d,0x65,0x14,0x13,0xf3,0xd4,0xb4,0x6d,
+0x14,0x13,0xf3,0xd4,0xb8,0x69,0x14,0x13,0xf3,0xd4,0xb6,0x75,0x14,0x13,0xf3,0xd4,
+0xba,0x35,0x0c,0x13,0xf3,0xd8,0x05,0x31,0x31,0x8f,0x6e,0x1c,0x84,0x00,0x2c,0xcb,
+0x01,0x14,0x44,0x33,0x45,0x98,0x61,0x0c,0x02,0xf3,0x00,0x00,0x00,0x00,0x00,0x00,
+0x61,0x20,0x00,0x00,0x81,0x00,0x00,0x00,0x13,0x04,0x4d,0x2c,0x10,0x00,0x00,0x00,
+0x04,0x00,0x00,0x00,0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x91,
+0x11,0x00,0x00,0x00,0x63,0x08,0x4d,0x64,0x16,0xc1,0xe1,0x86,0xab,0x22,0x66,0x19,
+0x02,0x01,0x1b,0x43,0x70,0xa2,0x59,0x82,0x61,0x0c,0xe1,0x89,0x66,0x09,0x86,0x81,
+0x0a,0x20,0x0b,0x34,0x61,0x8e,0x81,0xda,0xa2,0x31,0x84,0x46,0xb2,0x8e,0xe0,0x70,
+0x83,0x57,0x11,0xb3,0x0c,0x44,0xf1,0x8d,0x21,0x38,0xd2,0x2c,0x81,0x31,0x86,0xf0,
+0x48,0xb3,0x04,0xc6,0x40,0x05,0x00,0x06,0x44,0x18,0x14,0x73,0x0c,0x9c,0x18,0x48,
+0x63,0x08,0xcd,0x64,0x64,0x40,0x70,0xb8,0xa1,0x0c,0x2a,0x62,0x96,0xe1,0x40,0xcc,
+0x60,0x0c,0xc1,0x99,0x66,0x09,0x92,0x31,0x84,0x67,0x9a,0x25,0x48,0x06,0x2a,0x80,
+0x33,0x38,0xd0,0x00,0x99,0x63,0x18,0x83,0x34,0x98,0xc6,0x10,0x1a,0xc8,0xd6,0x80,
+0xe0,0x70,0x03,0x1b,0x54,0xc4,0x2c,0x83,0xb2,0xb4,0xc1,0x18,0x82,0x03,0xcd,0x12,
+0x30,0x63,0x08,0x0f,0x34,0x4b,0xc0,0x0c,0x54,0x00,0x6e,0xa0,0xbc,0xc1,0x32,0xc7,
+0xa0,0x06,0x70,0x00,0x61,0x1c,0x84,0x03,0x01,0x00,0x00,0x00,0x4e,0x00,0x00,0x00,
+0x76,0x52,0x4c,0xcc,0x73,0xd3,0x24,0x05,0x64,0xec,0xcd,0x8d,0xcc,0xe5,0x87,0x46,
+0xc6,0x50,0x8a,0x89,0x79,0xee,0xdb,0x54,0x8a,0x89,0x79,0xee,0xdd,0x1a,0x88,0x89,
+0x79,0x68,0x73,0x20,0x26,0xe6,0xa9,0xed,0x81,0x98,0x98,0xc7,0x36,0x0b,0x62,0x62,
+0x9e,0xdb,0x32,0x88,0x89,0x79,0x72,0xd3,0x20,0x26,0xe6,0xd9,0x8d,0x83,0x98,0x98,
+0xa7,0xb7,0x95,0x62,0x62,0x9e,0xbb,0x27,0x2d,0x20,0x63,0x6f,0x6e,0x64,0x2e,0x3a,
+0x34,0x35,0x56,0x62,0x08,0x4e,0x53,0xd9,0xba,0xb5,0x14,0x02,0xf3,0xe0,0xf5,0x25,
+0x2c,0x82,0xd3,0x0c,0xbe,0xe0,0x34,0xd3,0x8d,0x9b,0x88,0x21,0x38,0xcd,0x60,0xd7,
+0x24,0x01,0x63,0xec,0xcd,0x8d,0xcc,0x45,0x87,0x44,0x80,0x8c,0xbd,0xb9,0x91,0xb9,
+0xfc,0xc4,0xd0,0x90,0x02,0x8c,0xb1,0x37,0x37,0x32,0x97,0x1f,0x73,0x29,0x26,0xe6,
+0xc1,0x71,0x7b,0x29,0x26,0xe6,0xc1,0x77,0xfb,0x28,0x04,0xe6,0xa9,0x6f,0x52,0x01,
+0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x13,0x73,0x63,0x18,0x83,0xc0,0x3c,0xb6,0x41,0x08,
+0x4e,0x33,0x58,0x47,0x31,0x31,0x4f,0x5d,0x1f,0xc3,0x22,0x38,0xcd,0xe0,0x0b,0x4e,
+0x33,0xe1,0xbc,0xa5,0x18,0x82,0xd3,0x0c,0x77,0x6e,0x20,0xc5,0xc4,0x3c,0xb5,0x4e,
+0x3a,0x40,0xc6,0xde,0xdc,0xc8,0x5c,0x7e,0x64,0x70,0x2c,0xa4,0x98,0x98,0xa7,0xee,
+0x6f,0x20,0x11,0x9c,0x66,0xf0,0x05,0xa7,0x99,0xec,0x82,0x10,0x9c,0xa6,0x32,0x93,
+0x42,0x60,0x1e,0x7b,0xb7,0x98,0x62,0x62,0x9e,0xbc,0x36,0x16,0x43,0x70,0x9a,0x0a,
+0xa7,0x6d,0xa4,0x98,0x98,0xc7,0xbe,0x8d,0xa4,0x98,0x98,0xc7,0xce,0x0d,0xc6,0x10,
+0x9c,0x66,0xc0,0x7b,0x12,0x02,0x32,0xf6,0xe6,0x46,0xe6,0xa2,0x33,0x13,0x73,0x06,
+0x8b,0xe0,0x34,0x83,0x2f,0x38,0xcd,0x64,0xd3,0x07,0x50,0x10,0xcd,0x14,0x61,0xe6,
+0x61,0x08,0x4e,0x53,0xd5,0x36,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x4a,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x07,0x00,0x00,0x00,
+0x24,0xca,0x60,0x04,0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0xb9,0x61,0x0c,0x04,0x10,
+0x1e,0xe1,0x19,0xc6,0x40,0x02,0xe1,0x11,0x1e,0x00,0x00,0x00,0x63,0x08,0xcd,0x63,
+0x15,0xc1,0x31,0x84,0x06,0xb2,0x8b,0xe0,0x18,0x42,0x13,0x59,0x46,0x70,0x0c,0xa1,
+0x71,0x6c,0x23,0x38,0x16,0x02,0x04,0xc7,0x64,0x61,0x1a,0x37,0x16,0x01,0x04,0x48,
+0x35,0xc7,0x20,0x79,0xcf,0x58,0x04,0x10,0x20,0xd5,0x1c,0xc3,0x07,0x06,0xd0,0x58,
+0x04,0x10,0x20,0xd5,0x1c,0x43,0x18,0x88,0x41,0x34,0x16,0x01,0x04,0x48,0x35,0xc7,
+0x30,0x06,0x64,0xe0,0x98,0x47,0xd0,0xc0,0x80,0xa0,0x89,0x01,0x41,0x23,0x03,0x82,
+0x63,0x21,0x40,0x70,0x50,0x66,0x70,0x06,0x68,0x90,0x06,0x58,0x06,0xe1,0x40,0x00,
+0x25,0x00,0x00,0x00,0x56,0x52,0x4c,0xcc,0x73,0xd3,0x56,0x41,0x4c,0xcc,0x53,0xdb,
+0x05,0x31,0x31,0xcf,0x6d,0x19,0xc4,0xc4,0x3c,0xba,0x6d,0x10,0x13,0xf3,0xf4,0xd6,
+0x41,0x08,0xc0,0xb2,0x18,0x46,0x21,0x38,0x4d,0x85,0x9b,0x46,0x21,0x38,0x4d,0xb5,
+0x9b,0x8a,0x21,0x00,0xcb,0x82,0xdf,0x66,0x62,0x08,0x4e,0x53,0xdd,0xb7,0x9d,0x18,
+0x82,0xd3,0x54,0xb7,0x6e,0x28,0x86,0xe0,0x34,0xd5,0xdd,0xdb,0x47,0x31,0x31,0x4f,
+0x9d,0x9b,0x87,0x21,0x00,0xcb,0x52,0xdf,0x06,0x62,0x08,0xc0,0xb2,0xd4,0xbc,0x59,
+0x10,0x82,0xd3,0x54,0x96,0x62,0x08,0x4e,0x53,0xe1,0xb6,0x85,0x14,0x13,0xf3,0xd8,
+0xb4,0x8d,0x14,0x13,0xf3,0xd8,0xb9,0x89,0x18,0x02,0xb0,0x2c,0xf6,0x6d,0x24,0x86,
+0x00,0x2c,0x8b,0xcd,0x1b,0x87,0x21,0x38,0x4d,0x55,0xd3,0xd6,0x30,0x54,0xc0,0x72,
+0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x19,0x00,0x00,0x00,0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0x4a,0x60,0x04,0x80,0xc2,0x0c,0x00,0x00,0x00,0x00,0x00,0x63,0x08,0xcd,0x33,
+0x16,0x01,0x04,0x48,0x34,0xc7,0x00,0x49,0xcf,0x58,0x04,0x10,0x28,0xd1,0x1c,0xc3,
+0x44,0x39,0x58,0x85,0x03,0x01,0x00,0x00,0x0a,0x00,0x00,0x00,0x26,0x41,0x08,0xc0,
+0xb2,0x18,0x45,0x21,0x00,0xcb,0xb2,0x5b,0x04,0x31,0x31,0x8f,0x6d,0x13,0xc4,0xc4,
+0x3c,0xb9,0x35,0x0c,0x15,0xb0,0x58,0x05,0x31,0x31,0x4f,0x7f,0x00,0x05,0xd1,0x4c,
+0x11,0x06,0x00,0x00,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,0x1b,0x00,0x00,0x00,
+0x13,0x04,0x41,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,0x24,0xca,0x60,0x04,
+0xa0,0x04,0x8a,0x80,0xc2,0x0c,0x00,0x00,0x63,0x08,0xcd,0x33,0x16,0x01,0x04,0xca,
+0x34,0xc7,0x20,0x51,0xcf,0x1c,0x43,0x45,0x41,0x73,0x0c,0x16,0x15,0xcd,0x31,0x5c,
+0x94,0x83,0x58,0x38,0x10,0x00,0x00,0x00,0x0b,0x00,0x00,0x00,0x76,0x51,0x4c,0xcc,
+0x53,0xdb,0x86,0x51,0x4c,0xcc,0x53,0xe7,0x36,0x41,0x4c,0xcc,0x63,0x5b,0x05,0x31,
+0x31,0x8f,0x6e,0x16,0xc4,0xc4,0x3c,0xbd,0x51,0x10,0x02,0xb0,0x2c,0xd6,0x30,0x54,
+0xc0,0x72,0x00,0x05,0xd1,0x4c,0x11,0x06,0x00,0x00,0x00,0x00,0x61,0x20,0x00,0x00,
+0x2c,0x00,0x00,0x00,0x13,0x04,0x45,0x2c,0x10,0x00,0x00,0x00,0x03,0x00,0x00,0x00,
+0x24,0xca,0xa0,0x04,0x46,0x00,0x8a,0x80,0xc0,0x08,0x00,0x00,0x63,0x08,0x0d,0x34,
+0xdc,0x30,0x49,0xc4,0x2c,0x03,0x11,0x50,0x63,0x08,0xcd,0x33,0xdc,0x50,0x49,0xc4,
+0x2c,0x03,0x21,0x58,0x63,0x08,0x4d,0x34,0xdc,0x70,0x49,0xc4,0x2c,0x03,0x31,0x60,
+0x63,0x08,0x8d,0x33,0xdc,0x90,0x49,0x84,0x69,0x22,0x70,0xc3,0x27,0x1c,0x08,0x00,
+0x17,0x00,0x00,0x00,0x96,0x51,0x4c,0xcc,0x53,0xdf,0x66,0x41,0x08,0xcc,0x83,0xdb,
+0x04,0x31,0x31,0x4f,0x6d,0x15,0xc4,0xc4,0x3c,0xb7,0x61,0x10,0x02,0xf3,0xf0,0x76,
+0x41,0x4c,0xcc,0xb3,0x1f,0x81,0x11,0x11,0x13,0x15,0x35,0x37,0x90,0x2c,0x4e,0xf4,
+0x47,0x87,0x54,0xd7,0x17,0x70,0x2c,0x4e,0xf4,0x47,0x87,0x74,0x02,0xc8,0xe2,0x44,
+0x7f,0x74,0x48,0xb9,0x69,0x14,0x02,0xf3,0xd4,0xb8,0x6d,0x18,0x11,0x31,0x55,0xc0,
+0x62,0x0d,0x43,0x05,0x2c,0x07,0x50,0x10,0xcd,0x14,0x61,0x46,0x31,0x08,0xcc,0x03,
+0x00,0x00,0x00,0x00,0x71,0x20,0x00,0x00,0x12,0x00,0x00,0x00,0x66,0x40,0x54,0x82,
+0x23,0x19,0xc3,0xa0,0x20,0x8b,0x1d,0x18,0x4f,0x84,0x34,0x53,0x61,0x03,0xc4,0xe3,
+0x58,0x85,0x05,0x14,0xbe,0x34,0x45,0xb5,0x21,0x10,0x82,0x23,0x15,0x46,0x30,0x2c,
+0xc8,0x64,0x02,0x06,0xf0,0x3c,0x91,0x73,0x19,0x00,0xe1,0x4b,0x53,0x64,0x0a,0x84,
+0x84,0x34,0x85,0x25,0x0c,0x92,0x20,0x59,0xc1,0x20,0x30,0x8f,0x2d,0x10,0x95,0x84,
+0x34,0x00,0x00,0x00,0x00,0x00,0x00,0x00};
diff --git a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
index e64bfb1c6cb..3a2f2878a30 100644
--- a/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
+++ b/src/gallium/auxiliary/gallivm/gallivm_cpu.cpp
@@ -46,6 +46,7 @@
 #include "tgsi/tgsi_dump.h"
 
 #include "util/u_memory.h"
+#include "util/u_math.h"
 
 #include <llvm/Module.h>
 #include <llvm/CallingConv.h>
@@ -157,8 +158,8 @@ void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog
    llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
    llvm::ExecutionEngine *ee = cpu->engine;
    assert(ee);
-   /*FIXME : remove */
-   ee->DisableLazyCompilation();
+   /* FIXME: why was lazy compilation disabled? We need it for pow/sqrt/... */
+   ee->DisableLazyCompilation(false);
    ee->addModuleProvider(mp);
 
    llvm::Function *func = func_for_shader(prog);
@@ -201,7 +202,6 @@ int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
    unsigned int i, j;
    unsigned slot;
    vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
-
    assert(runner);
 
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
diff --git a/src/gallium/auxiliary/gallivm/instructions.cpp b/src/gallium/auxiliary/gallivm/instructions.cpp
index a82dc30306d..599975d5add 100644
--- a/src/gallium/auxiliary/gallivm/instructions.cpp
+++ b/src/gallium/auxiliary/gallivm/instructions.cpp
@@ -83,6 +83,7 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_llvmPow   = 0;
    m_llvmFloor = 0;
    m_llvmFlog  = 0;
+   m_llvmFexp  = 0;
    m_llvmLit  = 0;
    m_fmtPtr = 0;
 
@@ -92,194 +93,271 @@ Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicB
    m_mod = ParseBitcodeFile(buffer);
 }
 
+llvm::BasicBlock * Instructions::currentBlock() const
+{
+   return m_builder.GetInsertBlock();
+}
+
+llvm::Value * Instructions::abs(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   Value *xabs  = callFAbs(vec[0]);
+   Value *yabs  = callFAbs(vec[1]);
+   Value *zabs  = callFAbs(vec[2]);
+   Value *wabs  = callFAbs(vec[3]);
+   return vectorFromVals(xabs, yabs, zabs, wabs);
+}
+
 llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
 {
    return m_builder.CreateAdd(in1, in2, name("add"));
 }
 
-llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
+llvm::Value * Instructions::arl(llvm::Value *in)
 {
-   Value *mulRes = mul(in1, in2);
-   return add(mulRes, in3);
+   return floor(in);
 }
- 
-llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
+
+void Instructions::beginLoop()
 {
-   return m_builder.CreateMul(in1, in2, name("mul"));
+   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
+   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+
+   m_builder.CreateBr(begin);
+   Loop loop;
+   loop.begin = begin;
+   loop.end   = end;
+   m_builder.SetInsertPoint(begin);
+   m_loopStack.push(loop);
 }
 
-const char * Instructions::name(const char *prefix)
+void Instructions::bgnSub(unsigned label)
 {
-   ++m_idx;
-   snprintf(m_name, 32, "%s%d", prefix, m_idx);
-   return m_name;
+   llvm::Function *func = findFunction(label);
+
+   Function::arg_iterator args = func->arg_begin();
+   Value *ptr_INPUT = args++;
+   ptr_INPUT->setName("INPUT");
+   m_storage->pushArguments(ptr_INPUT);
+
+   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
+
+   m_func = func;
+   m_builder.SetInsertPoint(entry);
 }
 
-llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+void Instructions::brk()
 {
-   Value *mulRes = mul(in1, in2);
-   Value *x = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(0),
-                                                          name("extractx"));
-   Value *y = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(1),
-                                                          name("extracty"));
-   Value *z = m_builder.CreateExtractElement(mulRes,
-                                                          m_storage->constantInt(2),
-                                                          name("extractz"));
-   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
-   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
-   return vectorFromVals(dot3, dot3, dot3, dot3);
+   assert(!m_loopStack.empty());
+   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
+   m_builder.CreateBr(m_loopStack.top().end);
+   m_builder.SetInsertPoint(unr);
 }
 
-llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+void Instructions::cal(int label, llvm::Value *input)
 {
-   if (!m_llvmFSqrt) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fsqrtArgs;
-      fsqrtArgs.push_back(Type::FloatTy);
-      PAListPtr fsqrtPal;
-      FunctionType* fsqrtType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fsqrtArgs,
-         /*isVarArg=*/false);
-      m_llvmFSqrt = Function::Create(
-         /*Type=*/fsqrtType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.sqrt.f32", m_mod);
-      m_llvmFSqrt->setCallingConv(CallingConv::C);
-      m_llvmFSqrt->setParamAttrs(fsqrtPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
-                                         name("sqrt"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   std::vector<Value*> params;
+   params.push_back(input);
+   llvm::Function *func = findFunction(label);
+
+   m_builder.CreateCall(func, params.begin(), params.end());
 }
 
-llvm::Value * Instructions::rsq(llvm::Value *in1)
+llvm::Value * Instructions::ceil(llvm::Value *in)
 {
-   Value *x = m_builder.CreateExtractElement(in1,
-                                             m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *abs  = callFAbs(x);
-   Value *sqrt = callFSqrt(abs);
-
-   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                       sqrt,
-                                       name("rsqrt"));
-   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callCeil(vec[0]), callCeil(vec[1]),
+                         callCeil(vec[2]), callCeil(vec[3]));
 }
 
-llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
-                                           llvm::Value *z, llvm::Value *w)
+llvm::Value * Instructions::clamp(llvm::Value *in1)
 {
-   Constant *const_vec = Constant::getNullValue(m_floatVecType);
-   Value *res = m_builder.CreateInsertElement(const_vec, x,
-                                              m_storage->constantInt(0),
-                                              name("vecx"));
-   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
-                               name("vecxy"));
-   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
-                               name("vecxyz"));
-   if (w)
-      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
-                                          name("vecxyzw"));
-   return res;
+   llvm::Value *zero = constVector(0.0f, 0.0f, 0.0f, 0.0f);
+   llvm::Value *one = constVector(1.0f, 1.0f, 1.0f, 1.0f);
+   return min( max(zero, in1), one);
 }
 
-llvm::Value *Instructions::callFAbs(llvm::Value *val)
+llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmFAbs) {
-      // predeclare the intrinsic
-      std::vector<const Type*> fabsArgs;
-      fabsArgs.push_back(Type::FloatTy);
-      PAListPtr fabsPal;
-      FunctionType* fabsType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/fabsArgs,
-         /*isVarArg=*/false);
-      m_llvmFAbs = Function::Create(
-         /*Type=*/fabsType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"fabs", m_mod);
-      m_llvmFAbs->setCallingConv(CallingConv::C);
-      m_llvmFAbs->setParamAttrs(fabsPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
-                                         name("fabs"));
-   call->setCallingConv(CallingConv::C);
+   llvm::Function *func = m_mod->getFunction("cmp");
+   assert(func);
+
+   std::vector<Value*> params;
+   params.push_back(in1);
+   params.push_back(in2);
+   params.push_back(in3);
+   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
    call->setTailCall(false);
    return call;
 }
 
-llvm::Value * Instructions::lit(llvm::Value *in)
+llvm::Value * Instructions::cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   if (!m_llvmLit) {
-      m_llvmLit = m_mod->getFunction("lit");
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *half = ConstantFP::get(APFloat(0.5f));
+
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], half, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], half, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], half, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], half, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
-   return res;
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+   Constant *zero = Constant::getNullValue(Type::FloatTy);
+
+   Value *xcmp  = m_builder.CreateFCmpOGE(vec1[0], zero, name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec2[0], vec3[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGE(vec1[1], zero, name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec2[1], vec3[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGE(vec1[2], zero, name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec2[2], vec3[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGE(vec1[3], zero, name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec2[3], vec3[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+llvm::Value * Instructions::cos(llvm::Value *in)
 {
-   if (!m_llvmPow) {
-      // predeclare the intrinsic
-      std::vector<const Type*> powArgs;
-      powArgs.push_back(Type::FloatTy);
-      powArgs.push_back(Type::FloatTy);
-      PAListPtr powPal;
-      FunctionType* powType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/powArgs,
-         /*isVarArg=*/false);
-      m_llvmPow = Function::Create(
-         /*Type=*/powType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"llvm.pow.f32", m_mod);
-      m_llvmPow->setCallingConv(CallingConv::C);
-      m_llvmPow->setParamAttrs(powPal);
-   }
-   std::vector<Value*> params;
-   params.push_back(val1);
-   params.push_back(val2);
-   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
-                                         name("pow"));
-   call->setCallingConv(CallingConv::C);
+#if 0
+   llvm::Function *func = m_mod->getFunction("vcos");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
    call->setTailCall(false);
    return call;
+#else
+   std::vector<llvm::Value*> elems = extractVector(in);
+   Function *func = m_mod->getFunction("cosf");
+   assert(func);
+   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
+   cos->setCallingConv(CallingConv::C);
+   cos->setTailCall(true);
+   return vectorFromVals(cos, cos, cos, cos);
+#endif
 }
 
-llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
 {
    Value *x1 = m_builder.CreateExtractElement(in1,
                                               m_storage->constantInt(0),
                                               name("x1"));
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(2),
+                                              name("z1"));
+
    Value *x2 = m_builder.CreateExtractElement(in2,
                                               m_storage->constantInt(0),
                                               name("x2"));
-   llvm::Value *val = callPow(x1, x2);
-   return vectorFromVals(val, val, val, val);
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *z2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(2),
+                                              name("z2"));
+   Value *y1z2 = mul(y1, z2);
+   Value *z1y2 = mul(z1, y2);
+
+   Value *z1x2 = mul(z1, x2);
+   Value *x1z2 = mul(x1, z2);
+
+   Value *x1y2 = mul(x1, y2);
+   Value *y1x2 = mul(y1, x2);
+
+   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
 }
 
-llvm::Value * Instructions::rcp(llvm::Value *in1)
+llvm::Value * Instructions::ddx(llvm::Value *in)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
-                                     x1, name("rcp"));
-   return vectorFromVals(res, res, res, res);
+   assert(0); // FIXME: not implemented yet
+   return 0;  // with NDEBUG the assert vanishes; never fall off the end of a non-void function
+}
+
+llvm::Value * Instructions::ddy(llvm::Value *in)
+{
+   assert(0); // FIXME: not implemented yet
+   return 0;  // with NDEBUG the assert vanishes; never fall off the end of a non-void function
+}
+
+llvm::Value * Instructions::div(llvm::Value *in1, llvm::Value *in2)
+{
+   return m_builder.CreateFDiv(in1, in2, name("div"));
+}
+
+// DP2A: in1.x*in2.x + in1.y*in2.y + in3.x, replicated into all four lanes.
+llvm::Value * Instructions::dot2add(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                             m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes, m_storage->constantInt(1),
+                                             name("extracty"));
+   // The scalar addend is lane 0 (x) of in3, not lane 2 (z).
+   Value *addend = m_builder.CreateExtractElement(in3, m_storage->constantInt(0),
+                                                  name("addend"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot2add = m_builder.CreateAdd(xy, addend, name("dot2add"));
+   return vectorFromVals(dot2add, dot2add, dot2add, dot2add);
+}
+
+llvm::Value * Instructions::dp2(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(0),
+                                                          name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(1),
+                                                          name("extracty"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   return vectorFromVals(xy, xy, xy, xy);
+}
+
+llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(0),
+                                                          name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(1),
+                                                          name("extracty"));
+   Value *z = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(2),
+                                                          name("extractz"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
+   return vectorFromVals(dot3, dot3, dot3, dot3);
 }
 
 llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2)
@@ -321,6 +399,53 @@ llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
                          ry, z, w);
 }
 
+void Instructions::elseop()
+{
+   assert(!m_ifStack.empty());
+   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0); // new block that endif() will resume in
+   m_builder.CreateBr(ifend); // terminate the block being built with a jump to ifend
+   m_builder.SetInsertPoint(m_ifStack.top()); // continue in the block ifop() pushed (its false branch target)
+   currentBlock()->setName(name("ifelse"));
+   m_ifStack.pop();
+   m_ifStack.push(ifend); // replace the stack entry so endif() branches to ifend
+}
+
+void Instructions::endif()
+{
+   assert(!m_ifStack.empty());
+   m_builder.CreateBr(m_ifStack.top()); // terminate the current block with a jump to the pending block
+   m_builder.SetInsertPoint(m_ifStack.top()); // subsequent instructions are emitted into that block
+   m_ifStack.pop();
+}
+
+void Instructions::endLoop()
+{
+   assert(!m_loopStack.empty());
+   Loop loop = m_loopStack.top();
+   m_builder.CreateBr(loop.begin); // back edge to the block beginLoop() created
+   loop.end->moveAfter(currentBlock()); // position the end block right after the block just closed
+   m_builder.SetInsertPoint(loop.end); // code following the loop is emitted into the end block
+   m_loopStack.pop();
+}
+
+void Instructions::end()
+{
+   m_builder.CreateRetVoid();
+}
+
+void Instructions::endSub()
+{
+   m_func = 0;
+   m_builder.SetInsertPoint(0);
+}
+
+llvm::Value * Instructions::exp(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callFExp(vec[0]), callFExp(vec[1]),
+                             callFExp(vec[2]), callFExp(vec[3]));
+}
+
 llvm::Value * Instructions::ex2(llvm::Value *in)
 {
    llvm::Value *val = callPow(ConstantFP::get(APFloat(2.f)),
@@ -330,31 +455,6 @@ llvm::Value * Instructions::ex2(llvm::Value *in)
    return vectorFromVals(val, val, val, val);
 }
 
-llvm::Value * Instructions::callFloor(llvm::Value *val)
-{
-   if (!m_llvmFloor) {
-      // predeclare the intrinsic
-      std::vector<const Type*> floorArgs;
-      floorArgs.push_back(Type::FloatTy);
-      PAListPtr floorPal;
-      FunctionType* floorType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/floorArgs,
-         /*isVarArg=*/false);
-      m_llvmFloor = Function::Create(
-         /*Type=*/floorType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"floorf", m_mod);
-      m_llvmFloor->setCallingConv(CallingConv::C);
-      m_llvmFloor->setParamAttrs(floorPal);
-   }
-   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
-                                          name("floorf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(false);
-   return call;
-}
-
 llvm::Value * Instructions::floor(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -362,42 +462,52 @@ llvm::Value * Instructions::floor(llvm::Value *in)
                          callFloor(vec[2]), callFloor(vec[3]));
 }
 
-llvm::Value * Instructions::arl(llvm::Value *in)
-{
-   return floor(in);
-}
-
 llvm::Value * Instructions::frc(llvm::Value *in)
 {
    llvm::Value *flr = floor(in);
    return sub(in, flr);
 }
 
-llvm::Value * Instructions::callFLog(llvm::Value *val)
+void Instructions::ifop(llvm::Value *in)
 {
-   if (!m_llvmFlog) {
-      // predeclare the intrinsic
-      std::vector<const Type*> flogArgs;
-      flogArgs.push_back(Type::FloatTy);
-      PAListPtr flogPal;
-      FunctionType* flogType = FunctionType::get(
-         /*Result=*/Type::FloatTy,
-         /*Params=*/flogArgs,
-         /*isVarArg=*/false);
-      m_llvmFlog = Function::Create(
-         /*Type=*/flogType,
-         /*Linkage=*/GlobalValue::ExternalLinkage,
-         /*Name=*/"logf", m_mod);
-      m_llvmFlog->setCallingConv(CallingConv::C);
-      m_llvmFlog->setParamAttrs(flogPal);
-   }
-   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
-                                         name("logf"));
-   call->setCallingConv(CallingConv::C);
+   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
+   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+
+   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
+   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
+   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+
+   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+
+   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
+   m_builder.CreateCondBr(xcmp, ifthen, ifend);
+   //m_builder.SetInsertPoint(yblock);
+
+   m_builder.SetInsertPoint(ifthen);
+   m_ifStack.push(ifend);
+}
+
+llvm::Value * Instructions::kil(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("kil");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
    call->setTailCall(false);
    return call;
 }
 
+llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   llvm::Value *m = mul(in1, in2); // in1 * in2
+   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
+   llvm::Value *s = sub(vec1, in1); // vec1 - in1, presumably 1 - in1 per component
+   return add(m, mul(s, in3)); // in1 * in2 + (vec1 - in1) * in3
+}
+
 llvm::Value * Instructions::lg2(llvm::Value *in)
 {
    std::vector<llvm::Value*> vec = extractVector(in);
@@ -407,142 +517,176 @@ llvm::Value * Instructions::lg2(llvm::Value *in)
                              callFLog(vec[2]), callFLog(vec[3])), const_vec);
 }
 
-llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::lit(llvm::Value *in)
+{
+   if (!m_llvmLit) {
+      m_llvmLit = m_mod->getFunction("lit");
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::log(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callFLog(vec[0]), callFLog(vec[1]),
+                             callFLog(vec[2]), callFLog(vec[3]));
+}
+
+llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   return add(mulRes, in3);
+}
+
+llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
+                                          name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
+                                          name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
+                                          name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
+                                          name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
 {
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
-                                          name("xcmp"));
+   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
    Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
                                         name("selx"));
 
-   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
-                                          name("ycmp"));
+   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
    Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
                                         name("sely"));
 
-   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
-                                          name("zcmp"));
+   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
    Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
                                         name("selz"));
 
-   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
-                                          name("wcmp"));
+   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
    Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
                                         name("selw"));
 
    return vectorFromVals(selx, sely, selz, selw);
 }
 
-void Instructions::printVector(llvm::Value *val)
+llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
 {
-   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+   return m_builder.CreateMul(in1, in2, name("mul"));
+}
 
-   if (!m_fmtPtr) {
-      Constant *format = ConstantArray::get(frmt, true);
-      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
-      GlobalVariable* globalFormat = new GlobalVariable(
-         /*Type=*/arrayTy,
-         /*isConstant=*/true,
-         /*Linkage=*/GlobalValue::InternalLinkage,
-         /*Initializer=*/0, // has initializer, specified below
-         /*Name=*/name(".str"),
-         m_mod);
-      globalFormat->setInitializer(format);
+llvm::Value * Instructions::neg(llvm::Value *in)
+{
+   Value *neg = m_builder.CreateNeg(in, name("neg"));
+   return neg;
+}
 
-      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
-      std::vector<Constant*> const_ptr_21_indices;
-      const_ptr_21_indices.push_back(const_int0);
-      const_ptr_21_indices.push_back(const_int0);
-      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
-                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
-   }
+// NRM: scale `in` by 1/sqrt(dot3(in, in)); rsq(in) alone would only use in.x.
+llvm::Value * Instructions::nrm(llvm::Value *in)
+{
+   return mul(rsq(dp3(in, in)), in);
+}
 
-   Function *func_printf = m_mod->getFunction("printf");
-   if (!func_printf)
-      func_printf = declarePrintf();
-   assert(func_printf);
-   std::vector<llvm::Value*> vec = extractVector(val);
-   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
-   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
-   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
-   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
-   std::vector<Value*> params;
-   params.push_back(m_fmtPtr);
-   params.push_back(dx);
-   params.push_back(dy);
-   params.push_back(dz);
-   params.push_back(dw);
-   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
-                                         name("printf"));
-   call->setCallingConv(CallingConv::C);
-   call->setTailCall(true);
+llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *x2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(0),
+                                              name("x2"));
+   llvm::Value *val = callPow(x1, x2);
+   return vectorFromVals(val, val, val, val);
 }
 
-llvm::Function * Instructions::declarePrintf()
+llvm::Value * Instructions::rcp(llvm::Value *in1)
 {
-   std::vector<const Type*> args;
-   PAListPtr params;
-   FunctionType* funcTy = FunctionType::get(
-      /*Result=*/IntegerType::get(32),
-      /*Params=*/args,
-      /*isVarArg=*/true);
-   Function* func_printf = Function::Create(
-      /*Type=*/funcTy,
-      /*Linkage=*/GlobalValue::ExternalLinkage,
-      /*Name=*/"printf", m_mod);
-   func_printf->setCallingConv(CallingConv::C);
-   func_printf->setParamAttrs(params);
-   return func_printf;
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *res = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                     x1, name("rcp"));
+   return vectorFromVals(res, res, res, res);
+}
+
+llvm::Value * Instructions::rsq(llvm::Value *in1)
+{
+   Value *x = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *abs  = callFAbs(x);
+   Value *sqrt = callFSqrt(abs);
+
+   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(APFloat(1.f)),
+                                       sqrt,
+                                       name("rsqrt"));
+   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
 }
 
+llvm::Value * Instructions::scs(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("scs");
+   assert(func);
 
-llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
+   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::seq(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
    Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
-   Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
+
+   Value *xcmp = m_builder.CreateFCmpOEQ(vec1[0], vec2[0], name("xcmp"));
    Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp = m_builder.CreateFCmpOEQ(vec1[1], vec2[1], name("ycmp"));
    Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp = m_builder.CreateFCmpOEQ(vec1[2], vec2[2], name("zcmp"));
    Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-   Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp = m_builder.CreateFCmpOEQ(vec1[3], vec2[3], name("wcmp"));
    Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
    return vectorFromVals(x, y, z, w);
 }
+
+llvm::Value * Instructions::sfl(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   return vectorFromVals(const0f, const0f, const0f, const0f); // null float in every slot; in1 and in2 are not read
+}
+
 llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
@@ -566,157 +710,118 @@ llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
    return vectorFromVals(x, y, z, w);
 }
 
-
-llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
 {
    Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
    Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
    std::vector<llvm::Value*> vec1 = extractVector(in1);
    std::vector<llvm::Value*> vec2 = extractVector(in2);
-
-   Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
    Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
    Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
    Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-   Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
    Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
    return vectorFromVals(x, y, z, w);
 }
 
-llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
+llvm::Value * Instructions::sin(llvm::Value *in)
 {
-   Value *x1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(0),
-                                              name("x1"));
-   Value *y1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(1),
-                                              name("y1"));
-   Value *z1 = m_builder.CreateExtractElement(in1,
-                                              m_storage->constantInt(2),
-                                              name("z1"));
+   llvm::Function *func = m_mod->getFunction("vsin");
+   assert(func);
 
-   Value *x2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(0),
-                                              name("x2"));
-   Value *y2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(1),
-                                              name("y2"));
-   Value *z2 = m_builder.CreateExtractElement(in2,
-                                              m_storage->constantInt(2),
-                                              name("z2"));
-   Value *y1z2 = mul(y1, z2);
-   Value *z1y2 = mul(z1, y2);
+   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
+   call->setTailCall(false);
+   return call;
+}
 
-   Value *z1x2 = mul(z1, x2);
-   Value *x1z2 = mul(x1, z2);
+llvm::Value * Instructions::sle(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-   Value *x1y2 = mul(x1, y2);
-   Value *y1x2 = mul(y1, x2);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
-}
+   Value *xcmp = m_builder.CreateFCmpOLE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
+   Value *ycmp = m_builder.CreateFCmpOLE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-llvm::Value * Instructions::abs(llvm::Value *in)
-{
-   std::vector<llvm::Value*> vec = extractVector(in);
-   Value *xabs  = callFAbs(vec[0]);
-   Value *yabs  = callFAbs(vec[1]);
-   Value *zabs  = callFAbs(vec[2]);
-   Value *wabs  = callFAbs(vec[3]);
-   return vectorFromVals(xabs, yabs, zabs, wabs);
+   Value *zcmp = m_builder.CreateFCmpOLE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOLE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
-void Instructions::ifop(llvm::Value *in)
+llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
 {
-   BasicBlock *ifthen = BasicBlock::Create(name("ifthen"), m_func,0);
-   BasicBlock *ifend = BasicBlock::Create(name("ifthenend"), m_func,0);
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
-   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
-   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+   Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
-                                             name("extractx"));
-   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
-   m_builder.CreateCondBr(xcmp, ifthen, ifend);
-   //m_builder.SetInsertPoint(yblock);
+   Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
 
-   m_builder.SetInsertPoint(ifthen);
-   m_ifStack.push(ifend);
-}
+   Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
 
-llvm::BasicBlock * Instructions::currentBlock() const
-{
-   return m_builder.GetInsertBlock();
-}
+   Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
 
-void Instructions::elseop()
-{
-   assert(!m_ifStack.empty());
-   BasicBlock *ifend = BasicBlock::Create(name("ifend"), m_func,0);
-   m_builder.CreateBr(ifend);
-   m_builder.SetInsertPoint(m_ifStack.top());
-   currentBlock()->setName(name("ifelse"));
-   m_ifStack.pop();
-   m_ifStack.push(ifend);
+   return vectorFromVals(x, y, z, w);
 }
 
-void Instructions::endif()
+llvm::Value * Instructions::sne(llvm::Value *in1, llvm::Value *in2)
 {
-   assert(!m_ifStack.empty());
-   m_builder.CreateBr(m_ifStack.top());
-   m_builder.SetInsertPoint(m_ifStack.top());
-   m_ifStack.pop();
-}
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
 
-llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
-                                 llvm::Value *in3)
-{
-   llvm::Value *m = mul(in1, in2);
-   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
-   llvm::Value *s = sub(vec1, in1);
-   return add(m, mul(s, in3));
-}
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
 
-void Instructions::beginLoop()
-{
-   BasicBlock *begin = BasicBlock::Create(name("loop"), m_func,0);
-   BasicBlock *end = BasicBlock::Create(name("endloop"), m_func,0);
+   Value *xcmp = m_builder.CreateFCmpUNE(vec1[0], vec2[0], name("xcmp")); // UNE: like C's !=, true when an operand is NaN
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
 
-   m_builder.CreateBr(begin);
-   Loop loop;
-   loop.begin = begin;
-   loop.end   = end;
-   m_builder.SetInsertPoint(begin);
-   m_loopStack.push(loop);
+   Value *ycmp = m_builder.CreateFCmpUNE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpUNE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpUNE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
 }
 
-void Instructions::endLoop()
+llvm::Value * Instructions::str(llvm::Value *in1, llvm::Value *in2)
 {
-   assert(!m_loopStack.empty());
-   Loop loop = m_loopStack.top();
-   m_builder.CreateBr(loop.begin);
-   loop.end->moveAfter(currentBlock());
-   m_builder.SetInsertPoint(loop.end);
-   m_loopStack.pop();
+   Constant *const1f = ConstantFP::get(APFloat(1.000000e+00f));
+
+   return vectorFromVals(const1f, const1f, const1f, const1f);
 }
 
-void Instructions::brk()
+llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
 {
-   assert(!m_loopStack.empty());
-   BasicBlock *unr = BasicBlock::Create(name("unreachable"), m_func,0);
-   m_builder.CreateBr(m_loopStack.top().end);
-   m_builder.SetInsertPoint(unr);
+   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
+   return res;
 }
 
 llvm::Value * Instructions::trunc(llvm::Value *in)
@@ -741,18 +846,298 @@ llvm::Value * Instructions::trunc(llvm::Value *in)
    return vectorFromVals(fx, fy, fz, fw);
 }
 
-void Instructions::end()
+llvm::Value * Instructions::x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
 {
-   m_builder.CreateRetVoid();
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   std::vector<llvm::Value*> vec3 = extractVector(in3);
+
+   Value *x2x3 = m_builder.CreateMul( vec2[0], vec3[0], name("x2x3"));
+   Value *y2y3 = m_builder.CreateMul( vec2[1], vec3[1], name("y2y3"));
+   Value *x1px2x3 = m_builder.CreateAdd (vec1[0], x2x3, name("x1 + x2x3"));
+   Value *x1px2x3py2y3 = m_builder.CreateAdd (x1px2x3, y2y3, name("x1 + x2x3 + y2y3"));
+
+   Value *x2z3 = m_builder.CreateMul( vec2[0], vec3[2], name("x2z3"));
+   Value *y2w3 = m_builder.CreateMul( vec2[1], vec3[3], name("y2w3"));
+   Value *y1px2z3 = m_builder.CreateAdd (vec1[1], x2z3, name("y1 + x2z3"));
+   Value *y1px2z3py2w3 = m_builder.CreateAdd (y1px2z3, y2w3, name("y1 + x2z3 + y2w3"));
+
+   return vectorFromVals(x1px2x3py2y3, y1px2z3py2w3, x1px2x3py2y3, y1px2z3py2w3);
 }
 
-void Instructions::cal(int label, llvm::Value *input)
+void Instructions::printVector(llvm::Value *val)
 {
+   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+
+   if (!m_fmtPtr) {
+      Constant *format = ConstantArray::get(frmt, true);
+      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
+      GlobalVariable* globalFormat = new GlobalVariable(
+         /*Type=*/arrayTy,
+         /*isConstant=*/true,
+         /*Linkage=*/GlobalValue::InternalLinkage,
+         /*Initializer=*/0, // has initializer, specified below
+         /*Name=*/name(".str"),
+         m_mod);
+      globalFormat->setInitializer(format);
+
+      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
+      std::vector<Constant*> const_ptr_21_indices;
+      const_ptr_21_indices.push_back(const_int0);
+      const_ptr_21_indices.push_back(const_int0);
+      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
+                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
+   }
+
+   Function *func_printf = m_mod->getFunction("printf");
+   if (!func_printf)
+      func_printf = declarePrintf();
+   assert(func_printf);
+   std::vector<llvm::Value*> vec = extractVector(val);
+   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
+   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
+   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
+   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
    std::vector<Value*> params;
-   params.push_back(input);
-   llvm::Function *func = findFunction(label);
+   params.push_back(m_fmtPtr);
+   params.push_back(dx);
+   params.push_back(dy);
+   params.push_back(dz);
+   params.push_back(dw);
+   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
+                                         name("printf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(true);
+}
 
-   m_builder.CreateCall(func, params.begin(), params.end());
+const char * Instructions::name(const char *prefix)
+{  // Formats "<prefix><counter>" into the shared m_name buffer; each call overwrites the previous result.
+   ++m_idx;
+   snprintf(m_name, sizeof(m_name), "%s%d", prefix, m_idx);
+   return m_name;
+}
+
+llvm::Value * Instructions::callCeil(llvm::Value *val)
+{
+   if (!m_llvmCeil) {
+      // predeclare the intrinsic
+      std::vector<const Type*> ceilArgs;
+      ceilArgs.push_back(Type::FloatTy);
+      AttrListPtr ceilPal;
+      FunctionType* ceilType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/ceilArgs,
+         /*isVarArg=*/false);
+      m_llvmCeil = Function::Create(
+         /*Type=*/ceilType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"ceilf", m_mod);
+      m_llvmCeil->setCallingConv(CallingConv::C);
+      m_llvmCeil->setAttributes(ceilPal);
+   }
+   CallInst *call =  m_builder.CreateCall(m_llvmCeil, val,
+                                          name("ceilf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value *Instructions::callFAbs(llvm::Value *val)
+{  // Emits a call computing the absolute value of one float; returns the call instruction.
+   if (!m_llvmFAbs) {
+      // declare libm's float(float) fabsf once; "fabs" is the double(double) variant
+      std::vector<const Type*> fabsArgs;
+      fabsArgs.push_back(Type::FloatTy);
+      AttrListPtr fabsPal;
+      FunctionType* fabsType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fabsArgs,
+         /*isVarArg=*/false);
+      m_llvmFAbs = Function::Create(
+         /*Type=*/fabsType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"fabsf", m_mod);
+      m_llvmFAbs->setCallingConv(CallingConv::C);
+      m_llvmFAbs->setAttributes(fabsPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
+                                         name("fabs"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::callFExp(llvm::Value *val)
+{
+   if (!m_llvmFexp) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fexpArgs;
+      fexpArgs.push_back(Type::FloatTy);
+      AttrListPtr fexpPal;
+      FunctionType* fexpType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fexpArgs,
+         /*isVarArg=*/false);
+      m_llvmFexp = Function::Create(
+         /*Type=*/fexpType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"expf", m_mod);
+      m_llvmFexp->setCallingConv(CallingConv::C);
+      m_llvmFexp->setAttributes(fexpPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFexp, val,
+                                         name("expf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::callFLog(llvm::Value *val)
+{
+   if (!m_llvmFlog) {
+      // predeclare the intrinsic
+      std::vector<const Type*> flogArgs;
+      flogArgs.push_back(Type::FloatTy);
+      AttrListPtr flogPal;
+      FunctionType* flogType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/flogArgs,
+         /*isVarArg=*/false);
+      m_llvmFlog = Function::Create(
+         /*Type=*/flogType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"logf", m_mod);
+      m_llvmFlog->setCallingConv(CallingConv::C);
+      m_llvmFlog->setAttributes(flogPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
+                                         name("logf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::callFloor(llvm::Value *val)
+{
+   if (!m_llvmFloor) {
+      // predeclare the intrinsic
+      std::vector<const Type*> floorArgs;
+      floorArgs.push_back(Type::FloatTy);
+      AttrListPtr floorPal;
+      FunctionType* floorType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/floorArgs,
+         /*isVarArg=*/false);
+      m_llvmFloor = Function::Create(
+         /*Type=*/floorType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"floorf", m_mod);
+      m_llvmFloor->setCallingConv(CallingConv::C);
+      m_llvmFloor->setAttributes(floorPal);
+   }
+   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
+                                          name("floorf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+{
+   if (!m_llvmFSqrt) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fsqrtArgs;
+      fsqrtArgs.push_back(Type::FloatTy);
+      AttrListPtr fsqrtPal;
+      FunctionType* fsqrtType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fsqrtArgs,
+         /*isVarArg=*/false);
+      m_llvmFSqrt = Function::Create(
+         /*Type=*/fsqrtType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.sqrt.f32", m_mod);
+      m_llvmFSqrt->setCallingConv(CallingConv::C);
+      m_llvmFSqrt->setAttributes(fsqrtPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
+                                         name("sqrt"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+{
+   if (!m_llvmPow) {
+      // predeclare the intrinsic
+      std::vector<const Type*> powArgs;
+      powArgs.push_back(Type::FloatTy);
+      powArgs.push_back(Type::FloatTy);
+      AttrListPtr powPal;
+      FunctionType* powType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/powArgs,
+         /*isVarArg=*/false);
+      m_llvmPow = Function::Create(
+         /*Type=*/powType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.pow.f32", m_mod);
+      m_llvmPow->setCallingConv(CallingConv::C);
+      m_llvmPow->setAttributes(powPal);
+   }
+   std::vector<Value*> params;
+   params.push_back(val1);
+   params.push_back(val2);
+   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
+                                         name("pow"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
+                                           llvm::Value *z, llvm::Value *w)
+{
+   Constant *const_vec = Constant::getNullValue(m_floatVecType);
+   Value *res = m_builder.CreateInsertElement(const_vec, x,
+                                              m_storage->constantInt(0),
+                                              name("vecx"));
+   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
+                               name("vecxy"));
+   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
+                               name("vecxyz"));
+   if (w)
+      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
+                                          name("vecxyzw"));
+   return res;
+}
+
+llvm::Value * Instructions::constVector(float x, float y, float z, float w)
+{
+   std::vector<Constant*> vec(4);
+   vec[0] = ConstantFP::get(APFloat(x));
+   vec[1] = ConstantFP::get(APFloat(y));
+   vec[2] = ConstantFP::get(APFloat(z));
+   vec[3] = ConstantFP::get(APFloat(w));
+   return ConstantVector::get(m_floatVecType, vec);
+}
+
+llvm::Function * Instructions::declarePrintf()
+{
+   std::vector<const Type*> args;
+   AttrListPtr params;
+   FunctionType* funcTy = FunctionType::get(
+      /*Result=*/IntegerType::get(32),
+      /*Params=*/args,
+      /*isVarArg=*/true);
+   Function* func_printf = Function::Create(
+      /*Type=*/funcTy,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Name=*/"printf", m_mod);
+   func_printf->setCallingConv(CallingConv::C);
+   func_printf->setAttributes(params);
+   return func_printf;
 }
 
 llvm::Function * Instructions::declareFunc(int label)
@@ -763,7 +1148,7 @@ llvm::Function * Instructions::declareFunc(int label)
    args.push_back(vecPtr);
    args.push_back(vecPtr);
    args.push_back(vecPtr);
-   PAListPtr params;
+   AttrListPtr params;
    FunctionType *funcType = FunctionType::get(
       /*Result=*/Type::VoidTy,
       /*Params=*/args,
@@ -774,31 +1159,10 @@ llvm::Function * Instructions::declareFunc(int label)
       /*Linkage=*/GlobalValue::ExternalLinkage,
       /*Name=*/name.c_str(), m_mod);
    func->setCallingConv(CallingConv::C);
-   func->setParamAttrs(params);
+   func->setAttributes(params);
    return func;
 }
 
-void Instructions::bgnSub(unsigned label)
-{
-   llvm::Function *func = findFunction(label);
-
-   Function::arg_iterator args = func->arg_begin();
-   Value *ptr_INPUT = args++;
-   ptr_INPUT->setName("INPUT");
-   m_storage->pushArguments(ptr_INPUT);
-
-   llvm::BasicBlock *entry = BasicBlock::Create("entry", func, 0);
-
-   m_func = func;
-   m_builder.SetInsertPoint(entry);
-}
-
-void Instructions::endSub()
-{
-   m_func = 0;
-   m_builder.SetInsertPoint(0);
-}
-
 llvm::Function * Instructions::findFunction(int label)
 {
    llvm::Function *func = m_functions[label];
@@ -809,17 +1173,6 @@ llvm::Function * Instructions::findFunction(int label)
    return func;
 }
 
-llvm::Value * Instructions::constVector(float x, float y, float z, float w)
-{
-   std::vector<Constant*> vec(4);
-   vec[0] = ConstantFP::get(APFloat(x));
-   vec[1] = ConstantFP::get(APFloat(y));
-   vec[2] = ConstantFP::get(APFloat(z));
-   vec[3] = ConstantFP::get(APFloat(w));
-   return ConstantVector::get(m_floatVecType, vec);
-}
-
-
 std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
 {
    std::vector<llvm::Value*> elems(4);
@@ -834,69 +1187,7 @@ std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
    return elems;
 }
 
-llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
-{
-   llvm::Function *func = m_mod->getFunction("cmp");
-   assert(func);
-
-   std::vector<Value*> params;
-   params.push_back(in1);
-   params.push_back(in2);
-   params.push_back(in3);
-   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::cos(llvm::Value *in)
-{
-#if 0
-   llvm::Function *func = m_mod->getFunction("vcos");
-   assert(func);
 
-   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
-   call->setTailCall(false);
-   return call;
-#else
-   std::vector<llvm::Value*> elems = extractVector(in);
-   Function *func = m_mod->getFunction("cosf");
-   assert(func);
-   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
-   cos->setCallingConv(CallingConv::C);
-   cos->setTailCall(true);
-   return vectorFromVals(cos, cos, cos, cos);
-#endif
-}
-
-llvm::Value * Instructions::scs(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("scs");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::kil(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("kil");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
-   call->setTailCall(false);
-   return call;
-}
-
-llvm::Value * Instructions::sin(llvm::Value *in)
-{
-   llvm::Function *func = m_mod->getFunction("vsin");
-   assert(func);
-
-   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
-   call->setTailCall(false);
-   return call;
-}
 #endif //MESA_LLVM
 
 
diff --git a/src/gallium/auxiliary/gallivm/instructions.h b/src/gallium/auxiliary/gallivm/instructions.h
index d286ce80c78..e18571251ee 100644
--- a/src/gallium/auxiliary/gallivm/instructions.h
+++ b/src/gallium/auxiliary/gallivm/instructions.h
@@ -57,15 +57,24 @@ public:
    llvm::BasicBlock *currentBlock() const;
 
    llvm::Value *abs(llvm::Value *in1);
-   llvm::Value *arl(llvm::Value *in1);
    llvm::Value *add(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *arl(llvm::Value *in1);
    void         beginLoop();
    void         bgnSub(unsigned);
    void         brk();
    void         cal(int label, llvm::Value *input);
+   llvm::Value *ceil(llvm::Value *in);
+   llvm::Value *clamp(llvm::Value *in);
    llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cnd0(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
    llvm::Value *cos(llvm::Value *in);
    llvm::Value *cross(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *ddx(llvm::Value *in);
+   llvm::Value *ddy(llvm::Value *in);
+   llvm::Value *div(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dot2add(llvm::Value *in, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *dp2(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
@@ -75,6 +84,7 @@ public:
    void         endLoop();
    void         end();
    void         endSub();
+   llvm::Value *exp(llvm::Value *in);
    llvm::Value *ex2(llvm::Value *in);
    llvm::Value *floor(llvm::Value *in);
    llvm::Value *frc(llvm::Value *in);
@@ -82,32 +92,43 @@ public:
    llvm::Value *kil(llvm::Value *in);
    llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *lit(llvm::Value *in);
    llvm::Value *lg2(llvm::Value *in);
+   llvm::Value *lit(llvm::Value *in);
+   llvm::Value *log(llvm::Value *in);
    llvm::Value *madd(llvm::Value *in1, llvm::Value *in2,
                      llvm::Value *in3);
-   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *max(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *neg(llvm::Value *in);
+   llvm::Value *nrm(llvm::Value *in);
    llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *rcp(llvm::Value *in);
    llvm::Value *rsq(llvm::Value *in);
    llvm::Value *scs(llvm::Value *in);
+   llvm::Value *seq(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sfl(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sge(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sin(llvm::Value *in);
+   llvm::Value *sle(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *slt(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sne(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *str(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *sub(llvm::Value *in1, llvm::Value *in2);
    llvm::Value *trunc(llvm::Value *in);
+   llvm::Value *x2d(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
 
    void printVector(llvm::Value *val);
 private:
    const char *name(const char *prefix);
 
+   llvm::Value *callCeil(llvm::Value *val);
    llvm::Value *callFAbs(llvm::Value *val);
+   llvm::Value *callFExp(llvm::Value *val);
+   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callFloor(llvm::Value *val);
    llvm::Value *callFSqrt(llvm::Value *val);
-   llvm::Value *callFLog(llvm::Value *val);
    llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2);
 
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -125,16 +146,18 @@ private:
    llvm::Module             *m_mod;
    llvm::Function           *m_func;
    char                      m_name[32];
-   llvm::IRBuilder           m_builder;
+   llvm::IRBuilder<>         m_builder;
    int                       m_idx;
 
    llvm::VectorType *m_floatVecType;
 
+   llvm::Function   *m_llvmCeil;
    llvm::Function   *m_llvmFSqrt;
    llvm::Function   *m_llvmFAbs;
    llvm::Function   *m_llvmPow;
    llvm::Function   *m_llvmFloor;
    llvm::Function   *m_llvmFlog;
+   llvm::Function   *m_llvmFexp;
    llvm::Function   *m_llvmLit;
 
    llvm::Constant   *m_fmtPtr;
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.cpp b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
index efddc04e818..d5600fd22da 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.cpp
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.cpp
@@ -90,68 +90,11 @@ llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
    return res;
 }
 
-std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
-{
-   std::vector<llvm::Value*> res(4);
-
-   //Extract x's
-   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
-                                                    m_storage->constantInt(0),
-                                                    name("extractX"));
-   //cast it to an unsigned int
-   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
-
-   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
-   //only x is valid. the others shouldn't be necessary
-   /*
-   res[1] = Constant::getNullValue(m_floatVecType);
-   res[2] = Constant::getNullValue(m_floatVecType);
-   res[3] = Constant::getNullValue(m_floatVecType);
-   */
-
-   return res;
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
-   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
-   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
-   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
-   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
-   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
-   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
-
-   return res;
-}
-
 void InstructionsSoa::end()
 {
    m_builder.CreateRetVoid();
 }
 
-std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
-                                                const std::vector<llvm::Value*> in2,
-                                                const std::vector<llvm::Value*> in3)
-{
-   std::vector<llvm::Value*> res = mul(in1, in2);
-   return add(res, in3);
-}
-
 std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
 {
    std::vector<llvm::Value*> res(4);
@@ -171,6 +114,11 @@ std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
    return res;
 }
 
+llvm::IRBuilder<>* InstructionsSoa::getIRBuilder()
+{
+   return &m_builder;
+}
+
 void InstructionsSoa::createFunctionMap()
 {
    m_functionsMap[TGSI_OPCODE_ABS]   = "abs";
@@ -181,6 +129,7 @@ void InstructionsSoa::createFunctionMap()
    m_functionsMap[TGSI_OPCODE_POWER] = "pow";
    m_functionsMap[TGSI_OPCODE_LIT]   = "lit";
    m_functionsMap[TGSI_OPCODE_RSQ]   = "rsq";
+   m_functionsMap[TGSI_OPCODE_SLT]   = "slt";
 }
 
 void InstructionsSoa::createDependencies()
@@ -273,6 +222,41 @@ std::vector<llvm::Value*> InstructionsSoa::abs(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
+   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
+   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
+   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+   std::vector<llvm::Value*> res(4);
+
+   //Extract x's
+   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
+                                                    m_storage->constantInt(0),
+                                                    name("extractX"));
+   //cast it to an unsigned int
+   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
+
+   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
+   //only x is valid. the others shouldn't be necessary
+   /*
+   res[1] = Constant::getNullValue(m_floatVecType);
+   res[2] = Constant::getNullValue(m_floatVecType);
+   res[3] = Constant::getNullValue(m_floatVecType);
+   */
+
+   return res;
+}
+
 std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> in1,
                                                const std::vector<llvm::Value*> in2)
 {
@@ -280,6 +264,98 @@ std::vector<llvm::Value*> InstructionsSoa::dp3(const std::vector<llvm::Value*> i
    return callBuiltin(func, in1, in2);
 }
 
+std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_LIT);
+   return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+                                                const std::vector<llvm::Value*> in2,
+                                                const std::vector<llvm::Value*> in3)
+{
+   std::vector<llvm::Value*> res = mul(in1, in2);
+   return add(res, in3);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MAX);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_MIN);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
+   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
+   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
+   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_POWER);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
+{
+   llvm::Function *func = function(TGSI_OPCODE_RSQ);
+   return callBuiltin(func, in);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::slt(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   llvm::Function *func = function(TGSI_OPCODE_SLT);
+   return callBuiltin(func, in1, in2);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
+   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
+   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
+   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
+
+   return res;
+}
+
+void checkFunction(Function *func)
+{
+   for (Function::const_iterator BI = func->begin(), BE = func->end();
+        BI != BE; ++BI) {
+      const BasicBlock &BB = *BI;
+      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
+           II != IE; ++II) {
+         const Instruction &I = *II;
+         std::cout<< "Instr = "<<I;
+         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
+            const Value *Op = I.getOperand(op);
+            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
+            //I->setOperand(op, V);
+  }
+      }
+   }
+}
+
 llvm::Value * InstructionsSoa::allocaTemp()
 {
    VectorType *vector   = VectorType::get(Type::FloatTy, 4);
@@ -399,46 +475,6 @@ std::vector<Value*> InstructionsSoa::callBuiltin(llvm::Function *func, const std
    return allocaToResult(allocaPtr);
 }
 
-std::vector<llvm::Value*> InstructionsSoa::pow(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_POWER);
-   return callBuiltin(func, in1, in2);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::min(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MIN);
-   return callBuiltin(func, in1, in2);
-}
-
-
-std::vector<llvm::Value*> InstructionsSoa::max(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   llvm::Function *func = function(TGSI_OPCODE_MAX);
-   return callBuiltin(func, in1, in2);
-}
-
-void checkFunction(Function *func)
-{
-   for (Function::const_iterator BI = func->begin(), BE = func->end();
-        BI != BE; ++BI) {
-      const BasicBlock &BB = *BI;
-      for (BasicBlock::const_iterator II = BB.begin(), IE = BB.end();
-           II != IE; ++II) {
-         const Instruction &I = *II;
-         std::cout<< "Instr = "<<I;
-         for (unsigned op = 0, E = I.getNumOperands(); op != E; ++op) {
-            const Value *Op = I.getOperand(op);
-            std::cout<< "\top = "<<Op<<"("<<op<<")"<<std::endl;
-            //I->setOperand(op, V);
-  }
-      }
-   }
-}
-
 void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
 {
    assert(originalFunc);
@@ -458,8 +494,8 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
       func = Function::Create(originalFunc->getFunctionType(), GlobalValue::ExternalLinkage,
                               originalFunc->getName(), currentModule());
       func->setCallingConv(CallingConv::C);
-      const PAListPtr pal;
-      func->setParamAttrs(pal);
+      const AttrListPtr pal;
+      func->setAttributes(pal);
       currentModule()->dump();
    } else {
       DenseMap<const Value*, Value *> val;
@@ -483,28 +519,4 @@ void InstructionsSoa::injectFunction(llvm::Function *originalFunc, int op)
    }
 }
 
-std::vector<llvm::Value*> InstructionsSoa::sub(const std::vector<llvm::Value*> in1,
-                                               const std::vector<llvm::Value*> in2)
-{
-   std::vector<llvm::Value*> res(4);
-
-   res[0] = m_builder.CreateSub(in1[0], in2[0], name("subx"));
-   res[1] = m_builder.CreateSub(in1[1], in2[1], name("suby"));
-   res[2] = m_builder.CreateSub(in1[2], in2[2], name("subz"));
-   res[3] = m_builder.CreateSub(in1[3], in2[3], name("subw"));
-
-   return res;
-}
-
-std::vector<llvm::Value*> InstructionsSoa::lit(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_LIT);
-   return callBuiltin(func, in);
-}
-
-std::vector<llvm::Value*> InstructionsSoa::rsq(const std::vector<llvm::Value*> in)
-{
-   llvm::Function *func = function(TGSI_OPCODE_RSQ);
-   return callBuiltin(func, in);
-}
 
diff --git a/src/gallium/auxiliary/gallivm/instructionssoa.h b/src/gallium/auxiliary/gallivm/instructionssoa.h
index 3e20b652dd3..d6831e0a6b9 100644
--- a/src/gallium/auxiliary/gallivm/instructionssoa.h
+++ b/src/gallium/auxiliary/gallivm/instructionssoa.h
@@ -69,11 +69,14 @@ public:
    std::vector<llvm::Value*> pow(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> rsq(const std::vector<llvm::Value*> in1);
+   std::vector<llvm::Value*> slt(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
    std::vector<llvm::Value*> sub(const std::vector<llvm::Value*> in1,
                                  const std::vector<llvm::Value*> in2);
    void         end();
 
    std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+   llvm::IRBuilder<>*  getIRBuilder();
 private:
    const char * name(const char *prefix) const;
    llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
@@ -96,7 +99,7 @@ private:
                                          const std::vector<llvm::Value*> in3);
    void injectFunction(llvm::Function *originalFunc, int op = TGSI_OPCODE_LAST);
 private:
-   llvm::IRBuilder  m_builder;
+   llvm::IRBuilder<>  m_builder;
    StorageSoa *m_storage;
 
    std::map<int, std::string> m_functionsMap;
diff --git a/src/gallium/auxiliary/gallivm/soabuiltins.c b/src/gallium/auxiliary/gallivm/soabuiltins.c
index 78f84510e29..cb85e1734ec 100644
--- a/src/gallium/auxiliary/gallivm/soabuiltins.c
+++ b/src/gallium/auxiliary/gallivm/soabuiltins.c
@@ -36,6 +36,8 @@ typedef __attribute__(( ext_vector_type(4) )) float float4;
 
 extern float fabsf(float val);
 
+/* helpers */
+
 float4 absvec(float4 vec)
 {
    float4 res;
@@ -47,6 +49,58 @@ float4 absvec(float4 vec)
    return res;
 }
 
+float4 maxvec(float4 a, float4 b)
+{
+   return (float4){(a.x > b.x) ? a.x : b.x,
+         (a.y > b.y) ? a.y : b.y,
+         (a.z > b.z) ? a.z : b.z,
+         (a.w > b.w) ? a.w : b.w};
+}
+
+float4 minvec(float4 a, float4 b)
+{
+   return (float4){(a.x < b.x) ? a.x : b.x,
+         (a.y < b.y) ? a.y : b.y,
+         (a.z < b.z) ? a.z : b.z,
+         (a.w < b.w) ? a.w : b.w};
+}
+
+extern float powf(float num, float p);
+extern float sqrtf(float x);
+
+float4 powvec(float4 vec, float4 q)
+{
+   float4 p;
+   p.x = powf(vec.x, q.x);
+   p.y = powf(vec.y, q.y);
+   p.z = powf(vec.z, q.z);
+   p.w = powf(vec.w, q.w);
+   return p;
+}
+
+float4 sqrtvec(float4 vec)
+{
+   float4 p;
+   p.x = sqrtf(vec.x);
+   p.y = sqrtf(vec.y);
+   p.z = sqrtf(vec.z);
+   p.w = sqrtf(vec.w);
+   return p;
+}
+
+float4 sltvec(float4 v1, float4 v2)
+{
+   float4 p;
+   p.x = (v1.x < v2.x) ? 1.0 : 0.0;
+   p.y = (v1.y < v2.y) ? 1.0 : 0.0;
+   p.z = (v1.z < v2.z) ? 1.0 : 0.0;
+   p.w = (v1.w < v2.w) ? 1.0 : 0.0;
+   return p;
+}
+
+
+/* instructions */
+
 void abs(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
@@ -69,7 +123,6 @@ void dp3(float4 *res,
    res[3] = dot;
 }
 
-
 void dp4(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -83,35 +136,25 @@ void dp4(float4 *res,
    res[3] = dot;
 }
 
-extern float powf(float num, float p);
-extern float sqrtf(float x);
-
-float4 powvec(float4 vec, float4 q)
-{
-   float4 p;
-   p.x = powf(vec.x, q.x);
-   p.y = powf(vec.y, q.y);
-   p.z = powf(vec.z, q.z);
-   p.w = powf(vec.w, q.w);
-   return p;
-}
-
-void pow(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
-         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+void lit(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
 {
-   res[0] = powvec(tmp0x, tmp1x);
-   res[1] = res[0];
-   res[2] = res[0];
-   res[3] = res[0];
-}
+   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
+   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
+   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
 
-float4 minvec(float4 a, float4 b)
-{
-   return (float4){(a.x < b.x) ? a.x : b.x,
-         (a.y < b.y) ? a.y : b.y,
-         (a.z < b.z) ? a.z : b.z,
-         (a.w < b.w) ? a.w : b.w};
+   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
+   if (tmp0x.x > 0) {
+      float4 tmpy = maxvec(tmp0y, zerovec);
+      float4 tmpw = minvec(tmp0w, plus128);
+      tmpw = maxvec(tmpw, min128);
+      res[1] = tmp0x;
+      res[2] = powvec(tmpy, tmpw);
+   } else {
+      res[1] = zerovec;
+      res[2] = zerovec;
+   }
+   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
 }
 
 void min(float4 *res,
@@ -125,14 +168,6 @@ void min(float4 *res,
 }
 
 
-float4 maxvec(float4 a, float4 b)
-{
-   return (float4){(a.x > b.x) ? a.x : b.x,
-         (a.y > b.y) ? a.y : b.y,
-         (a.z > b.z) ? a.z : b.z,
-         (a.w > b.w) ? a.w : b.w};
-}
-
 void max(float4 *res,
          float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
          float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
@@ -143,37 +178,14 @@ void max(float4 *res,
    res[3] = maxvec(tmp0w, tmp1w);
 }
 
-
-void lit(float4 *res,
-         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w)
-{
-   const float4 zerovec = (float4) {0.0, 0.0, 0.0, 0.0};
-   const float4 min128 = (float4) {-128.f, -128.f, -128.f, -128.f};
-   const float4 plus128 = (float4) {128.f,  128.f,  128.f,  128.f};
-
-   res[0] = (float4){1.0, 1.0, 1.0, 1.0};
-   if (tmp0x.x > 0) {
-      float4 tmpy = maxvec(tmp0y, zerovec);
-      float4 tmpw = minvec(tmp0w, plus128);
-      tmpw = maxvec(tmpw, min128);
-      res[1] = tmp0x;
-      res[2] = powvec(tmpy, tmpw);
-   } else {
-      res[1] = zerovec;
-      res[2] = zerovec;
-   }
-   res[3] = (float4){1.0, 1.0, 1.0, 1.0};
-}
-
-
-float4 sqrtvec(float4 vec)
+void pow(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
 {
-   float4 p;
-   p.x = sqrtf(vec.x);
-   p.y = sqrtf(vec.y);
-   p.z = sqrtf(vec.z);
-   p.w = sqrtf(vec.w);
-   return p;
+   res[0] = powvec(tmp0x, tmp1x);
+   res[1] = res[0];
+   res[2] = res[0];
+   res[3] = res[0];
 }
 
 void rsq(float4 *res,
@@ -185,3 +197,14 @@ void rsq(float4 *res,
    res[2] = onevec/sqrtvec(absvec(tmp0z));
    res[3] = onevec/sqrtvec(absvec(tmp0w));
 }
+
+void slt(float4 *res,
+         float4 tmp0x, float4 tmp0y, float4 tmp0z, float4 tmp0w,
+         float4 tmp1x, float4 tmp1y, float4 tmp1z, float4 tmp1w)
+{
+   res[0] = sltvec(tmp0x, tmp1x);
+   res[1] = sltvec(tmp0y, tmp1y);
+   res[2] = sltvec(tmp0z, tmp1z);
+   res[3] = sltvec(tmp0w, tmp1w);
+}
+
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.cpp b/src/gallium/auxiliary/gallivm/storagesoa.cpp
index 78d754371f0..4fc075cf6d4 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.cpp
+++ b/src/gallium/auxiliary/gallivm/storagesoa.cpp
@@ -93,7 +93,7 @@ void StorageSoa::declareImmediates()
       std::vector<float> vals(4);
       std::vector<Constant*> channelArray;
 
-      vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
+      vals[0] = vec[0]; vals[1] = vec[1]; vals[2] = vec[2]; vals[3] = vec[3];
       llvm::Constant *xChannel = createConstGlobalVector(vals);
 
       vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
@@ -144,22 +144,43 @@ std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
    return res;
 }
 
-std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+llvm::Value* StorageSoa::unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value* vector, int cc)
 {
-   std::vector<llvm::Value*> res(4);
+   std::vector<llvm::Value*> x(4);
+   x[0] = m_builder->CreateExtractElement(vector,
+                                           constantInt(cc),
+                                           name("x"));
+
+   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
+   Constant *constVector = Constant::getNullValue(vectorType);
+   Value *res = m_builder->CreateInsertElement(constVector, x[0],
+                                              constantInt(0),
+                                              name("vecx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(1),
+                               name("vecxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(2),
+                               name("vecxxx"));
+   res = m_builder->CreateInsertElement(res, x[0], constantInt(3),
+                               name("vecxxxx"));
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::IRBuilder<>* m_builder, llvm::Value *idx)
+{
+   llvm::Value* res;
+   std::vector<llvm::Value*> res2(4);
    llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
 
    xChannel = elementPointer(m_consts, idx, 0);
-   yChannel = elementPointer(m_consts, idx, 1);
-   zChannel = elementPointer(m_consts, idx, 2);
-   wChannel = elementPointer(m_consts, idx, 3);
 
-   res[0] = alignedArrayLoad(xChannel);
-   res[1] = alignedArrayLoad(yChannel);
-   res[2] = alignedArrayLoad(zChannel);
-   res[3] = alignedArrayLoad(wChannel);
+   res = alignedArrayLoad(xChannel);
 
-   return res;
+   res2[0]=unpackConstElement(m_builder, res,0);
+   res2[1]=unpackConstElement(m_builder, res,1);
+   res2[2]=unpackConstElement(m_builder, res,2);
+   res2[3]=unpackConstElement(m_builder, res,3);
+
+   return res2;
 }
 
 std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
@@ -260,6 +281,12 @@ llvm::Module * StorageSoa::currentModule() const
     return m_block->getParent()->getParent();
 }
 
+llvm::Constant * StorageSoa::createConstGlobalFloat(const float val)
+{
+   Constant*c = ConstantFP::get(APFloat(val));
+   return c;
+}
+
 llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
 {
    VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
@@ -278,7 +305,7 @@ llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &v
 }
 
 std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, int swizzle,
-                                           llvm::Value *indIdx)
+                                           llvm::IRBuilder<>* m_builder,llvm::Value *indIdx)
 {
    std::vector<llvm::Value*> val(4);
 
@@ -302,7 +329,7 @@ std::vector<llvm::Value*> StorageSoa::load(enum tgsi_file_type type, int idx, in
       val = tempElement(realIndex);
       break;
    case TGSI_FILE_CONSTANT:
-      val = constElement(realIndex);
+      val = constElement(m_builder, realIndex);
       break;
    case TGSI_FILE_IMMEDIATE:
       val = immediateElement(realIndex);
diff --git a/src/gallium/auxiliary/gallivm/storagesoa.h b/src/gallium/auxiliary/gallivm/storagesoa.h
index ae2fc7c6aee..f21ca6ec433 100644
--- a/src/gallium/auxiliary/gallivm/storagesoa.h
+++ b/src/gallium/auxiliary/gallivm/storagesoa.h
@@ -29,6 +29,7 @@
 #define STORAGESOA_H
 
 #include <pipe/p_shader_tokens.h>
+#include <llvm/Support/IRBuilder.h>
 
 #include <vector>
 #include <list>
@@ -56,7 +57,7 @@ public:
 
 
    std::vector<llvm::Value*> load(enum tgsi_file_type type, int idx, int swizzle, 
-                                  llvm::Value *indIdx =0);
+                                  llvm::IRBuilder<>* m_builder, llvm::Value *indIdx =0);
    void store(enum tgsi_file_type type, int idx, const std::vector<llvm::Value*> &val,
               int mask);
 
@@ -76,10 +77,12 @@ private:
    const char *name(const char *prefix) const;
    llvm::Value  *alignedArrayLoad(llvm::Value *val);
    llvm::Module *currentModule() const;
+   llvm::Constant  *createConstGlobalFloat(const float val);
    llvm::Constant  *createConstGlobalVector(const std::vector<float> &vec);
 
    std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
-   std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+   llvm::Value* unpackConstElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx, int cc);
+   std::vector<llvm::Value*> constElement(llvm::IRBuilder<>* m_builder, llvm::Value *indIdx);
    std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
    std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
    std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
diff --git a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
index cc1516a45e0..1191a6cae97 100644
--- a/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
+++ b/src/gallium/auxiliary/gallivm/tgsitollvm.cpp
@@ -52,7 +52,7 @@ static inline FunctionType *vertexShaderFunctionType()
    // pass are castable to the following:
    // [4 x <4 x float>] inputs,
    // [4 x <4 x float>] output,
-   // [4 x [4 x float]] consts,
+   // [1 x [4 x float]] consts,
    // [4 x <4 x float>] temps
 
    std::vector<const Type*> funcArgs;
@@ -61,7 +61,7 @@ static inline FunctionType *vertexShaderFunctionType()
    PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
 
    ArrayType   *floatArray     = ArrayType::get(Type::FloatTy, 4);
-   ArrayType   *constsArray    = ArrayType::get(floatArray, 4);
+   ArrayType   *constsArray    = ArrayType::get(floatArray, 1);
    PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
 
    funcArgs.push_back(vectorArrayPtr);//inputs
@@ -246,6 +246,7 @@ translate_instruction(llvm::Module *module,
          val = storage->constElement(src->SrcRegister.Index, indIdx);
       } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
          val = storage->inputElement(src->SrcRegister.Index, indIdx);
+      // FIXME: we should not generate elements for temporaries; this creates useless memory writes
       } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
          val = storage->tempElement(src->SrcRegister.Index);
       } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
@@ -286,9 +287,13 @@ translate_instruction(llvm::Module *module,
       out = instr->rsq(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_EXP:
+   case TGSI_OPCODE_EXP: {
+      out = instr->exp(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_LOG:
+   case TGSI_OPCODE_LOG: {
+      out = instr->log(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_MUL: {
       out = instr->mul(inputs[0], inputs[1]);
@@ -338,21 +343,31 @@ translate_instruction(llvm::Module *module,
       out = instr->lerp(inputs[0], inputs[1], inputs[2]);
    }
       break;
-   case TGSI_OPCODE_CND:
+   case TGSI_OPCODE_CND: {
+      out = instr->cnd(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_CND0:
+   case TGSI_OPCODE_CND0: {
+      out = instr->cnd0(inputs[0], inputs[1], inputs[2]);
+   }
       break;
-   case TGSI_OPCODE_DOT2ADD:
+   case TGSI_OPCODE_DOT2ADD: {
+      out = instr->dot2add(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_INDEX:
       break;
-   case TGSI_OPCODE_NEGATE:
+   case TGSI_OPCODE_NEGATE: {
+      out = instr->neg(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FRAC: {
       out = instr->frc(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_CLAMP:
+   case TGSI_OPCODE_CLAMP: {
+      out = instr->clamp(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_FLOOR: {
       out = instr->floor(inputs[0]);
@@ -392,9 +407,13 @@ translate_instruction(llvm::Module *module,
       out = instr->cos(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_DDX:
+   case TGSI_OPCODE_DDX: {
+      out = instr->ddx(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DDY:
+   case TGSI_OPCODE_DDY: {
+      out = instr->ddy(inputs[0]);
+   }
       break;
    case TGSI_OPCODE_KILP:
       break;
@@ -408,9 +427,13 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_RFL:
       break;
-   case TGSI_OPCODE_SEQ:
+   case TGSI_OPCODE_SEQ: {
+      out = instr->seq(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SFL:
+   case TGSI_OPCODE_SFL: {
+      out = instr->sfl(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_SGT: {
       out = instr->sgt(inputs[0], inputs[1]);
@@ -420,11 +443,17 @@ translate_instruction(llvm::Module *module,
       out = instr->sin(inputs[0]);
    }
       break;
-   case TGSI_OPCODE_SLE:
+   case TGSI_OPCODE_SLE: {
+      out = instr->sle(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_SNE:
+   case TGSI_OPCODE_SNE: {
+      out = instr->sne(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_STR:
+   case TGSI_OPCODE_STR: {
+      out = instr->str(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TEX:
       break;
@@ -438,7 +467,9 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_UP4UB:
       break;
-   case TGSI_OPCODE_X2D:
+   case TGSI_OPCODE_X2D: {
+      out = instr->x2d(inputs[0], inputs[1], inputs[2]);
+   }
       break;
    case TGSI_OPCODE_ARA:
       break;
@@ -468,11 +499,18 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_TXB:
       break;
-   case TGSI_OPCODE_NRM:
+   case TGSI_OPCODE_NRM4:
+   case TGSI_OPCODE_NRM: {
+      out = instr->nrm(inputs[0]);
+   }
       break;
-   case TGSI_OPCODE_DIV:
+   case TGSI_OPCODE_DIV: {
+      out = instr->div(inputs[0], inputs[1]);
+   }
       break;
-   case TGSI_OPCODE_DP2:
+   case TGSI_OPCODE_DP2: {
+      out = instr->dp2(inputs[0], inputs[1]);
+   }
       break;
    case TGSI_OPCODE_TXL:
       break;
@@ -590,8 +628,6 @@ translate_instruction(llvm::Module *module,
       break;
    case TGSI_OPCODE_M3X2:
       break;
-   case TGSI_OPCODE_NRM4:
-      break;
    case TGSI_OPCODE_CALLNZ:
       break;
    case TGSI_OPCODE_IFC:
@@ -641,6 +677,7 @@ translate_instruction(llvm::Module *module,
 
       if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
          storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      // FIXME: we should not generate elements for temporaries; this creates useless memory writes
       } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
          storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
       } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
@@ -672,9 +709,8 @@ translate_instructionir(llvm::Module *module,
       if (src->SrcRegister.Indirect) {
          indIdx = storage->addrElement(src->SrcRegisterInd.Index);
       }
-
       val = storage->load((enum tgsi_file_type)src->SrcRegister.File,
-                          src->SrcRegister.Index, swizzle, indIdx);
+                          src->SrcRegister.Index, swizzle, instr->getIRBuilder(), indIdx);
 
       inputs[i] = val;
    }
@@ -732,6 +768,7 @@ translate_instructionir(llvm::Module *module,
    }
       break;
    case TGSI_OPCODE_SLT: {
+      out = instr->slt(inputs[0], inputs[1]);
    }
       break;
    case TGSI_OPCODE_SGE: {
@@ -989,7 +1026,6 @@ translate_instructionir(llvm::Module *module,
    /* store results  */
    for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
       struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
-
       storage->store((enum tgsi_file_type)dst->DstRegister.File,
                      dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
    }
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index 491141f1908..dea1aed0320 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -164,6 +164,27 @@ rem_prefix(const char *longname)
 }
 
 
+/** Return the printable name of SPE register \p reg: "$sp", "$lr" or "$<n>".
+ *  Numeric names go into static buffers (not reentrant) and are truncated to fit. */
+static const char *
+reg_name(int reg)
+{
+   /* cycle through four buffers to handle multiple calls per printf */
+   static char buf[4][10];
+   static int b = 0;
+   switch (reg) {
+   case SPE_REG_SP:
+      return "$sp";
+   case SPE_REG_RA:
+      return "$lr";
+   default:
+      b = (b + 1) % 4;
+      snprintf(buf[b], sizeof(buf[b]), "$%d", reg);
+      return buf[b];
+   }
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
 		    unsigned rA, unsigned rB, const char *name)
 {
@@ -176,7 +197,8 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB);
+       printf("%s\t%s, %s, %s\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), reg_name(rB));
     }
 }
 
@@ -194,7 +216,8 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC);
+       printf("%s\t%s, %s, %s, %s\n", rem_prefix(name), reg_name(rT),
+              reg_name(rA), reg_name(rB), reg_name(rC));
     }
 }
 
@@ -211,7 +234,8 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -229,7 +253,8 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
@@ -247,15 +272,22 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       if (strcmp(name, "spe_lqd") == 0 ||
-           strcmp(name, "spe_stqd") == 0)
-          printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA);
-       else
-          printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+       printf("%s\t%s, %s, 0x%x\n",
+              rem_prefix(name), reg_name(rT), reg_name(rA), imm);
     }
 }
 
 
+/** As above, but do range checking on signed immediate value */
+static void emit_RI10s(struct spe_function *p, unsigned op, unsigned rT,
+                       unsigned rA, int imm, const char *name)
+{
+    assert(imm <= 511);
+    assert(imm >= -512);
+    emit_RI10(p, op, rT, rA, imm, name);
+}
+
+
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
 		      int imm, const char *name)
 {
@@ -267,7 +299,7 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
     }
 }
 
@@ -283,7 +315,7 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     assert(p->num_inst <= p->max_inst);
     if (p->print) {
        indent(p);
-       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+       printf("%s\t%s, 0x%x\n", rem_prefix(name), reg_name(rT), imm);
     }
 }
 
@@ -332,6 +364,12 @@ void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
    emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
+#define EMIT_RI10s(_name, _op) \
+void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
+{ \
+   emit_RI10s(p, _op, rT, rA, imm, __FUNCTION__);             \
+}
+
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
@@ -353,20 +391,28 @@ void _name (struct spe_function *p, int imm) \
 #include "rtasm_ppc_spe.h"
 
 
+
 /**
  * Initialize an spe_function.
  * \param code_size  size of instruction buffer to allocate, in bytes.
  */
 void spe_init_func(struct spe_function *p, unsigned code_size)
 {
+    unsigned int i;
+
     p->store = align_malloc(code_size, 16);
     p->num_inst = 0;
     p->max_inst = code_size / SPE_INST_SIZE;
 
+    p->set_count = 0;
+    memset(p->regs, 0, SPE_NUM_REGS * sizeof(p->regs[0]));
+
     /* Conservatively treat R0 - R2 and R80 - R127 as non-volatile.
      */
-    p->regs[0] = ~7;
-    p->regs[1] = (1U << (80 - 64)) - 1;
+    p->regs[0] = p->regs[1] = p->regs[2] = 1;
+    for (i = 80; i <= 127; i++) {
+      p->regs[i] = 1;
+    }
 
     p->print = false;
     p->indent = 0;
@@ -398,12 +444,8 @@ int spe_allocate_available_register(struct spe_function *p)
 {
    unsigned i;
    for (i = 0; i < SPE_NUM_REGS; i++) {
-      const uint64_t mask = (1ULL << (i % 64));
-      const unsigned idx = i / 64;
-
-      assert(idx < 2);
-      if ((p->regs[idx] & mask) != 0) {
-         p->regs[idx] &= ~mask;
+      if (p->regs[i] == 0) {
+         p->regs[i] = 1;
          return i;
       }
    }
@@ -417,31 +459,84 @@ int spe_allocate_available_register(struct spe_function *p)
  */
 int spe_allocate_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
-
    assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) != 0);
-
-   p->regs[idx] &= ~(1ULL << bit);
+   assert(p->regs[reg] == 0);
+   p->regs[reg] = 1;
    return reg;
 }
 
 
 /**
- * Mark the given SPE register as "unallocated".
+ * Mark the given SPE register as "unallocated".  Note that this should
+ * only be used on registers allocated in the current register set; an
+ * assertion will fail if an attempt is made to deallocate a register
+ * allocated in an earlier register set.
  */
 void spe_release_register(struct spe_function *p, int reg)
 {
-   const unsigned idx = reg / 64;
-   const unsigned bit = reg % 64;
+   assert(reg < SPE_NUM_REGS);
+   assert(p->regs[reg] == 1);
 
-   assert(idx < 2);
+   p->regs[reg] = 0;
+}
 
-   assert(reg < SPE_NUM_REGS);
-   assert((p->regs[idx] & (1ULL << bit)) == 0);
+/**
+ * Start a new set of registers.  This can be called if
+ * it will be difficult later to determine exactly what
+ * registers were actually allocated during a code generation
+ * sequence, and you really just want to deallocate all of them.
+ */
+void spe_allocate_register_set(struct spe_function *p)
+{
+   unsigned int i;
 
-   p->regs[idx] |= (1ULL << bit);
+   /* Keep track of the set count.  If it ever wraps around to 0, 
+    * we're in trouble.
+    */
+   p->set_count++;
+   assert(p->set_count > 0);
+
+   /* Increment the allocation count of all registers currently
+    * allocated.  Then any registers that are allocated in this set
+    * will be the only ones with a count of 1; they'll all be released
+    * when the register set is released.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0)
+         p->regs[i]++;
+   }
+}
+
+void spe_release_register_set(struct spe_function *p)
+{
+   unsigned int i;
+
+   /* If the set count drops below zero, we're in trouble. */
+   assert(p->set_count > 0);
+   p->set_count--;
+
+   /* Drop the allocation level of all registers.  Any allocated
+    * during this register set will drop to 0 and then become
+    * available.
+    */
+   for (i = 0; i < SPE_NUM_REGS; i++) {
+      if (p->regs[i] > 0)
+         p->regs[i]--;
+   }
+}
+
+
+unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[])
+{
+   unsigned i, num = 0;
+   /* only count registers in the range available to callers */
+   for (i = 2; i < 80; i++) {
+      if (p->regs[i]) {
+         used[num++] = i;
+      }
+   }
+   return num;
 }
 
 
@@ -459,7 +554,7 @@ spe_indent(struct spe_function *p, int spaces)
 }
 
 
-extern void
+void
 spe_comment(struct spe_function *p, int rel_indent, const char *s)
 {
    if (p->print) {
@@ -472,6 +567,56 @@ spe_comment(struct spe_function *p, int rel_indent, const char *s)
 
 
 /**
+ * Load quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   /* offset must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
+   emit_RI10(p, 0x034, rT, rA, offset >> 4, "spe_lqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("lqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
+/**
+ * Store quad word.
+ * NOTE: offset is in bytes and the least significant 4 bits must be zero!
+ */
+void spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset)
+{
+   const boolean pSave = p->print;
+
+   /* offset must be a multiple of 16 */
+   assert(offset % 16 == 0);
+   /* offset must fit in 10-bit signed int field, after shifting */
+   assert((offset >> 4) <= 511);
+   assert((offset >> 4) >= -512);
+
+   p->print = FALSE;
+   emit_RI10(p, 0x024, rT, rA, offset >> 4, "spe_stqd");
+   p->print = pSave;
+
+   if (p->print) {
+      indent(p);
+      printf("stqd\t%s, %d(%s)\n", reg_name(rT), offset, reg_name(rA));
+   }
+}
+
+
+/**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
  * \param e  if 1, enable interupts if branch is taken
@@ -603,22 +748,187 @@ void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
 {
    /* If the whole value is in the lower 18 bits, use ila, which
     * doesn't sign-extend.  Otherwise, if the two halfwords of
-    * the constant are identical, use ilh.  Otherwise, we have
-    * to use ilhu followed by iohl.
+    * the constant are identical, use ilh.  Otherwise, if every byte of
+    * the desired value is 0x00 or 0xff, we can use Form Select Mask for
+    * Bytes Immediate (fsmbi) to load the value in a single instruction.
+    * Otherwise, in the general case, we have to use ilhu followed by iohl.
     */
-   if ((ui & 0xfffc0000) == ui) {
+   if ((ui & 0x0003ffff) == ui) {
       spe_ila(p, rT, ui);
    }
    else if ((ui >> 16) == (ui & 0xffff)) {
       spe_ilh(p, rT, ui & 0xffff);
    }
+   else if (
+      ((ui & 0x000000ff) == 0 || (ui & 0x000000ff) == 0x000000ff) &&
+      ((ui & 0x0000ff00) == 0 || (ui & 0x0000ff00) == 0x0000ff00) &&
+      ((ui & 0x00ff0000) == 0 || (ui & 0x00ff0000) == 0x00ff0000) &&
+      ((ui & 0xff000000) == 0 || (ui & 0xff000000) == 0xff000000)
+   ) {
+      unsigned int mask = 0;
+      /* fsmbi duplicates each bit in the given mask eight times,
+       * using a 16-bit value to initialize a 16-byte quadword.
+       * Each 4-bit nybble of the mask corresponds to a full word
+       * of the result; look at the value and figure out the mask
+       * (replicated for each word in the quadword), and then
+       * form the "select mask" to get the value.
+       */
+      if ((ui & 0x000000ff) == 0x000000ff) mask |= 0x1111;
+      if ((ui & 0x0000ff00) == 0x0000ff00) mask |= 0x2222;
+      if ((ui & 0x00ff0000) == 0x00ff0000) mask |= 0x4444;
+      if ((ui & 0xff000000) == 0xff000000) mask |= 0x8888;
+      spe_fsmbi(p, rT, mask);
+   }
    else {
+      /* The general case: this usually uses two instructions, but
+       * may use only one if the low-order 16 bits of each word are 0.
+       */
       spe_ilhu(p, rT, ui >> 16);
       if (ui & 0xffff)
          spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
+/**
+ * Emit rT = rA & ui, using one immediate-form instruction when ui allows
+ * it and a temporary register otherwise.  Keep in step with spe_xor_uint().
+ */
+void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either And Byte Immediate
+    * (which uses the same constant across each byte), And Halfword Immediate
+    * (which sign-extends a 10-bit immediate to 16 bits and uses that
+    * across each halfword), or And Word Immediate (which sign-extends
+    * a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+
+   /* Upper 23 bits all 0s or all 1s: use And Word Immediate, passing the
+    * low 10 bits as a signed value in the asserted range [-512, 511].
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_andi(p, rT, rA, (int) ((ui & 0x3ff) ^ 0x200) - 0x200);
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we
+    * can use And Halfword Immediate (same signed 10-bit immediate).
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_andhi(p, rT, rA, (int) ((ui & 0x3ff) ^ 0x200) - 0x200);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use
+    * the And Byte Immediate instruction (tmp is at most 0xff here).
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_andbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_and(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+
+/**
+ * Emit rT = rA ^ ui, using one immediate-form instruction when ui allows
+ * it and a temporary register otherwise.  Keep in step with spe_and_uint().
+ */
+void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If we can, emit a single instruction, either Exclusive Or Byte
+    * Immediate (which uses the same constant across each byte), Exclusive
+    * Or Halfword Immediate (which sign-extends a 10-bit immediate to
+    * 16 bits and uses that across each halfword), or Exclusive Or Word
+    * Immediate (which sign-extends a 10-bit immediate to 32 bits).
+    *
+    * Otherwise, we'll need to use a temporary register.
+    */
+   unsigned int tmp;
+
+   /* Upper 23 bits all 0s or all 1s: use Exclusive Or Word Immediate, with
+    * the low 10 bits passed as a signed value in the range [-512, 511].
+    */
+   tmp = ui & 0xfffffe00;
+   if (tmp == 0xfffffe00 || tmp  == 0) {
+      spe_xori(p, rT, rA, (int) ((ui & 0x3ff) ^ 0x200) - 0x200);
+      return;
+   }
+
+   /* If the ui field is symmetric along halfword boundaries and
+    * the upper 7 bits of each halfword are all 0s or 1s, we can use
+    * Exclusive Or Halfword Immediate (same signed 10-bit immediate).
+    */
+   tmp = ui & 0xfe00fe00;
+   if ((tmp == 0xfe00fe00 || tmp == 0) && ((ui >> 16) == (ui & 0x0000ffff))) {
+      spe_xorhi(p, rT, rA, (int) ((ui & 0x3ff) ^ 0x200) - 0x200);
+      return;
+   }
+
+   /* If the ui field is symmetric in each byte, then we can use the
+    * Exclusive Or Byte Immediate instruction (tmp is at most 0xff here).
+    */
+   tmp = ui & 0x000000ff;
+   if ((ui >> 24) == tmp && ((ui >> 16) & 0xff) == tmp && ((ui >> 8) & 0xff) == tmp) {
+      spe_xorbi(p, rT, rA, tmp);
+      return;
+   }
+
+   /* Otherwise, we'll have to use a temporary register. */
+   unsigned int tmp_reg = spe_allocate_available_register(p);
+   spe_load_uint(p, tmp_reg, ui);
+   spe_xor(p, rT, rA, tmp_reg);
+   spe_release_register(p, tmp_reg);
+}
+
+void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* If the comparison value is 9 bits or less, it fits inside a
+    * Compare Equal Word Immediate instruction.
+    */
+   if ((ui & 0x000001ff) == ui) {
+      spe_ceqi(p, rT, rA, ui);
+   }
+   /* Otherwise, we're going to have to load a word first. */
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_ceq(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
+
+/** Emit a logical (unsigned) greater-than compare of rA against ui into rT. */
+void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui)
+{
+   /* clgti sign-extends its 10-bit immediate before the logical compare,
+    * so only values of 9 bits or less are encoded directly; anything
+    * larger goes through a temporary register. */
+   if ((ui & 0x000001ff) == ui) {
+      spe_clgti(p, rT, rA, ui);
+   }
+   else {
+      unsigned int tmp_reg = spe_allocate_available_register(p);
+      spe_load_uint(p, tmp_reg, ui);
+      spe_clgt(p, rT, rA, tmp_reg);
+      spe_release_register(p, tmp_reg);
+   }
+}
 
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index 61c7edeb604..d6a3c02f20c 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -53,17 +53,26 @@ struct spe_function
    uint num_inst;
    uint max_inst;
 
-    /**
-     * Mask of used / unused registers
-     *
-     * Each set bit corresponds to an available register.  Each cleared bit
-     * corresponds to an allocated register.
+   /**
+    * The "set count" is the number of nested register sets currently
+    * open.  In the unlikely case that it overflows its storage type,
+    * register allocation will start to be confused, which is critical
+    * enough that we check for it.
+    */
+   unsigned char set_count;
+
+   /** 
+    * Flags for used and unused registers.  Each byte corresponds to a
+    * register; a 0 in that byte means that the register is available.
+    * A value of 1 means that the register was allocated in the current
+    * register set.  Any other value N means that the register was allocated
+    * N register sets ago.
      *
      * \sa
      * spe_allocate_register, spe_allocate_available_register,
-     * spe_release_register
+     * spe_allocate_register_set, spe_release_register_set, spe_release_register, 
      */
-    uint64_t regs[SPE_NUM_REGS / 64];
+    unsigned char regs[SPE_NUM_REGS];
 
     boolean print; /**< print/dump instructions as they're emitted? */
     int indent;    /**< number of spaces to indent */
@@ -77,6 +86,11 @@ extern unsigned spe_code_size(const struct spe_function *p);
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
+extern void spe_allocate_register_set(struct spe_function *p);
+extern void spe_release_register_set(struct spe_function *p);
+
+extern unsigned
+spe_get_registers_used(const struct spe_function *p, ubyte used[]);
 
 extern void spe_print_code(struct spe_function *p, boolean enable);
 extern void spe_indent(struct spe_function *p, int spaces);
@@ -105,6 +119,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 #define EMIT_RI10(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
 			   int imm)
+#define EMIT_RI10s(_name, _op) \
+    extern void _name (struct spe_function *p, unsigned rT, unsigned rA, \
+			   int imm)
 #define EMIT_RI16(_name, _op) \
     extern void _name (struct spe_function *p, unsigned rT, int imm)
 #define EMIT_RI18(_name, _op) \
@@ -117,11 +134,9 @@ extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
 
 /* Memory load / store instructions
  */
-EMIT_RI10(spe_lqd,  0x034);
 EMIT_RR  (spe_lqx,  0x1c4);
 EMIT_RI16(spe_lqa,  0x061);
 EMIT_RI16(spe_lqr,  0x067);
-EMIT_RI10(spe_stqd, 0x024);
 EMIT_RR  (spe_stqx, 0x144);
 EMIT_RI16(spe_stqa, 0x041);
 EMIT_RI16(spe_stqr, 0x047);
@@ -151,7 +166,7 @@ EMIT_RI16(spe_fsmbi, 0x065);
 EMIT_RR  (spe_ah,      0x0c8);
 EMIT_RI10(spe_ahi,     0x01d);
 EMIT_RR  (spe_a,       0x0c0);
-EMIT_RI10(spe_ai,      0x01c);
+EMIT_RI10s(spe_ai,      0x01c);
 EMIT_RR  (spe_sfh,     0x048);
 EMIT_RI10(spe_sfhi,    0x00d);
 EMIT_RR  (spe_sf,      0x040);
@@ -189,19 +204,19 @@ EMIT_R   (spe_xshw,    0x2ae);
 EMIT_R   (spe_xswd,    0x2a6);
 EMIT_RR  (spe_and,     0x0c1);
 EMIT_RR  (spe_andc,    0x2c1);
-EMIT_RI10(spe_andbi,   0x016);
-EMIT_RI10(spe_andhi,   0x015);
-EMIT_RI10(spe_andi,    0x014);
+EMIT_RI10s(spe_andbi,   0x016);
+EMIT_RI10s(spe_andhi,   0x015);
+EMIT_RI10s(spe_andi,    0x014);
 EMIT_RR  (spe_or,      0x041);
 EMIT_RR  (spe_orc,     0x2c9);
-EMIT_RI10(spe_orbi,    0x006);
-EMIT_RI10(spe_orhi,    0x005);
-EMIT_RI10(spe_ori,     0x004);
+EMIT_RI10s(spe_orbi,    0x006);
+EMIT_RI10s(spe_orhi,    0x005);
+EMIT_RI10s(spe_ori,     0x004);
 EMIT_R   (spe_orx,     0x1f0);
 EMIT_RR  (spe_xor,     0x241);
-EMIT_RI10(spe_xorbi,   0x026);
-EMIT_RI10(spe_xorhi,   0x025);
-EMIT_RI10(spe_xori,    0x024);
+EMIT_RI10s(spe_xorbi,   0x026);
+EMIT_RI10s(spe_xorhi,   0x025);
+EMIT_RI10s(spe_xori,    0x024);
 EMIT_RR  (spe_nand,    0x0c9);
 EMIT_RR  (spe_nor,     0x049);
 EMIT_RR  (spe_eqv,     0x249);
@@ -279,6 +294,12 @@ EMIT_RI16(spe_brz,       0x040);
 EMIT_RI16(spe_brhnz,     0x046);
 EMIT_RI16(spe_brhz,      0x044);
 
+extern void
+spe_lqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
+extern void
+spe_stqd(struct spe_function *p, unsigned rT, unsigned rA, int offset);
+
 extern void spe_bi(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_iret(struct spe_function *p, unsigned rA, int d, int e);
 extern void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA,
@@ -307,6 +328,22 @@ spe_load_int(struct spe_function *p, unsigned rT, int i);
 extern void
 spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
 
+/** And immediate value into rT. */
+extern void
+spe_and_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Xor immediate value into rT. */
+extern void
+spe_xor_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare equal with immediate value. */
+extern void
+spe_compare_equal_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
+/** Compare greater with immediate value. */
+extern void
+spe_compare_greater_uint(struct spe_function *p, unsigned rT, unsigned rA, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
@@ -388,6 +425,7 @@ EMIT_R   (spe_wrch,       0x10d);
 #undef EMIT_RI7
 #undef EMIT_RI8
 #undef EMIT_RI10
+#undef EMIT_RI10s
 #undef EMIT_RI16
 #undef EMIT_RI18
 #undef EMIT_I16
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 3bba9dcc076..99ee74cf14b 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -371,7 +371,11 @@ void x86_jcc( struct x86_function *p,
    DUMP_I(cc);
    
    if (offset < 0) {
-      assert(p->csr - p->store > -offset);
+      /*assert(p->csr - p->store > -offset);*/
+      if (p->csr - p->store <= -offset) {
+         /* probably out of memory (using the error_overflow buffer) */
+         return;
+      }
    }
 
    if (offset <= 127 && offset >= -128) {
@@ -675,6 +679,44 @@ void x86_and( struct x86_function *p,
  * SSE instructions
  */
 
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 0, ptr);
+}
+
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 1, ptr);
+}
+
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr)
+{
+   DUMP_R( ptr );
+   assert(ptr.mod != mod_REG);
+   emit_2ub(p, 0x0f, 0x18);
+   emit_modrm_noreg(p, 2, ptr);
+}
+
+void sse_movntps( struct x86_function *p, 
+                  struct x86_reg dst,
+                  struct x86_reg src)
+{
+   DUMP_RR( dst, src );
+
+   assert(dst.mod != mod_REG);
+   assert(src.mod == mod_REG);
+   emit_2ub(p, 0x0f, 0x2b);
+   emit_modrm(p, src, dst);
+}
+
+
+
 
 void sse_movss( struct x86_function *p,
 		struct x86_reg dst,
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index 510aa1b0dec..1b5eaaca850 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -185,6 +185,13 @@ void sse2_pshufd( struct x86_function *p, struct x86_reg dest, struct x86_reg ar
 void sse2_rcpps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse2_rcpss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 
+
+void sse_prefetchnta( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch0( struct x86_function *p, struct x86_reg ptr);
+void sse_prefetch1( struct x86_function *p, struct x86_reg ptr);
+
+void sse_movntps( struct x86_function *p, struct x86_reg dst, struct x86_reg src);
+
 void sse_addps( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_addss( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void sse_cvtps2pi( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4fdad3a5c78..f79170b9d65 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -25,6 +25,10 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
+
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
@@ -36,8 +40,6 @@
 
 #include "rtasm/rtasm_x86sse.h"
 
-#ifdef PIPE_ARCH_X86
-
 /* for 1/sqrt()
  *
  * This costs about 100fps (close to 10%) in gears:
diff --git a/src/gallium/auxiliary/util/Makefile b/src/gallium/auxiliary/util/Makefile
index d3951e4e7d7..b3d1045a8f4 100644
--- a/src/gallium/auxiliary/util/Makefile
+++ b/src/gallium/auxiliary/util/Makefile
@@ -10,6 +10,7 @@ C_SOURCES = \
 	u_gen_mipmap.c \
 	u_handle_table.c \
 	u_hash_table.c \
+	u_keymap.c \
 	u_math.c \
 	u_mm.c \
 	u_rect.c \
diff --git a/src/gallium/auxiliary/util/SConscript b/src/gallium/auxiliary/util/SConscript
index e65c17b1cc8..8a04955a16e 100644
--- a/src/gallium/auxiliary/util/SConscript
+++ b/src/gallium/auxiliary/util/SConscript
@@ -11,13 +11,14 @@ util = env.ConvenienceLibrary(
 		'u_gen_mipmap.c',
 		'u_handle_table.c',
 		'u_hash_table.c',
+		'u_keymap.c',
 		'u_math.c',
 		'u_mm.c',
 		'u_rect.c',
 		'u_simple_shaders.c',
 		'u_snprintf.c',
-        'u_stream_stdc.c',
-        'u_stream_wd.c',
+		'u_stream_stdc.c',
+		'u_stream_wd.c',
 		'u_tile.c',
 		'u_time.c',
 	])
diff --git a/src/gallium/auxiliary/util/u_keymap.c b/src/gallium/auxiliary/util/u_keymap.c
new file mode 100644
index 00000000000..01b17ddb1b3
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.c
@@ -0,0 +1,309 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * Key lookup/associative container.
+ *
+ * Like Jose's u_hash_table, based on CSO cache code for now.
+ *
+ * Author: Brian Paul
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_error.h"
+
+#include "cso_cache/cso_hash.h"
+
+#include "util/u_memory.h"
+#include "util/u_keymap.h"
+
+
+struct keymap
+{
+   struct cso_hash *cso;   /* underlying table, indexed by hash(key) */
+   unsigned key_size; /* size of each key, in bytes */
+   unsigned max_entries; /* XXX not obeyed yet */
+   unsigned num_entries; /* number of entries currently stored */
+   keymap_delete_func delete_func; /* run when an entry is removed/replaced */
+};
+
+
+struct keymap_item
+{
+   void *key, *value;
+};
+
+
+/**
+ * This is the default key-delete function used when the client doesn't
+ * provide one.
+ */
+static void
+default_delete_func(const struct keymap *map,
+                    const void *key, void *data, void *user)
+{
+   FREE((void*) data);
+}
+
+
+static INLINE struct keymap_item *
+hash_table_item(struct cso_hash_iter iter)
+{
+   return (struct keymap_item *) cso_hash_iter_data(iter);
+}
+
+
+/**
+ * Return 4-byte hash key for a block of bytes.
+ */
+static unsigned
+hash(const void *key, unsigned keySize)
+{
+   unsigned i, hash;
+
+   keySize /= 4; /* convert from bytes to uints */
+
+   hash = 0;
+   for (i = 0; i < keySize; i++) {
+      hash ^= (i + 1) * ((const unsigned *) key)[i];
+   }
+
+   /*hash = hash ^ (hash >> 11) ^ (hash >> 22);*/
+
+   return hash;
+}
+
+
+/**
+ * Create a new map.
+ * \param keySize  size of the keys in bytes
+ * \param maxEntries  max number of entries to allow (~0 = infinity)
+ * \param deleteFunc  optional callback to call when entries
+ *                    are deleted/replaced
+ */
+struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                 keymap_delete_func deleteFunc)
+{
+   struct keymap *map = MALLOC_STRUCT(keymap);
+   if (!map)
+      return NULL;
+   
+   map->cso = cso_hash_create();
+   if (!map->cso) {
+      FREE(map);
+      return NULL;
+   }
+   
+   map->max_entries = maxEntries;
+   map->num_entries = 0;
+   map->key_size = keySize;
+   map->delete_func = deleteFunc ? deleteFunc : default_delete_func;
+
+   return map;
+}
+
+
+/**
+ * Delete/free a keymap and all entries.  The deleteFunc that was given at
+ * create time will be called for each entry.
+ * \param user  user-provided pointer passed through to the delete callback
+ */
+void
+util_delete_keymap(struct keymap *map, void *user)
+{
+   util_keymap_remove_all(map, user);
+   cso_hash_delete(map->cso);
+   FREE(map);
+}
+
+
+static INLINE struct cso_hash_iter
+hash_table_find_iter(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+   
+   iter = cso_hash_find(map->cso, key_hash);
+   while (!cso_hash_iter_is_null(iter)) {
+      item = (struct keymap_item *) cso_hash_iter_data(iter);
+      if (!memcmp(item->key, key, map->key_size))
+         break;
+      iter = cso_hash_iter_next(iter);
+   }
+   
+   return iter;
+}
+
+
+static INLINE struct keymap_item *
+hash_table_find_item(const struct keymap *map, const void *key,
+                     unsigned key_hash)
+{
+   struct cso_hash_iter iter = hash_table_find_iter(map, key, key_hash);
+   if (cso_hash_iter_is_null(iter)) {
+      return NULL;
+   }
+   else {
+      return hash_table_item(iter);
+   }
+}
+
+
+/**
+ * Insert a new key + data pointer into the table.
+ * Note: we create a copy of the key, but not the data!
+ * If the key is already present in the table, replace the existing
+ * entry (calling the delete callback on the previous entry).
+ * Note that map->max_entries is not enforced by this function.
+ * \return TRUE on success, FALSE if allocation or insertion failed.
+ */
+boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   item = hash_table_find_item(map, key, key_hash);
+   if (item) {
+      /* call delete callback for old entry/item */
+      map->delete_func(map, item->key, item->value, user);
+      item->value = (void *) data;
+      return TRUE;
+   }
+
+   item = MALLOC_STRUCT(keymap_item);
+   if (!item)
+      return FALSE;
+
+   item->key = mem_dup(key, map->key_size);
+   item->value = (void *) data;
+   /* On failure drop the key copy too.  NOTE(review): FREE(NULL) is
+    * assumed to be a no-op, as with free() -- confirm. */
+   if (!item->key ||
+       cso_hash_iter_is_null(cso_hash_insert(map->cso, key_hash, item))) {
+      FREE(item->key);
+      FREE(item);
+      return FALSE;
+   }
+
+   map->num_entries++;
+   return TRUE;
+}
+
+
+/**
+ * Look up a key in the map and return the associated data pointer.
+ */
+const void *
+util_keymap_lookup(const struct keymap *map, const void *key)
+{
+   unsigned key_hash;
+   struct keymap_item *item;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   item = hash_table_find_item(map, key, key_hash);
+   if (!item)
+      return NULL;
+   
+   return item->value;
+}
+
+
+/**
+ * Remove an entry from the map.
+ * The delete callback will be called if the given key/entry is found.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove(struct keymap *map, const void *key, void *user)
+{
+   unsigned key_hash;
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+
+   key_hash = hash(key, map->key_size);
+
+   iter = hash_table_find_iter(map, key, key_hash);
+   if (cso_hash_iter_is_null(iter))
+      return;
+   
+   item = hash_table_item(iter);
+   assert(item);
+   map->delete_func(map, item->key, item->value, user);
+   FREE(item->key);
+   FREE(item);
+   
+   map->num_entries--;
+
+   cso_hash_erase(map->cso, iter);
+}
+
+
+/**
+ * Remove all entries from the map, calling the delete callback for each.
+ * \param user  passed to the delete callback as the last param.
+ */
+void
+util_keymap_remove_all(struct keymap *map, void *user)
+{
+   struct cso_hash_iter iter;
+   struct keymap_item *item;
+
+   assert(map);
+   iter = cso_hash_first_node(map->cso);
+   while (!cso_hash_iter_is_null(iter)) {
+      item = (struct keymap_item *)
+         cso_hash_take(map->cso, cso_hash_iter_key(iter));
+      map->delete_func(map, item->key, item->value, user);
+      FREE(item->key);
+      FREE(item);
+      iter = cso_hash_first_node(map->cso);
+   }
+   map->num_entries = 0;   /* keep the count in step with the emptied table */
+}
+
+
+extern void
+util_keymap_info(const struct keymap *map)
+{
+   debug_printf("Keymap %p: %u of max %u entries\n",
+                (void *) map, map->num_entries, map->max_entries);
+}
diff --git a/src/gallium/auxiliary/util/u_keymap.h b/src/gallium/auxiliary/util/u_keymap.h
new file mode 100644
index 00000000000..8d60a76fc3c
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_keymap.h
@@ -0,0 +1,68 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef U_KEYMAP_H
+#define U_KEYMAP_H
+
+#include "pipe/p_compiler.h"
+
+
+/** opaque keymap type */
+struct keymap;
+
+
+/** Delete/callback function type */
+typedef void (*keymap_delete_func)(const struct keymap *map,
+                                   const void *key, void *data,
+                                   void *user);
+
+
+extern struct keymap *
+util_new_keymap(unsigned keySize, unsigned maxEntries,
+                keymap_delete_func deleteFunc);
+
+extern void
+util_delete_keymap(struct keymap *map, void *user);
+
+extern boolean
+util_keymap_insert(struct keymap *map, const void *key,
+                   const void *data, void *user);
+
+extern const void *
+util_keymap_lookup(const struct keymap *map, const void *key);
+
+extern void
+util_keymap_remove(struct keymap *map, const void *key, void *user);
+
+extern void
+util_keymap_remove_all(struct keymap *map, void *user);
+
+extern void
+util_keymap_info(const struct keymap *map);
+
+
+#endif /* U_KEYMAP_H */
diff --git a/src/gallium/auxiliary/util/u_math.c b/src/gallium/auxiliary/util/u_math.c
index 0729114d6a1..5b3cab4642a 100644
--- a/src/gallium/auxiliary/util/u_math.c
+++ b/src/gallium/auxiliary/util/u_math.c
@@ -30,7 +30,7 @@
 #include "util/u_math.h"
 
 
-
+/** 2^x, for x in [-1.0, 1.0[ */
 float pow2_table[POW2_TABLE_SIZE];
 
 
@@ -38,9 +38,21 @@ static void
 init_pow2_table(void)
 {
    int i;
-   for (i = 0; i < POW2_TABLE_SIZE; i++) {
-      pow2_table[i] = (float) pow(2.0, i / POW2_TABLE_SCALE);
-   }
+   for (i = 0; i < POW2_TABLE_SIZE; i++)
+      pow2_table[i] = (float) pow(2.0, (i - POW2_TABLE_OFFSET) / POW2_TABLE_SCALE);
+}
+
+
+/** log2(x), for x in [1.0, 2.0[ */
+float log2_table[LOG2_TABLE_SIZE];
+
+
+static void 
+init_log2_table(void)
+{
+   unsigned i;
+   for (i = 0; i < LOG2_TABLE_SIZE; i++)
+      log2_table[i] = (float) log2(1.0 + i * (1.0 / LOG2_TABLE_SIZE));
 }
 
 
@@ -53,6 +65,7 @@ util_init_math(void)
    static boolean initialized = FALSE;
    if (!initialized) {
       init_pow2_table();
+      init_log2_table();
       initialized = TRUE;
    }
 }
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 084655e6ddc..be7303e5503 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -174,8 +174,10 @@ static INLINE float logf( float f )
 
 
 
-#define POW2_TABLE_SIZE 256
-#define POW2_TABLE_SCALE ((float) (POW2_TABLE_SIZE-1))
+#define POW2_TABLE_SIZE_LOG2 9
+#define POW2_TABLE_SIZE (1 << POW2_TABLE_SIZE_LOG2)
+#define POW2_TABLE_OFFSET (POW2_TABLE_SIZE/2)
+#define POW2_TABLE_SCALE ((float)(POW2_TABLE_SIZE/2))
 extern float pow2_table[POW2_TABLE_SIZE];
 
 
@@ -186,98 +188,78 @@ util_init_math(void);
 
 union fi {
    float f;
-   int i;
-   unsigned ui;
+   int32_t i;
+   uint32_t ui;
 };
 
 
 /**
- * Fast approximation to exp(x).
- * Compute with base 2 exponents:  exp(x) = exp2(log2(e) * x)
- * Note: log2(e) is a constant, k = 1.44269
- * So, exp(x) = exp2(k * x);
+ * Fast version of 2^x
  * Identity: exp2(a + b) = exp2(a) * exp2(b)
- * Let ipart = int(k*x)
- * Let fpart = k*x - ipart;
- * So, exp2(k*x) = exp2(ipart) * exp2(fpart)
+ * Let ipart = int(x)
+ * Let fpart = x - ipart;
+ * So, exp2(x) = exp2(ipart) * exp2(fpart)
  * Compute exp2(ipart) with i << ipart
  * Compute exp2(fpart) with lookup table.
  */
 static INLINE float
-util_fast_exp(float x)
+util_fast_exp2(float x)
 {
-   if (x >= 0.0f) {
-      float k = 1.44269f; /* = log2(e) */
-      float kx = k * x;
-      int ipart = (int) kx;
-      float fpart = kx - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return y;
-   }
-   else {
-      /* exp(-x) = 1.0 / exp(x) */
-      float k = -1.44269f;
-      float kx = k * x;
-      int ipart = (int) kx;
-      float fpart = kx - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return 1.0f / y;
-   }
+   int32_t ipart;
+   float fpart, mpart;
+   union fi epart;
+   
+   if(x > 129.00000f)
+      return 3.402823466e+38f;
+   
+   if(x < -126.99999f)
+      return 0.0f;
+
+   ipart = (int32_t) x;
+   fpart = x - (float) ipart;
+   
+   /* same as
+    *   epart.f = (float) (1 << ipart)
+    * but faster and without integer overflow for ipart > 31 */
+   epart.i = (ipart + 127 ) << 23;
+   
+   mpart = pow2_table[POW2_TABLE_OFFSET + (int)(fpart * POW2_TABLE_SCALE)];
+   
+   return epart.f * mpart;
 }
 
 
 /**
- * Fast version of 2^x
- * XXX the above function could be implemented in terms of this one.
+ * Fast approximation to exp(x).
  */
 static INLINE float
-util_fast_exp2(float x)
+util_fast_exp(float x)
 {
-   if (x >= 0.0f) {
-      int ipart = (int) x;
-      float fpart = x - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return y;
-   }
-   else {
-      /* exp(-x) = 1.0 / exp(x) */
-      int ipart = (int) -x;
-      float fpart = -x - (float) ipart;
-      float y = (float) (1 << ipart)
-         * pow2_table[(int) (fpart * POW2_TABLE_SCALE)];
-      return 1.0f / y;
-   }
+   const float k = 1.44269f; /* = log2(e) */
+   return util_fast_exp2(k * x);
 }
 
 
-/**
- * Based on code from http://www.flipcode.com/archives/Fast_log_Function.shtml
- */
+#define LOG2_TABLE_SIZE_LOG2 8
+#define LOG2_TABLE_SIZE (1 << LOG2_TABLE_SIZE_LOG2)
+extern float log2_table[LOG2_TABLE_SIZE];
+
+
 static INLINE float
-util_fast_log2(float val)
+util_fast_log2(float x)
 {
    union fi num;
-   int log_2;
-   num.f = val;
-   log_2 = ((num.i >> 23) & 255) - 128;
-   num.i &= ~(255 << 23);
-   num.i += 127 << 23;
-   num.f = ((-1.0f/3) * num.f + 2) * num.f - 2.0f/3;
-   return num.f + log_2;
+   float epart, mpart;
+   num.f = x;
+   epart = (float)(((num.i & 0x7f800000) >> 23) - 127);
+   mpart = log2_table[(num.i & 0x007fffff) >> (23 - LOG2_TABLE_SIZE_LOG2)];
+   return epart + mpart;
 }
 
 
 static INLINE float
 util_fast_pow(float x, float y)
 {
-   /* XXX these tests may need adjustment */
-   if (y >= 3.0f && (-0.02f <= x && x <= 0.02f))
-      return 0.0f;
-   if (y >= 50.0f && (-0.9f <= x && x <= 0.9f))
-      return 0.0f;
    return util_fast_exp2(util_fast_log2(x) * y);
 }
 
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
index 0c8356cd057..e2a8491e62c 100644
--- a/src/gallium/auxiliary/util/u_sse.h
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -37,6 +37,10 @@
 #ifndef U_SSE_H_
 #define U_SSE_H_
 
+#include "pipe/p_config.h"
+
+#if defined(PIPE_ARCH_SSE)
+
 #include <xmmintrin.h>
 #include <emmintrin.h>
 
@@ -66,7 +70,8 @@ _mm_castps_si128(__m128 a)
    return u.m128i;
 }
 
-#endif
+#endif /* defined(_MSC_VER) && _MSC_VER < 1500 */
 
+#endif /* PIPE_ARCH_SSE */
 
 #endif /* U_SSE_H_ */
diff --git a/src/gallium/drivers/cell/common.h b/src/gallium/drivers/cell/common.h
index 99329fd8e22..b0169b8e329 100644
--- a/src/gallium/drivers/cell/common.h
+++ b/src/gallium/drivers/cell/common.h
@@ -67,6 +67,7 @@
 #define CELL_MAX_SPUS 6
 
 #define CELL_MAX_SAMPLERS 4
+#define CELL_MAX_TEXTURE_LEVELS 12  /* 2k x 2k */
 
 #define TILE_SIZE 32
 
@@ -94,6 +95,7 @@
 #define CELL_CMD_STATE_BIND_VS       18
 #define CELL_CMD_STATE_FRAGMENT_PROGRAM 19
 #define CELL_CMD_STATE_ATTRIB_FETCH  20
+#define CELL_CMD_STATE_FS_CONSTANTS  21
 #define CELL_CMD_VS_EXECUTE          22
 #define CELL_CMD_FLUSH_BUFFER_RANGE  23
 
@@ -127,7 +129,7 @@ struct cell_command_fragment_ops
 
 
 /** Max instructions for fragment programs */
-#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 128
+#define SPU_MAX_FRAGMENT_PROGRAM_INSTS 512
 
 /**
  * Command to send a fragment program to SPUs.
@@ -227,6 +229,7 @@ struct cell_command_render
    float xmin, ymin, xmax, ymax;  /* XXX another dummy field */
    uint min_index;
    boolean inline_verts;
+   uint front_winding; /* the rasterizer needs to be able to determine facing to apply front/back-facing stencil */
 };
 
 
@@ -248,9 +251,12 @@ struct cell_command_sampler
 struct cell_command_texture
 {
    uint64_t opcode;     /**< CELL_CMD_STATE_TEXTURE */
+   uint target;         /**< PIPE_TEXTURE_x */
    uint unit;
-   void *start;         /**< Address in main memory */
-   ushort width, height;
+   void *start[CELL_MAX_TEXTURE_LEVELS];   /**< Address in main memory */
+   ushort width[CELL_MAX_TEXTURE_LEVELS];
+   ushort height[CELL_MAX_TEXTURE_LEVELS];
+   ushort depth[CELL_MAX_TEXTURE_LEVELS];
 };
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_context.c b/src/gallium/drivers/cell/ppu/cell_context.c
index 62e213ea354..b66aa9c9d99 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.c
+++ b/src/gallium/drivers/cell/ppu/cell_context.c
@@ -62,6 +62,8 @@ cell_destroy_context( struct pipe_context *pipe )
 {
    struct cell_context *cell = cell_context(pipe);
 
+   util_delete_keymap(cell->fragment_ops_cache, NULL);
+
    cell_spu_exit(cell);
 
    align_free(cell);
@@ -126,11 +128,14 @@ cell_create_context(struct pipe_screen *screen,
    cell_init_state_functions(cell);
    cell_init_shader_functions(cell);
    cell_init_surface_functions(cell);
-   cell_init_texture_functions(cell);
    cell_init_vertex_functions(cell);
 
    cell->draw = cell_draw_create(cell);
 
+   /* Create cache of fragment ops generated code */
+   cell->fragment_ops_cache =
+      util_new_keymap(sizeof(struct cell_fragment_ops_key), ~0, NULL);
+
    cell_init_vbuf(cell);
 
    draw_set_rasterize_stage(cell->draw, cell->vbuf);
@@ -156,5 +161,8 @@ cell_create_context(struct pipe_screen *screen,
 
    cell_init_batch_buffers(cell);
 
+   /* make sure SPU initializations are done before proceeding */
+   cell_flush_int(cell, CELL_FLUSH_WAIT);
+
    return &cell->pipe;
 }
diff --git a/src/gallium/drivers/cell/ppu/cell_context.h b/src/gallium/drivers/cell/ppu/cell_context.h
index 3dc15c9233c..80a9b3d7e13 100644
--- a/src/gallium/drivers/cell/ppu/cell_context.h
+++ b/src/gallium/drivers/cell/ppu/cell_context.h
@@ -38,6 +38,7 @@
 #include "cell/common.h"
 #include "rtasm/rtasm_ppc_spe.h"
 #include "tgsi/tgsi_scan.h"
+#include "util/u_keymap.h"
 
 
 struct cell_vbuf_render;
@@ -67,6 +68,19 @@ struct cell_fragment_shader_state
 
 
 /**
+ * Key for mapping per-fragment state to cached SPU machine code.
+ *  keymap(cell_fragment_ops_key) => cell_command_fragment_ops
+ */
+struct cell_fragment_ops_key
+{
+   struct pipe_blend_state blend;
+   struct pipe_depth_stencil_alpha_state dsa;
+   enum pipe_format color_format;
+   enum pipe_format zs_format;
+};
+
+
+/**
  * Per-context state, subclass of pipe_context.
  */
 struct cell_context
@@ -107,6 +121,9 @@ struct cell_context
 
    uint dirty;
 
+   /** Cache of code generated for per-fragment ops */
+   struct keymap *fragment_ops_cache;
+
    /** The primitive drawing context */
    struct draw_context *draw;
    struct draw_stage *render_stage;
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fp.c b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
index 8d2d4f2a0f2..3dfd5f673dd 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fp.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fp.c
@@ -77,13 +77,15 @@ struct codegen
 
    /** Per-instruction temps / intermediate temps */
    int num_itemps;
-   int itemps[10];
+   int itemps[12];
 
    /** Current IF/ELSE/ENDIF nesting level */
    int if_nesting;
    /** Index of execution mask register */
    int exec_mask_reg;
 
+   int frame_size;  /**< Stack frame size, in bytes (added directly to $sp) */
+
    struct spe_function *f;
    boolean error;
 };
@@ -165,6 +167,37 @@ get_exec_mask_reg(struct codegen *gen)
 }
 
 
+static boolean
+is_register_src(struct codegen *gen, int channel,
+                const struct tgsi_full_src_register *src)
+{
+   int swizzle = tgsi_util_get_full_src_register_extswizzle(src, channel);
+   int sign_op = tgsi_util_get_full_src_register_sign_mode(src, channel);
+
+   if (swizzle > TGSI_SWIZZLE_W || sign_op != TGSI_UTIL_SIGN_KEEP) {
+      return FALSE;
+   }
+   if (src->SrcRegister.File == TGSI_FILE_TEMPORARY ||
+       src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+      return TRUE;
+   }
+   return FALSE;
+}
+
+  
+static boolean
+is_memory_dst(struct codegen *gen, int channel,
+              const struct tgsi_full_dst_register *dst)
+{
+   if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+      return TRUE;
+   }
+   else {
+      return FALSE;
+   }
+}
+
+  
 /**
  * Return the index of the SPU temporary containing the named TGSI
  * source register.  If the TGSI register is a TGSI_FILE_TEMPORARY we
@@ -185,41 +218,48 @@ get_src_reg(struct codegen *gen,
    assert(swizzle >= TGSI_SWIZZLE_X);
    assert(swizzle <= TGSI_EXTSWIZZLE_ONE);
 
-   switch (src->SrcRegister.File) {
-   case TGSI_FILE_TEMPORARY:
-      reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
-      break;
-   case TGSI_FILE_INPUT:
-      {
-         if(swizzle == TGSI_EXTSWIZZLE_ONE)
-         {
-            /* Load const one float and early out */
-            reg = get_const_one_reg(gen);
-         }
-         else if(swizzle == TGSI_EXTSWIZZLE_ZERO)
+   if (swizzle == TGSI_EXTSWIZZLE_ONE) {
+      /* Load const one float and early out */
+      reg = get_const_one_reg(gen);
+   }
+   else if (swizzle == TGSI_EXTSWIZZLE_ZERO) {
+      /* Load const zero float and early out */
+      reg = get_itemp(gen);
+      spe_xor(gen->f, reg, reg, reg);
+   }
+   else {
+      assert(swizzle < 4);
+
+      switch (src->SrcRegister.File) {
+      case TGSI_FILE_TEMPORARY:
+         reg = gen->temp_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_INPUT:
          {
-            /* Load const zero float and early out */
+            /* offset is measured in quadwords, not bytes */
+            int offset = src->SrcRegister.Index * 4 + swizzle;
             reg = get_itemp(gen);
-            spe_xor(gen->f, reg, reg, reg);
+            reg_is_itemp = TRUE;
+            /* Load:  reg = memory[(machine_reg) + offset] */
+            spe_lqd(gen->f, reg, gen->inputs_reg, offset * 16);
          }
-         else
+         break;
+      case TGSI_FILE_IMMEDIATE:
+         reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
+         break;
+      case TGSI_FILE_CONSTANT:
          {
             /* offset is measured in quadwords, not bytes */
             int offset = src->SrcRegister.Index * 4 + swizzle;
             reg = get_itemp(gen);
             reg_is_itemp = TRUE;
             /* Load:  reg = memory[(machine_reg) + offset] */
-            spe_lqd(gen->f, reg, gen->inputs_reg, offset);
+            spe_lqd(gen->f, reg, gen->constants_reg, offset * 16);
          }
+         break;
+      default:
+         assert(0);
       }
-      break;
-   case TGSI_FILE_IMMEDIATE:
-      reg = gen->imm_regs[src->SrcRegister.Index][swizzle];
-      break;
-   case TGSI_FILE_CONSTANT:
-      /* xxx fall-through for now / fix */
-   default:
-      assert(0);
    }
 
    /*
@@ -243,7 +283,7 @@ get_src_reg(struct codegen *gen,
       }
 
       /* mask with bit 31 set, the rest cleared */
-      spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+      spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
 
       if (sign_op == TGSI_UTIL_SIGN_CLEAR) {
          spe_andc(gen->f, result_reg, reg, bit31mask_reg);
@@ -318,6 +358,7 @@ store_dest_reg(struct codegen *gen,
       }
       else {
          /* we're not inside a condition or loop: do nothing special */
+
       }
       break;
    case TGSI_FILE_OUTPUT:
@@ -330,17 +371,17 @@ store_dest_reg(struct codegen *gen,
             /* First read the current value from memory:
              * Load:  curval = memory[(machine_reg) + offset]
              */
-            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_lqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
             /* Mix curval with newvalue according to exec mask:
              * d[i] = mask_reg[i] ? value_reg : d_reg
              */
             spe_selb(gen->f, curval_reg, curval_reg, value_reg, exec_reg);
             /* Store: memory[(machine_reg) + offset] = curval */
-            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, curval_reg, gen->outputs_reg, offset * 16);
          }
          else {
             /* Store: memory[(machine_reg) + offset] = reg */
-            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset);
+            spe_stqd(gen->f, value_reg, gen->outputs_reg, offset * 16);
          }
       }
       break;
@@ -350,18 +391,95 @@ store_dest_reg(struct codegen *gen,
 }
 
 
+
+static void
+emit_prologue(struct codegen *gen)
+{
+   gen->frame_size = 1024; /* XXX temporary, should be dynamic */
+
+   spe_comment(gen->f, -4, "Function prologue:");
+
+   /* save $lr on stack     # stqd $lr,16($sp) */
+   spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   if (gen->frame_size >= 512) {
+      /* offset is too large for ai instruction */
+      int offset_reg = spe_allocate_available_register(gen->f);
+      int sp_reg = spe_allocate_available_register(gen->f);
+      /* offset = -framesize */
+      spe_load_int(gen->f, offset_reg, -gen->frame_size);
+      /* sp = $sp */
+      spe_move(gen->f, sp_reg, SPE_REG_SP);
+      /* $sp = $sp + offset_reg */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+      /* save $sp in stack frame */
+      spe_stqd(gen->f, sp_reg, SPE_REG_SP, 0);
+      /* clean up */
+      spe_release_register(gen->f, offset_reg);
+      spe_release_register(gen->f, sp_reg);
+   }
+   else {
+      /* save stack pointer    # stqd $sp,-frameSize($sp) */
+      spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+
+      /* adjust stack pointer  # ai $sp,$sp,-frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -gen->frame_size);
+   }
+}
+
+
+static void
+emit_epilogue(struct codegen *gen)
+{
+   spe_comment(gen->f, -4, "Function epilogue:");
+
+   if (gen->frame_size >= 512) {
+      /* offset is too large for ai instruction */
+      int offset_reg = spe_allocate_available_register(gen->f);
+      /* offset = framesize */
+      spe_load_int(gen->f, offset_reg, gen->frame_size);
+      /* $sp = $sp + offset */
+      spe_a(gen->f, SPE_REG_SP, SPE_REG_SP, offset_reg);
+      /* clean up */
+      spe_release_register(gen->f, offset_reg);
+   }
+   else {
+      /* restore stack pointer    # ai $sp,$sp,frameSize */
+      spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, gen->frame_size);
+   }
+
+   /* restore $lr              # lqd $lr,16($sp) */
+   spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
+
+   /* return from function call */
+   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+}
+
+
 static boolean
 emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, src_reg[4], dst_reg[4];
+
    spe_comment(gen->f, -4, "MOV:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int src_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int dst_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         /* XXX we don't always need to actually emit a mov instruction here */
-         spe_move(gen->f, dst_reg, src_reg);
-         store_dest_reg(gen, dst_reg, ch, &inst->FullDstRegisters[0]);
+         src_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         dst_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         if (is_register_src(gen, ch, &inst->FullSrcRegisters[0]) &&
+             is_memory_dst(gen, ch, &inst->FullDstRegisters[0])) {
+            /* special-case: register to memory store */
+            store_dest_reg(gen, src_reg[ch], ch, &inst->FullDstRegisters[0]);
+         }
+         else {
+            spe_move(gen->f, dst_reg[ch], src_reg[ch]);
+            store_dest_reg(gen, dst_reg[ch], ch, &inst->FullDstRegisters[0]);
+         }
-         free_itemps(gen);
       }
    }
+   free_itemps(gen);  /* only after every channel: later channels still read them */
@@ -376,22 +494,25 @@ emit_MOV(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
+
    spe_comment(gen->f, -4, "ADD:");
-   /* Loop over Red/Green/Blue/Alpha channels */
+   /* Loop over Red/Green/Blue/Alpha channels, fetch src operands */
    for (ch = 0; ch < 4; ch++) {
       /* If the dest R, G, B or A writemask is enabled... */
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         /* get indexes of the two src, one dest SPE registers */
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   /* Loop over Red/Green/Blue/Alpha channels, do the add, store results */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* Emit actual SPE instruction: d = s1 + s2 */
-         spe_fa(gen->f, d_reg, s1_reg, s2_reg);
-
+         spe_fa(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
          /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
          /* Free any intermediate temps we allocated */
          free_itemps(gen);
       }
@@ -405,23 +526,20 @@ emit_ADD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
    spe_comment(gen->f, -4, "SUB:");
-   /* Loop over Red/Green/Blue/Alpha channels */
    for (ch = 0; ch < 4; ch++) {
-      /* If the dest R, G, B or A writemask is enabled... */
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         /* get indexes of the two src, one dest SPE registers */
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-
-         /* Emit actual SPE instruction: d = s1 - s2 */
-         spe_fs(gen->f, d_reg, s1_reg, s2_reg);
-
-         /* Store the result (a no-op for TGSI_FILE_TEMPORARY dests) */
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         /* Free any intermediate temps we allocated */
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         /* d = s1 - s2 */
+         spe_fs(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
       }
    }
+   free_itemps(gen);  /* only after every channel: later channels still read them */
@@ -434,17 +552,21 @@ emit_SUB(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4];
    spe_comment(gen->f, -4, "MAD:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* d = s1 * s2 + s3 */
-         spe_fma(gen->f, d_reg, s1_reg, s2_reg, s3_reg);
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         spe_fma(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch], s3_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
       }
    }
+   free_itemps(gen);  /* only after every channel: later channels still read them */
@@ -458,21 +580,37 @@ emit_MAD(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], s3_reg[4], d_reg[4], tmp_reg[4];
    spe_comment(gen->f, -4, "LERP:");
+   /* setup/get src/dst/temp regs */
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int s3_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         /* d = s3 + s1(s2 - s3) */
-         spe_fs(gen->f, d_reg, s2_reg, s3_reg);
-         spe_fma(gen->f, d_reg, d_reg, s1_reg, s3_reg);
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         s3_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[2]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);
+      }
+   }
+
+   /* d = s3 + s1(s2 - s3) */
+   /* do all subtracts, then all fma, then all stores to better pipeline */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fs(gen->f, tmp_reg[ch], s2_reg[ch], s3_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fma(gen->f, d_reg[ch], tmp_reg[ch], s1_reg[ch], s3_reg[ch]);
       }
    }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   free_itemps(gen);
    return true;
 }
 
@@ -482,16 +620,20 @@ emit_LERP(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MUL(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s1_reg[4], s2_reg[4], d_reg[4];
    spe_comment(gen->f, -4, "MUL:");
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s2_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
          /* d = s1 * s2 */
-         spe_fm(gen->f, d_reg, s1_reg, s2_reg);
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
+         spe_fm(gen->f, d_reg[ch], s1_reg[ch], s2_reg[ch]);
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
       }
    }
+   free_itemps(gen);  /* only after every channel: later channels still read them */
@@ -557,7 +699,7 @@ emit_ABS(struct codegen *gen, const struct tgsi_full_instruction *inst)
          const int bit31mask_reg = get_itemp(gen);
 
          /* mask with bit 31 set, the rest cleared */  
-         spe_load_int(gen->f, bit31mask_reg, (1 << 31));
+         spe_load_uint(gen->f, bit31mask_reg, (1 << 31));
 
          /* d = sign bit cleared in s1 */
          spe_andc(gen->f, d_reg, s1_reg, bit31mask_reg);
@@ -576,27 +718,36 @@ static boolean
 emit_DP3(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   int s1x_reg, s1y_reg, s1z_reg;
+   int s2x_reg, s2y_reg, s2z_reg;
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
    spe_comment(gen->f, -4, "DP3:");
 
-   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
-   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
-   int tmp_reg = get_itemp(gen);
-   /* t = x0 * x1 */
-   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s2x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s2y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s2z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
 
-   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
-   /* t = y0 * y1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t0 = x0 * x1 */
+   spe_fm(gen->f, t0_reg, s1x_reg, s2x_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
-   /* t = z0 * z1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t1 = y0 * y1 */
+   spe_fm(gen->f, t1_reg, s1y_reg, s2y_reg);
+
+   /* t0 = z0 * z1 + t0 */
+   spe_fma(gen->f, t0_reg, s1z_reg, s2z_reg, t0_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, t0_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -611,32 +762,41 @@ static boolean
 emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
+   int s0x_reg, s0y_reg, s0z_reg, s0w_reg;
+   int s1x_reg, s1y_reg, s1z_reg, s1w_reg;
+   int t0_reg = get_itemp(gen), t1_reg = get_itemp(gen);
+
    spe_comment(gen->f, -4, "DP4:");
 
-   int s1_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
-   int s2_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
-   int tmp_reg = get_itemp(gen);
-   /* t = x0 * x1 */
-   spe_fm(gen->f, tmp_reg, s1_reg, s2_reg);
+   s0x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[0]);
+   s1x_reg = get_src_reg(gen, CHAN_X, &inst->FullSrcRegisters[1]);
+   s0y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
+   s1y_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
+   s0z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
+   s1z_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
+   s0w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
+   s1w_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
 
-   s1_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Y, &inst->FullSrcRegisters[1]);
-   /* t = y0 * y1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t0 = x0 * x1 */
+   spe_fm(gen->f, t0_reg, s0x_reg, s1x_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_Z, &inst->FullSrcRegisters[1]);
-   /* t = z0 * z1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t1 = y0 * y1 */
+   spe_fm(gen->f, t1_reg, s0y_reg, s1y_reg);
 
-   s1_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[0]);
-   s2_reg = get_src_reg(gen, CHAN_W, &inst->FullSrcRegisters[1]);
-   /* t = w0 * w1 + t */
-   spe_fma(gen->f, tmp_reg, s1_reg, s2_reg, tmp_reg);
+   /* t0 = z0 * z1 + t0 */
+   spe_fma(gen->f, t0_reg, s0z_reg, s1z_reg, t0_reg);
+
+   /* t1 = w0 * w1 + t1 */
+   spe_fma(gen->f, t1_reg, s0w_reg, s1w_reg, t1_reg);
+
+   /* t0 = t0 + t1 */
+   spe_fa(gen->f, t0_reg, t0_reg, t1_reg);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, t0_reg);
+         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
 
@@ -650,6 +810,7 @@ emit_DP4(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
+   /* XXX rewrite this function to look more like DP3/DP4 */
    int ch;
    spe_comment(gen->f, -4, "DPH:");
 
@@ -676,6 +837,8 @@ emit_DPH(struct codegen *gen, const struct tgsi_full_instruction *inst)
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         spe_move(gen->f, d_reg, tmp_reg);
          store_dest_reg(gen, tmp_reg, ch, &inst->FullDstRegisters[0]);
       }
    }
@@ -1016,15 +1179,15 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int tmp_reg = get_itemp(gen);
 
          /* If negative, subtract 1.0 */
-         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
-         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
-         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+         spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+         spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
 
          /* Convert float to int */
-         spe_cflts(gen->f, d_reg, d_reg, 0);
+         spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
 
          /* Convert int to float */
-         spe_csflt(gen->f, d_reg, d_reg, 0);
+         spe_csflt(gen->f, d_reg, tmp_reg, 0);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1035,15 +1198,14 @@ emit_FLR(struct codegen *gen, const struct tgsi_full_instruction *inst)
 }
 
 /**
- * Emit frac.  
- * Input - FLR(Input)
+ * Compute frac = Input - FLR(Input)
  */
 static boolean
 emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
    int ch;
 
-   spe_comment(gen->f, -4, "FLR:");
+   spe_comment(gen->f, -4, "FRC:");
 
    int zero_reg = get_itemp(gen);
    spe_xor(gen->f, zero_reg, zero_reg, zero_reg);
@@ -1055,18 +1217,18 @@ emit_FRC(struct codegen *gen, const struct tgsi_full_instruction *inst)
          int tmp_reg = get_itemp(gen);
 
          /* If negative, subtract 1.0 */
-         spe_fcgt(gen->f, d_reg, zero_reg, s1_reg);
-         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), d_reg);
-         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
+         spe_fcgt(gen->f, tmp_reg, zero_reg, s1_reg);
+         spe_selb(gen->f, tmp_reg, zero_reg, get_const_one_reg(gen), tmp_reg);
+         spe_fs(gen->f, tmp_reg, s1_reg, tmp_reg);
 
          /* Convert float to int */
-         spe_cflts(gen->f, d_reg, d_reg, 0);
+         spe_cflts(gen->f, tmp_reg, tmp_reg, 0);
 
          /* Convert int to float */
-         spe_csflt(gen->f, d_reg, d_reg, 0);
+         spe_csflt(gen->f, tmp_reg, tmp_reg, 0);
 
          /* d = s1 - FLR(s1) */
-         spe_fs(gen->f, d_reg, s1_reg, d_reg);
+         spe_fs(gen->f, d_reg, s1_reg, tmp_reg);
 
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
@@ -1091,6 +1253,21 @@ print_functions(struct cell_context *cell)
 #endif
 
 
+static uint
+lookup_function(struct cell_context *cell, const char *funcname)
+{
+   const struct cell_spu_function_info *funcs = &cell->spu_functions;
+   uint i, addr = 0;  /* addr == 0 means "not found" */
+   for (i = 0; i < funcs->num; i++) {
+      if (strcmp(funcs->names[i], funcname) == 0) {
+         addr = funcs->addrs[i];  /* no break: the last matching entry wins */
+      }
+   }
+   assert(addr && "spu function not found");
+   return addr / 4;  /* discard 2 least significant bits */
+}
+
+
 /**
  * Emit code to call a SPU function.
  * Used to implement instructions like SIN/COS/POW/TEX/etc.
@@ -1100,77 +1277,56 @@ emit_function_call(struct codegen *gen,
                    const struct tgsi_full_instruction *inst,
                    char *funcname, uint num_args)
 {
-   const struct cell_spu_function_info *funcs = &gen->cell->spu_functions;
+   const uint addr = lookup_function(gen->cell, funcname);
    char comment[100];
-   uint addr;
    int ch;
 
-   /* XXX temporary value */
-   const int frameSize = 64; /* stack frame (activation record) size */
-
    assert(num_args <= 3);
 
-   /* lookup function address */
-   {
-      uint i;
-      addr = 0;
-      for (i = 0; i < funcs->num; i++) {
-         if (strcmp(funcs->names[i], funcname) == 0) {
-            addr = funcs->addrs[i];
-         }
-      }
-      assert(addr && "spu function not found");
-   }
-
-   addr /= 4; /* discard 2 least significant bits */
-
    snprintf(comment, sizeof(comment), "CALL %s:", funcname);
    spe_comment(gen->f, -4, comment);
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
-         int s_regs[3];
-         uint a;
+         int s_regs[3], d_reg;
+         ubyte usedRegs[SPE_NUM_REGS];
+         uint a, i, numUsed;
+
          for (a = 0; a < num_args; a++) {
             s_regs[a] = get_src_reg(gen, ch, &inst->FullSrcRegisters[a]);
          }
+         d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
 
-         /* Basically:
-          * save registers on stack
-          * move parameters to registers 3, 4, 5...
-          * call function
-          * save return value (reg 3)
-          * restore registers from stack
-          */
-
-         /* XXX hack: load first function param */
-         spe_move(gen->f, 3, s_regs[0]);
+         numUsed = spe_get_registers_used(gen->f, usedRegs);
+         assert(numUsed < gen->frame_size / 16 - 2);
 
-         /* save $lr on stack     # stqd $lr,16($sp) */
-         spe_stqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-         /* save stack pointer    # stqd $sp,-frameSize($sp) */
-         spe_stqd(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
-
-         /* XXX save registers to stack here */
+         /* save registers to stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            int offset = 2 + i;
+            spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
 
-         /* adjust stack pointer  # ai $sp,$sp,-frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, -frameSize);
+         /* setup function arguments */
+         for (a = 0; a < num_args; a++) {
+            spe_move(gen->f, 3 + a, s_regs[a]);
+         }
 
          /* branch to function, save return addr */
          spe_brasl(gen->f, SPE_REG_RA, addr);
 
-         /* restore stack pointer # ai $sp,$sp,frameSize */
-         spe_ai(gen->f, SPE_REG_SP, SPE_REG_SP, frameSize);
-
-         /* XXX restore registers from stack here */
-
-         /* restore $lr           # lqd $lr,16($sp) */
-         spe_lqd(gen->f, SPE_REG_RA, SPE_REG_SP, 16);
-
-         /* XXX hack: save function's return value */
+         /* save function's return value */
          spe_move(gen->f, d_reg, 3);
 
+         /* restore registers from stack */
+         for (i = 0; i < numUsed; i++) {
+            uint reg = usedRegs[i];
+            if (reg != d_reg) {
+               int offset = 2 + i;
+               spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+            }
+         }
+
          store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
          free_itemps(gen);
       }
@@ -1180,31 +1336,114 @@ emit_function_call(struct codegen *gen,
 }
 
 
+static boolean
+emit_TXP(struct codegen *gen, const struct tgsi_full_instruction *inst)
+{
+   const uint addr = lookup_function(gen->cell, "spu_txp");
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   int ch;
+   int coord_regs[4], d_regs[4];
+
+   assert(inst->FullSrcRegisters[1].SrcRegister.File == TGSI_FILE_SAMPLER);
+
+   spe_comment(gen->f, -4, "CALL txp:");
+
+   /* get src/dst reg info for all four channels (WriteMask is only checked at the final store) */
+   for (ch = 0; ch < 4; ch++) {
+      coord_regs[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+      d_regs[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+   }
+
+   {
+      ubyte usedRegs[SPE_NUM_REGS];
+      uint i, numUsed;
+
+      numUsed = spe_get_registers_used(gen->f, usedRegs);
+      assert(numUsed < gen->frame_size / 16 - 2);
+
+      /* save registers to stack: 16 bytes apiece, starting at offset 32 from SPE_REG_SP */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         int offset = 2 + i;
+         spe_stqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+      }
+
+      /* setup function arguments: coordinates in regs 3..6, sampler unit in reg 7 */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, 3 + i, coord_regs[i]);
+      }
+      spe_load_uint(gen->f, 7, unit); /* sampler unit */
+
+      /* branch to function, save return addr */
+      spe_brasl(gen->f, SPE_REG_RA, addr);
+
+      /* save function's return values (four pixels' colors), taken from regs 3..6 */
+      for (i = 0; i < 4; i++) {
+         spe_move(gen->f, d_regs[i], 3 + i);  /* NOTE(review): written even when channel i is not in WriteMask - confirm this is safe */
+      }
+
+      /* restore registers from stack, skipping the ones that now hold results */
+      for (i = 0; i < numUsed; i++) {
+         uint reg = usedRegs[i];
+         if (reg != d_regs[0] &&
+             reg != d_regs[1] &&
+             reg != d_regs[2] &&
+             reg != d_regs[3]) {
+            int offset = 2 + i;
+            spe_lqd(gen->f, reg, SPE_REG_SP, 16 * offset);
+         }
+      }
+   }
+
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_regs[ch], ch, &inst->FullDstRegisters[0]);
+         free_itemps(gen);
+      }
+   }
+
+   return TRUE;
+}
+
+
 /**
  * Emit max.  See emit_SGT for comments.
  */
 static boolean
 emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
 
    spe_comment(gen->f, -4, "MAX:");
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);         
+      }
+   }
 
-         /* d = (s1 > s2) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s1_reg, s2_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+   /* d = (s0 > s1) ? s0 : s1 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fcgt(gen->f, tmp_reg[ch], s0_reg[ch], s1_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+      }
+   }
 
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
       }
    }
 
+   free_itemps(gen);
    return true;
 }
 
@@ -1214,25 +1453,38 @@ emit_MAX(struct codegen *gen, const struct tgsi_full_instruction *inst)
 static boolean
 emit_MIN(struct codegen *gen, const struct tgsi_full_instruction *inst)
 {
-   int ch;
+   int ch, s0_reg[4], s1_reg[4], d_reg[4], tmp_reg[4];
 
    spe_comment(gen->f, -4, "MIN:");
 
    for (ch = 0; ch < 4; ch++) {
       if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
-         int s1_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
-         int s2_reg = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
-         int d_reg = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         s0_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[0]);
+         s1_reg[ch] = get_src_reg(gen, ch, &inst->FullSrcRegisters[1]);
+         d_reg[ch] = get_dst_reg(gen, ch, &inst->FullDstRegisters[0]);
+         tmp_reg[ch] = get_itemp(gen);         
+      }
+   }
 
-         /* d = (s2 > s1) ? s1 : s2 */
-         spe_fcgt(gen->f, d_reg, s2_reg, s1_reg);
-         spe_selb(gen->f, d_reg, s2_reg, s1_reg, d_reg);
+   /* d = (s1 > s0) ? s0 : s1 */
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_fcgt(gen->f, tmp_reg[ch], s1_reg[ch], s0_reg[ch]);
+      }
+   }
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         spe_selb(gen->f, d_reg[ch], s1_reg[ch], s0_reg[ch], tmp_reg[ch]);
+      }
+   }
 
-         store_dest_reg(gen, d_reg, ch, &inst->FullDstRegisters[0]);
-         free_itemps(gen);
+   for (ch = 0; ch < 4; ch++) {
+      if (inst->FullDstRegisters[0].DstRegister.WriteMask & (1 << ch)) {
+         store_dest_reg(gen, d_reg[ch], ch, &inst->FullDstRegisters[0]);
       }
    }
 
+   free_itemps(gen);
    return true;
 }
 
@@ -1339,8 +1591,7 @@ static boolean
 emit_END(struct codegen *gen)
 {
    spe_comment(gen->f, -4, "END:");
-   /* return from function call */
-   spe_bi(gen->f, SPE_REG_RA, 0, 0);
+   emit_epilogue(gen);
    return true;
 }
 
@@ -1413,6 +1664,18 @@ emit_instruction(struct codegen *gen,
       return emit_function_call(gen, inst, "spu_sin", 1);
    case TGSI_OPCODE_POW:
       return emit_function_call(gen, inst, "spu_pow", 2);
+   case TGSI_OPCODE_EXPBASE2:
+      return emit_function_call(gen, inst, "spu_exp2", 1);
+   case TGSI_OPCODE_LOGBASE2:
+      return emit_function_call(gen, inst, "spu_log2", 1);
+   case TGSI_OPCODE_TEX:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXD:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXB:
+      /* fall-through for now */
+   case TGSI_OPCODE_TXP:
+      return emit_TXP(gen, inst);
 
    case TGSI_OPCODE_IF:
       return emit_IF(gen, inst);
@@ -1456,16 +1719,23 @@ emit_immediate(struct codegen *gen, const struct tgsi_full_immediate *immed)
 
    for (ch = 0; ch < 4; ch++) {
       float val = immed->u.ImmediateFloat32[ch].Float;
-      int reg = spe_allocate_available_register(gen->f);
 
-      if (reg < 0)
-         return false;
+      if (ch > 0 && val == immed->u.ImmediateFloat32[ch - 1].Float) {
+         /* re-use previous register */
+         gen->imm_regs[gen->num_imm][ch] = gen->imm_regs[gen->num_imm][ch - 1];
+      }
+      else {
+         int reg = spe_allocate_available_register(gen->f);
+
+         if (reg < 0)
+            return false;
 
-      /* update immediate map */
-      gen->imm_regs[gen->num_imm][ch] = reg;
+         /* update immediate map */
+         gen->imm_regs[gen->num_imm][ch] = reg;
 
-      /* emit initializer instruction */
-      spe_load_float(gen->f, reg, val);
+         /* emit initializer instruction */
+         spe_load_float(gen->f, reg, val);
+      }
    }
 
    gen->num_imm++;
@@ -1488,12 +1758,6 @@ emit_declaration(struct cell_context *cell,
 
    switch (decl->Declaration.File) {
    case TGSI_FILE_TEMPORARY:
-      if (cell->debug_flags & CELL_DEBUG_ASM) {
-         printf("Declare temp reg %d .. %d\n",
-                decl->DeclarationRange.First,
-                decl->DeclarationRange.Last);
-      }
-
       for (i = decl->DeclarationRange.First;
            i <= decl->DeclarationRange.Last;
            i++) {
@@ -1508,12 +1772,12 @@ emit_declaration(struct cell_context *cell,
           * to SPU memory.  someday...
           */
 
-         if (cell->debug_flags & CELL_DEBUG_ASM) {
-            printf("  SPE regs: %d %d %d %d\n",
-                   gen->temp_regs[i][0],
-                   gen->temp_regs[i][1],
-                   gen->temp_regs[i][2],
-                   gen->temp_regs[i][3]);
+         {
+            char buf[100];
+            sprintf(buf, "TGSI temp[%d] maps to SPU regs [$%d $%d $%d $%d]", i,
+                    gen->temp_regs[i][0], gen->temp_regs[i][1],
+                    gen->temp_regs[i][2], gen->temp_regs[i][3]);
+            spe_comment(gen->f, -4, buf);
          }
       }
       break;
@@ -1525,6 +1789,7 @@ emit_declaration(struct cell_context *cell,
 }
 
 
+
 /**
  * Translate TGSI shader code to SPE instructions.  This is done when
  * the state tracker gives us a new shader (via pipe->create_fs_state()).
@@ -1564,12 +1829,14 @@ cell_gen_fragment_program(struct cell_context *cell,
 
    tgsi_parse_init(&parse, tokens);
 
+   emit_prologue(&gen);
+
    while (!tgsi_parse_end_of_tokens(&parse) && !gen.error) {
       tgsi_parse_token(&parse);
 
       switch (parse.FullToken.Token.Type) {
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         if (!emit_immediate(&gen,  &parse.FullToken.FullImmediate))
+         if (!emit_immediate(&gen, &parse.FullToken.FullImmediate))
             gen.error = true;
          break;
 
@@ -1588,7 +1855,6 @@ cell_gen_fragment_program(struct cell_context *cell,
       }
    }
 
-
    if (gen.error) {
       /* terminate the SPE code */
       return emit_END(&gen);
diff --git a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
index 653afc235df..4e1e53ecdc7 100644
--- a/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
+++ b/src/gallium/drivers/cell/ppu/cell_gen_fragment.c
@@ -54,10 +54,12 @@
  * \param ifragZ_reg  register containing integer fragment Z values (in)
  * \param ifbZ_reg    register containing integer frame buffer Z values (in/out)
  * \param zmask_reg   register containing result of Z test/comparison (out)
+ *
+ * Returns true if the Z-buffer needs to be updated.
  */
-static void
-gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
-               struct spe_function *f,
+static boolean
+gen_depth_test(struct spe_function *f,
+               const struct pipe_depth_stencil_alpha_state *dsa,
                int mask_reg, int ifragZ_reg, int ifbZ_reg, int zmask_reg)
 {
    /* NOTE: we use clgt below, not cgt, because we want to compare _unsigned_
@@ -132,7 +134,10 @@ gen_depth_test(const struct pipe_depth_stencil_alpha_state *dsa,
        * framebufferZ = (ztest_passed ? fragmentZ : framebufferZ;
        */
       spe_selb(f, ifbZ_reg, ifbZ_reg, ifragZ_reg, mask_reg);
+      return true;
    }
+
+   return false;
 }
 
 
@@ -238,22 +243,35 @@ gen_alpha_test(const struct pipe_depth_stencil_alpha_state *dsa,
  * it and have to allocate and load it again unnecessarily.
  */
 static inline void
-setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+setup_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int *r)
 {
    if (*is_already_set) return;
    *r = spe_allocate_available_register(f);
-   spe_load_float(f, *r, value);
    *is_already_set = true;
 }
 
 static inline void
-release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+release_optional_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
 {
     if (!*is_already_set) return;
     spe_release_register(f, r);
     *is_already_set = false;
 }
 
+static inline void
+setup_const_register(struct spe_function *f, boolean *is_already_set, unsigned int *r, float value)
+{
+   if (*is_already_set) return;  /* already set up: skip both the allocation and the load */
+   setup_optional_register(f, is_already_set, r);
+   spe_load_float(f, *r, value);  /* load 'value' into the register just stored in *r */
+}
+
+static inline void
+release_const_register(struct spe_function *f, boolean *is_already_set, unsigned int r)
+{
+    release_optional_register(f, is_already_set, r);  /* plain wrapper, named to pair with setup_const_register() */
+}
+
 /**
  * Generate SPE code to implement the given blend mode for a quad of pixels.
  * \param f          SPE function to append instruction onto.
@@ -1117,6 +1135,666 @@ gen_colormask(struct spe_function *f,
     spe_release_register(f, colormask_reg);
 }
 
+/* Generate the stencil comparison.  Similar to gen_depth_test(), above,
+ * except that one operand is the static reference value, so the Compare
+ * Immediate instructions can be used where gen_depth_test() cannot.
+ * As in OpenGL, the reference value is the left-hand operand of the
+ * comparison: e.g. PIPE_FUNC_LESS passes where (reference < s).
+ *
+ * On return, stencil_pass_reg holds the bitmask of valid fragments that
+ * also passed the stencil test; the valid fragments that failed are
+ * (mask_reg & ~stencil_pass_reg).
+ */
+static void
+gen_stencil_test(struct spe_function *f, const struct pipe_stencil_state *state,
+                 unsigned int mask_reg, unsigned int fbS_reg,
+                 unsigned int stencil_pass_reg)
+{
+   /* Generate code that puts the set of passing fragments into the stencil_pass_reg
+    * register, taking into account whether each fragment was active to begin with.
+    */
+   switch (state->func) {
+   case PIPE_FUNC_EQUAL:
+      /* stencil_pass = mask & (s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_NOTEQUAL:
+      /* stencil_pass = mask & ~(s == reference) */
+      spe_compare_equal_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LESS:
+      /* stencil_pass = mask & (reference < s) = mask & (s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_GREATER: {
+      /* stencil_pass = mask & (reference > s) */
+      /* There's no convenient Compare Less Than Immediate instruction, so
+       * we'll have to do this one the harder way, by loading a register and
+       * comparing directly.  Compare Logical Greater Than Word (clgt)
+       * treats its operands as unsigned - no sign extension.
+       */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_and(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_GEQUAL:
+      /* stencil_pass = mask & (reference >= s) = mask & ~(s > reference) */
+      spe_compare_greater_uint(f, stencil_pass_reg, fbS_reg, state->ref_value);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      break;
+
+   case PIPE_FUNC_LEQUAL: {
+      /* stencil_pass = mask & (reference <= s) = mask & ~(reference > s) */
+      /* As above, we have to do this by loading a register */
+      unsigned int tmp_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, tmp_reg, state->ref_value);
+      spe_clgt(f, stencil_pass_reg, tmp_reg, fbS_reg);
+      spe_andc(f, stencil_pass_reg, mask_reg, stencil_pass_reg);
+      spe_release_register(f, tmp_reg);
+      break;
+   }
+
+   case PIPE_FUNC_NEVER:
+      /* stencil_pass = mask & 0 = 0 */
+      spe_load_uint(f, stencil_pass_reg, 0);
+      break;
+
+   case PIPE_FUNC_ALWAYS:
+      /* stencil_pass = mask & 1 = mask */
+      spe_move(f, stencil_pass_reg, mask_reg);
+      break;
+   }
+
+   /* The fragments that passed the stencil test are now in stencil_pass_reg.
+    * The fragments that failed would be (mask_reg & ~stencil_pass_reg).
+    */
+}
+
+/* This function generates code that calculates a set of new stencil values
+ * given the earlier values and the operation to apply.  It does not
+ * apply any tests.  It is intended to be called up to 3 times
+ * (for the stencil fail operation, for the stencil pass-z fail operation,
+ * and for the stencil pass-z pass operation) to collect up to three
+ * possible sets of values, and for the caller to combine them based
+ * on the result of the tests.
+ *
+ * stencil_max_value should be (2^n - 1) where n is the number of bits
+ * in the stencil buffer - in other words, it should be usable as a mask.
+ */
+static void
+gen_stencil_values(struct spe_function *f, unsigned int stencil_op,
+                   unsigned int stencil_ref_value, unsigned int stencil_max_value,
+                   unsigned int fbS_reg, unsigned int newS_reg)
+{
+   /* The code below assumes that newS_reg and fbS_reg are not the same
+    * register; if they can be, the calculations below will have to use
+    * an additional temporary register.  For now, mark the assumption
+    * with an assertion that will fail if they are the same.
+    */
+   ASSERT(fbS_reg != newS_reg);
+
+   /* The code also assumes that the stencil_max_value is of the form
+    * 2^n-1 and can therefore be used as a mask for the valid bits in
+    * addition to a maximum.  Make sure this is the case as well.
+    * The clever math below exploits the fact that incrementing a
+    * binary number serves to flip all the bits of a number starting at
+    * the LSB and continuing to (and including) the first zero bit
+    * found.  That means that a number and its increment will always
+    * have at least one bit in common (the high order bit, if nothing
+    * else) *unless* the number is zero, *or* the number is of a form
+    * consisting of some number of 1s in the low-order bits followed
+    * by nothing but 0s in the high-order bits.  The latter case
+    * implies it's of the form 2^n-1.
+    */
+   ASSERT(stencil_max_value > 0 && ((stencil_max_value + 1) & stencil_max_value) == 0);
+
+   switch(stencil_op) {
+   case PIPE_STENCIL_OP_KEEP:
+      /* newS = S */
+      spe_move(f, newS_reg, fbS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_ZERO:
+      /* newS = 0 */
+      spe_zero(f, newS_reg);
+      break;
+
+   case PIPE_STENCIL_OP_REPLACE:
+      /* newS = stencil reference value */
+      spe_load_uint(f, newS_reg, stencil_ref_value);
+      break;
+
+   case PIPE_STENCIL_OP_INCR: {
+      /* newS = (s == max ? max : s + 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, stencil_max_value);
+      /* Add Word Immediate computes rT = rA + 10-bit signed immediate */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      /* selb picks rB where the mask is set: keep s (== max) there, s + 1 elsewhere */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_DECR: {
+      /* newS = (s == 0 ? 0 : s - 1) */
+      unsigned int equals_reg = spe_allocate_available_register(f);
+
+      spe_compare_equal_uint(f, equals_reg, fbS_reg, 0);
+      /* Add Word Immediate with a (-1) value works */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      /* selb picks rB where the mask is set: keep s (== 0) there, s - 1 elsewhere */
+      spe_selb(f, newS_reg, newS_reg, fbS_reg, equals_reg);
+
+      spe_release_register(f, equals_reg);
+      break;
+   }
+   case PIPE_STENCIL_OP_INCR_WRAP:
+      /* newS = (s == max ? 0 : s + 1), but since max is 2^n-1, we can
+       * do a normal add and mask off the correct bits
+       */
+      spe_ai(f, newS_reg, fbS_reg, 1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_DECR_WRAP:
+      /* newS = (s == 0 ? max : s - 1), but we'll pull the same mask trick as above */
+      spe_ai(f, newS_reg, fbS_reg, -1);
+      spe_and_uint(f, newS_reg, newS_reg, stencil_max_value);
+      break;
+
+   case PIPE_STENCIL_OP_INVERT:
+      /* newS = ~s.  We take advantage of the mask/max value to invert only
+       * the valid bits for the field so we don't have to do an extra "and".
+       */
+      spe_xor_uint(f, newS_reg, fbS_reg, stencil_max_value);
+      break;
+
+   default:
+      ASSERT(0);
+   }
+}
+
+
+/* This function generates code to get all the necessary possible
+ * stencil values.  For each of the output registers (fail_reg,
+ * zfail_reg, zpass_reg and their back-facing counterparts), it either
+ * allocates a new register and calculates a new set of values based on
+ * the stencil operation, or it reuses a register allocated for an earlier
+ * (matching) operation, or it reuses the fbS_reg register (if the stencil
+ * operation is KEEP, which doesn't change the stencil buffer).  Front-face
+ * results are shared with the back face only if both ref_values match.
+ *
+ * Since this function allocates a variable number of registers,
+ * to avoid incurring complex logic to free them, they should
+ * be allocated after a spe_allocate_register_set() call
+ * and released by the corresponding spe_release_register_set() call.
+ */
+static void
+gen_get_stencil_values(struct spe_function *f, const struct pipe_depth_stencil_alpha_state *dsa,
+                       unsigned int fbS_reg,
+                       unsigned int *fail_reg, unsigned int *zfail_reg,
+                       unsigned int *zpass_reg, unsigned int *back_fail_reg,
+                       unsigned int *back_zfail_reg, unsigned int *back_zpass_reg)
+{
+   unsigned zfail_op, back_zfail_op;
+   const boolean same_ref = (dsa->stencil[0].ref_value == dsa->stencil[1].ref_value);
+   /* Stenciling had better be enabled here */
+   ASSERT(dsa->stencil[0].enabled);
+
+   /* If the depth test is not enabled, it is treated as though it always
+    * passes.  In particular, that means that the "zfail_op" (and the backfacing
+    * counterpart, if active) are not considered - a failing stencil test will
+    * trigger the "fail_op", and a passing stencil test will trigger the
+    * "zpass_op".
+    *
+    * By overriding the operations in this case to be PIPE_STENCIL_OP_KEEP,
+    * we keep them from being calculated.
+    */
+   if (dsa->depth.enabled) {
+      zfail_op = dsa->stencil[0].zfail_op;
+      back_zfail_op = dsa->stencil[1].zfail_op;
+   }
+   else {
+      zfail_op = PIPE_STENCIL_OP_KEEP;
+      back_zfail_op = PIPE_STENCIL_OP_KEEP;
+   }
+
+   /* One-sided or front-facing stencil */
+   if (dsa->stencil[0].fail_op == PIPE_STENCIL_OP_KEEP) {
+      *fail_reg = fbS_reg;
+   }
+   else {
+      *fail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].fail_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *fail_reg);
+   }
+
+   if (zfail_op == PIPE_STENCIL_OP_KEEP) {
+      *zfail_reg = fbS_reg;
+   }
+   else if (zfail_op == dsa->stencil[0].fail_op) {
+      *zfail_reg = *fail_reg;
+   }
+   else {
+      *zfail_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zfail_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *zfail_reg);
+   }
+
+   if (dsa->stencil[0].zpass_op == PIPE_STENCIL_OP_KEEP) {
+      *zpass_reg = fbS_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == dsa->stencil[0].fail_op) {
+      *zpass_reg = *fail_reg;
+   }
+   else if (dsa->stencil[0].zpass_op == zfail_op) {
+      *zpass_reg = *zfail_reg;
+   }
+   else {
+      *zpass_reg = spe_allocate_available_register(f);
+      gen_stencil_values(f, dsa->stencil[0].zpass_op, dsa->stencil[0].ref_value,
+         0xff, fbS_reg, *zpass_reg);
+   }
+
+   /* If two-sided stencil is enabled, we have more work to do. */
+   if (!dsa->stencil[1].enabled) {
+      /* This just flags that the registers need not be deallocated later */
+      *back_fail_reg = fbS_reg;
+      *back_zfail_reg = fbS_reg;
+      *back_zpass_reg = fbS_reg;
+   }
+   else {
+      /* Same calculations as above, but for the back stencil */
+      if (dsa->stencil[1].fail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_fail_reg = fbS_reg;
+      }
+      else if (same_ref && dsa->stencil[1].fail_op == dsa->stencil[0].fail_op) {
+         *back_fail_reg = *fail_reg;
+      }
+      else if (same_ref && dsa->stencil[1].fail_op == zfail_op) {
+         *back_fail_reg = *zfail_reg;
+      }
+      else if (same_ref && dsa->stencil[1].fail_op == dsa->stencil[0].zpass_op) {
+         *back_fail_reg = *zpass_reg;
+      }
+      else {
+         *back_fail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].fail_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_fail_reg);
+      }
+
+      if (back_zfail_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zfail_reg = fbS_reg;
+      }
+      else if (same_ref && back_zfail_op == dsa->stencil[0].fail_op) {
+         *back_zfail_reg = *fail_reg;
+      }
+      else if (same_ref && back_zfail_op == zfail_op) {
+         *back_zfail_reg = *zfail_reg;
+      }
+      else if (same_ref && back_zfail_op == dsa->stencil[0].zpass_op) {
+         *back_zfail_reg = *zpass_reg;
+      }
+      else if (back_zfail_op == dsa->stencil[1].fail_op) {
+         *back_zfail_reg = *back_fail_reg;
+      }
+      else {
+         *back_zfail_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zfail_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_zfail_reg);
+      }
+
+      if (dsa->stencil[1].zpass_op == PIPE_STENCIL_OP_KEEP) {
+         *back_zpass_reg = fbS_reg;
+      }
+      else if (same_ref && dsa->stencil[1].zpass_op == dsa->stencil[0].fail_op) {
+         *back_zpass_reg = *fail_reg;
+      }
+      else if (same_ref && dsa->stencil[1].zpass_op == zfail_op) {
+         *back_zpass_reg = *zfail_reg;
+      }
+      else if (same_ref && dsa->stencil[1].zpass_op == dsa->stencil[0].zpass_op) {
+         *back_zpass_reg = *zpass_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == dsa->stencil[1].fail_op) {
+         *back_zpass_reg = *back_fail_reg;
+      }
+      else if (dsa->stencil[1].zpass_op == back_zfail_op) {
+         *back_zpass_reg = *back_zfail_reg;
+      }
+      else {
+         *back_zpass_reg = spe_allocate_available_register(f);
+         gen_stencil_values(f, dsa->stencil[1].zpass_op, dsa->stencil[1].ref_value,
+            0xff, fbS_reg, *back_zpass_reg);
+      }
+   } /* End of calculations for back-facing stencil */
+}
+
+/* Generates the stencil test (plus the depth test, if enabled); returns
+ * true if the Z/stencil quad may need writeback.  fbZ_reg may *not* be set
+ * on entry when depth is disabled, so it must not be used in that case.
+ */
+static boolean
+gen_stencil_depth_test(struct spe_function *f,
+                       const struct pipe_depth_stencil_alpha_state *dsa,
+                       const int facing_reg,
+                       const int mask_reg, const int fragZ_reg,
+                       const int fbZ_reg, const int fbS_reg)
+{
+   /* True if we've generated code that could require writeback to the
+    * depth and/or stencil buffers
+    */
+   boolean modified_buffers = false;
+
+   boolean need_to_calculate_stencil_values;
+   boolean need_to_writemask_stencil_values;
+
+   /* Registers.  We may or may not actually allocate these, depending
+    * on whether the state values indicate that we need them.
+    */
+   unsigned int stencil_pass_reg, stencil_fail_reg;
+   unsigned int stencil_fail_values, stencil_pass_depth_fail_values, stencil_pass_depth_pass_values;
+   unsigned int stencil_writemask_reg;
+   unsigned int zmask_reg;
+   unsigned int newS_reg;
+
+   /* Stenciling is quite complex: up to six different configurable stencil
+    * operations/calculations can be required (three each for front-facing
+    * and back-facing fragments).  Many of those operations will likely
+    * be identical, so there's good reason to try to avoid calculating
+    * the same values more than once (which unfortunately makes the code less
+    * straightforward).
+    *
+    * To make register management easier, we start a new
+    * register set; we can release all the registers in the set at
+    * once, and avoid having to keep track of exactly which registers
+    * we allocate.  We can still allocate and free registers as
+    * desired (if we know we no longer need a register), but we don't
+    * have to spend the complexity to track the more difficult variant
+    * register usage scenarios.
+    */
+   spe_comment(f, 0, "Allocating stencil register set");
+   spe_allocate_register_set(f);
+
+   /* Calculate the writemask.  If the writemask is trivial (either
+    * all 0s, meaning that we don't need to calculate any stencil values
+    * because they're not going to change the stencil anyway, or all 1s,
+    * meaning that we have to calculate the stencil values but do not
+    * need to mask them), we can avoid generating code.  Don't forget
+    * that we need to consider backfacing stencil, if enabled.
+    */
+   if (dsa->stencil[0].write_mask == 0x0 && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0x00)) {
+      /* Trivial: don't need to calculate stencil values, and don't need to
+       * write them back to the framebuffer.
+       */
+      need_to_calculate_stencil_values = false;
+      need_to_writemask_stencil_values = false;
+   }
+   else if (dsa->stencil[0].write_mask == 0xff && (!dsa->stencil[1].enabled || dsa->stencil[1].write_mask == 0xff)) {
+      /* Still trivial, but a little less so.  We need to write the stencil
+       * values, but we don't need to mask them.
+       */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = false;
+   }
+   else {
+      /* The general case: calculate, mask, and write */
+      need_to_calculate_stencil_values = true;
+      need_to_writemask_stencil_values = true;
+
+      /* While we're here, generate code that calculates what the
+       * writemask should be.  If backface stenciling is enabled,
+       * and the backface writemask is not the same as the frontface
+       * writemask, we'll have to generate code that merges the
+       * two masks into a single effective mask based on fragment facing.
+       */
+      spe_comment(f, 0, "Computing stencil writemask");
+      stencil_writemask_reg = spe_allocate_available_register(f);
+      spe_load_uint(f, stencil_writemask_reg, dsa->stencil[0].write_mask);
+      if (dsa->stencil[1].enabled && dsa->stencil[0].write_mask != dsa->stencil[1].write_mask) {
+         unsigned int back_write_mask_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Resolving two-sided stencil writemask");
+         spe_load_uint(f, back_write_mask_reg, dsa->stencil[1].write_mask);
+         spe_selb(f, stencil_writemask_reg, stencil_writemask_reg, back_write_mask_reg, facing_reg);
+         spe_release_register(f, back_write_mask_reg);
+      }
+   }
+
+   /* At least one-sided stenciling must be on.  Generate code that
+    * runs the stencil test on the basic/front-facing stencil, leaving
+    * the mask of passing stencil bits in stencil_pass_reg.  This mask will
+    * be used both to mask the set of active pixels, and also to
+    * determine how the stencil buffer changes.
+    *
+    * This test will *not* change the value in mask_reg (because we don't
+    * yet know whether to apply the two-sided stencil or one-sided stencil).
+    */
+   spe_comment(f, 0, "Running basic stencil test");
+   stencil_pass_reg = spe_allocate_available_register(f);
+   gen_stencil_test(f, &dsa->stencil[0], mask_reg, fbS_reg, stencil_pass_reg);
+
+   /* If two-sided stenciling is on, generate code to run the stencil
+    * test on the backfacing stencil as well, and combine the two results
+    * into the one correct result based on facing.
+    */
+   if (dsa->stencil[1].enabled) {
+      unsigned int temp_reg = spe_allocate_available_register(f);
+      spe_comment(f, 0, "Running backface stencil test");
+      gen_stencil_test(f, &dsa->stencil[1], mask_reg, fbS_reg, temp_reg);
+      spe_selb(f, stencil_pass_reg, stencil_pass_reg, temp_reg, facing_reg);
+      spe_release_register(f, temp_reg);
+   }
+
+   /* Generate code that, given the mask of valid fragments and the
+    * mask of valid fragments that passed the stencil test, computes
+    * the mask of valid fragments that failed the stencil test.  We
+    * have to do this before we run a depth test (because the
+    * depth test should not be performed on fragments that failed the
+    * stencil test, and because the depth test will update the
+    * mask of valid fragments based on the results of the depth test).
+    */
+   spe_comment(f, 0, "Computing stencil fail mask and updating fragment mask");
+   stencil_fail_reg = spe_allocate_available_register(f);
+   spe_andc(f, stencil_fail_reg, mask_reg, stencil_pass_reg);
+   /* Now remove the stenciled-out pixels from the valid fragment mask,
+    * so we can later use the valid fragment mask in the depth test.
+    */
+   spe_and(f, mask_reg, mask_reg, stencil_pass_reg);
+
+   /* We may not need to calculate stencil values, if the writemask is off */
+   if (need_to_calculate_stencil_values) {
+      unsigned int back_stencil_fail_values, back_stencil_pass_depth_fail_values, back_stencil_pass_depth_pass_values;
+      unsigned int front_stencil_fail_values, front_stencil_pass_depth_fail_values, front_stencil_pass_depth_pass_values;
+
+      /* Generate code that calculates exactly which stencil values we need,
+       * without calculating the same value twice (say, if two different
+       * stencil ops have the same value).  This code will work for one-sided
+       * and two-sided stenciling (so that we take into account that operations
+       * may match between front and back stencils), and will also take into
+       * account whether the depth test is enabled (if the depth test is off,
+       * we don't need any of the zfail results, because the depth test always
+       * is considered to pass if it is disabled).  Any register value that
+       * does not need to be calculated will come back with the same value
+       * that's in fbS_reg.
+       *
+       * This function will allocate a variant number of registers that
+       * will be released as part of the register set.
+       */
+      spe_comment(f, 0, "Computing stencil values");
+      gen_get_stencil_values(f, dsa, fbS_reg,
+         &front_stencil_fail_values, &front_stencil_pass_depth_fail_values,
+         &front_stencil_pass_depth_pass_values, &back_stencil_fail_values,
+         &back_stencil_pass_depth_fail_values, &back_stencil_pass_depth_pass_values);
+
+      /* Tricky, tricky, tricky - the things we do to create optimal
+       * code...
+       *
+       * The various stencil values registers may overlap with each other
+       * and with fbS_reg arbitrarily (as any particular operation is
+       * only calculated once and stored in one register, no matter
+       * how many times it is used).  So we can't change the values
+       * within those registers directly - if we change a value in a
+       * register that's being referenced by two different calculations,
+       * we've just unwittingly changed the second value as well...
+       *
+       * Avoid this by allocating new registers to hold the results
+       * (there may be 2, if the depth test is off, or 3, if it is on).
+       * These will be released as part of the register set.
+       */
+      if (!dsa->stencil[1].enabled) {
+         /* The easy case: if two-sided stenciling is *not* enabled, we
+          * just use the front-sided values.
+          */
+         stencil_fail_values = front_stencil_fail_values;
+         stencil_pass_depth_fail_values = front_stencil_pass_depth_fail_values;
+         stencil_pass_depth_pass_values = front_stencil_pass_depth_pass_values;
+      }
+      else { /* two-sided stencil enabled */
+         spe_comment(f, 0, "Resolving backface stencil values");
+         /* Allocate new registers for the needed merged values */
+         stencil_fail_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_fail_values, front_stencil_fail_values, back_stencil_fail_values, facing_reg);
+         if (dsa->depth.enabled) {
+            stencil_pass_depth_fail_values = spe_allocate_available_register(f);
+            spe_selb(f, stencil_pass_depth_fail_values, front_stencil_pass_depth_fail_values, back_stencil_pass_depth_fail_values, facing_reg);
+         }
+         else {
+            stencil_pass_depth_fail_values = fbS_reg;
+         }
+         stencil_pass_depth_pass_values = spe_allocate_available_register(f);
+         spe_selb(f, stencil_pass_depth_pass_values, front_stencil_pass_depth_pass_values, back_stencil_pass_depth_pass_values, facing_reg);
+      }
+   }
+
+   /* We now have all the stencil values we need.  We also need
+    * the results of the depth test to figure out which
+    * stencil values will become the new stencil values.  (Even if
+    * we aren't actually calculating stencil values, we need to apply
+    * the depth test if it's enabled.)
+    *
+    * The code generated by gen_depth_test() returns the results of the
+    * test in the given register, but also alters the mask_reg based
+    * on the results of the test.
+    */
+   if (dsa->depth.enabled) {
+      spe_comment(f, 0, "Running stencil depth test");
+      zmask_reg = spe_allocate_available_register(f);
+      modified_buffers |= gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+   }
+
+   if (need_to_calculate_stencil_values) {
+
+      /* If we need to writemask the stencil values before going into
+       * the stencil buffer, we'll have to use a new register to
+       * hold the new values.  If not, we can just keep using the
+       * current register.
+       */
+      if (need_to_writemask_stencil_values) {
+         newS_reg = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Saving current stencil values for writemasking");
+         spe_move(f, newS_reg, fbS_reg);
+      }
+      else {
+         newS_reg = fbS_reg;
+      }
+
+      /* Merge in the selected stencil fail values */
+      if (stencil_fail_values != fbS_reg) {
+         spe_comment(f, 0, "Loading stencil fail values");
+         spe_selb(f, newS_reg, newS_reg, stencil_fail_values, stencil_fail_reg);
+         modified_buffers = true;
+      }
+
+      /* Same for the stencil pass/depth fail values.  If this calculation
+       * is not needed (say, if depth test is off), then the
+       * stencil_pass_depth_fail_values register will be equal to fbS_reg
+       * and we'll skip the calculation.
+       */
+      if (stencil_pass_depth_fail_values != fbS_reg) {
+         /* We don't actually have a stencil pass/depth fail mask yet.
+          * Calculate it here from the stencil passing mask and the
+          * depth passing mask.  Note that zmask_reg *must* have been
+          * set above if we're here.
+          */
+         unsigned int stencil_pass_depth_fail_mask = spe_allocate_available_register(f);
+         spe_comment(f, 0, "Loading stencil pass/depth fail values");
+         spe_andc(f, stencil_pass_depth_fail_mask, stencil_pass_reg, zmask_reg);
+
+         spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_fail_values, stencil_pass_depth_fail_mask);
+
+         spe_release_register(f, stencil_pass_depth_fail_mask);
+         modified_buffers = true;
+      }
+
+      /* Same for the stencil pass/depth pass mask.  Note that we
+       * *can* get here with zmask_reg being unset (if the depth
+       * test is off but the stencil test is on).  In this case,
+       * we assume the depth test passes, and don't need to mask
+       * the stencil pass mask with the Z mask.
+       */
+      if (stencil_pass_depth_pass_values != fbS_reg) {
+         if (dsa->depth.enabled) {
+            unsigned int stencil_pass_depth_pass_mask = spe_allocate_available_register(f);
+            /* We'll need a separate register */
+            spe_comment(f, 0, "Loading stencil pass/depth pass values");
+            spe_and(f, stencil_pass_depth_pass_mask, stencil_pass_reg, zmask_reg);
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_depth_pass_mask);
+            spe_release_register(f, stencil_pass_depth_pass_mask);
+         }
+         else {
+            /* We can use the same stencil-pass register */
+            spe_comment(f, 0, "Loading stencil pass values");
+            spe_selb(f, newS_reg, newS_reg, stencil_pass_depth_pass_values, stencil_pass_reg);
+         }
+         modified_buffers = true;
+      }
+
+      /* Almost done.  If we need to writemask, do it now, leaving the
+       * results in the fbS_reg register passed in.  If we don't need
+       * to writemask, then the results are *already* in the fbS_reg,
+       * so there's nothing more to do.
+       */
+
+      if (need_to_writemask_stencil_values && modified_buffers) {
+         /* The Select Bytes command makes a fine writemask.  Where
+          * the mask is 0, the first (original) values are retained,
+          * effectively masking out changes.  Where the mask is 1, the
+          * second (new) values are retained, incorporating changes.
+          */
+         spe_comment(f, 0, "Writemasking new stencil values");
+         spe_selb(f, fbS_reg, fbS_reg, newS_reg, stencil_writemask_reg);
+      }
+
+   } /* done calculating stencil values */
+
+   /* The stencil and/or depth values have been applied, and the
+    * mask_reg, fbS_reg, and fbZ_reg values have been updated.
+    * We're all done, except that we've allocated a fair number
+    * of registers that we didn't bother tracking.  Release all
+    * those registers as part of the register set, and go home.
+    */
+   spe_comment(f, 0, "Releasing stencil register set");
+   spe_release_register_set(f);
+
+   /* Return true if we could have modified the stencil and/or
+    * depth buffers.
+    */
+   return modified_buffers;
+}
+
+
 /**
  * Generate SPE code to implement the fragment operations (alpha test,
  * depth test, stencil test, blending, colormask, and final
@@ -1156,6 +1834,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    const int fragB_reg = 10;  /* vector float */
    const int fragA_reg = 11;  /* vector float */
    const int mask_reg = 12;   /* vector uint */
+   const int facing_reg = 13; /* uint */
 
    /* offset of quad from start of tile
     * XXX assuming 4-byte pixels for color AND Z/stencil!!!!
@@ -1183,6 +1862,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
    spe_allocate_register(f, fragB_reg);
    spe_allocate_register(f, fragA_reg);
    spe_allocate_register(f, mask_reg);
+   spe_allocate_register(f, facing_reg);
 
    quad_offset_reg = spe_allocate_available_register(f);
    fbRGBA_reg = spe_allocate_available_register(f);
@@ -1195,6 +1875,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
 
       ASSERT(TILE_SIZE == 32);
 
+      spe_comment(f, 0, "Compute quad offset within tile");
       spe_rotmi(f, y2_reg, y_reg, -1);  /* y2 = y / 2 */
       spe_rotmi(f, x2_reg, x_reg, -1);  /* x2 = x / 2 */
       spe_shli(f, y2_reg, y2_reg, 4);   /* y2 *= 16 */
@@ -1205,130 +1886,188 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       spe_release_register(f, y2_reg);
    }
 
-
    if (dsa->alpha.enabled) {
       gen_alpha_test(dsa, f, mask_reg, fragA_reg);
    }
 
+   /* If we need the stencil buffers (because one- or two-sided stencil is
+    * enabled) or the depth buffer (because the depth test is enabled),
+    * go grab them.  Note that if either one- or two-sided stencil is
+    * enabled, dsa->stencil[0].enabled will be true.
+    */
    if (dsa->depth.enabled || dsa->stencil[0].enabled) {
       const enum pipe_format zs_format = cell->framebuffer.zsbuf->format;
       boolean write_depth_stencil;
 
-      int fbZ_reg = spe_allocate_available_register(f); /* Z values */
-      int fbS_reg = spe_allocate_available_register(f); /* Stencil values */
+      /* We may or may not need to allocate a register for Z or stencil values */
+      boolean fbS_reg_set = false, fbZ_reg_set = false;
+      unsigned int fbS_reg, fbZ_reg = 0;
+
+      spe_comment(f, 0, "Fetching Z/stencil quad from tile");
 
       /* fetch quad of depth/stencil values from tile at (x,y) */
       /* Load: fbZS_reg = memory[depth_tile_reg + offset_reg] */
+      /* XXX Not sure this is allowed if we've only got a 16-bit Z buffer... */
       spe_lqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
 
-      if (dsa->depth.enabled) {
-         /* Extract Z bits from fbZS_reg into fbZ_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            int mask_reg = spe_allocate_available_register(f);
-            spe_fsmbi(f, mask_reg, 0x7777);  /* mask[0,1,2,3] = 0x00ffffff */
-            spe_and(f, fbZ_reg, fbZS_reg, mask_reg);  /* fbZ = fbZS & mask */
-            spe_release_register(f, mask_reg);
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_rotmi(f, fbZ_reg, fbZS_reg, -8);  /* fbZ = fbZS >> 8 */
-            /* OK, fbZ_reg has four 24-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 32-bit Z values now */
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZ_reg, fbZS_reg);
-            /* OK, fbZ_reg has four 16-bit Z values now */
-         }
-         else {
-            ASSERT(0);  /* invalid format */
-         }
-
-         /* Convert fragZ values from float[4] to 16, 24 or 32-bit uint[4] */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-             zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 8 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
-         }
-         else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-         }
-         else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            /* scale/convert fragZ from float in [0,1] to uint in [0, ~0] */
-            spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
-            /* fragZ = fragZ >> 16 */
-            spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
-         }
-      }
-      else {
-         /* no Z test, but set Z to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbZ_reg, 0); /* XXX set to zero for now */
+      /* From the Z/stencil buffer format, pull out the bits we need for
+       * Z and/or stencil.  We'll also convert the incoming fragment Z
+       * value in fragZ_reg from a floating point value in [0.0..1.0] to
+       * an unsigned integer value with the appropriate resolution.
+       */
+      switch(zs_format) {
+
+         case PIPE_FORMAT_S8Z24_UNORM: /* fall through */
+         case PIPE_FORMAT_X8Z24_UNORM:
+            if (dsa->depth.enabled) {
+               /* We need the Z part at least */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* four 24-bit Z values in the low-order bits */
+               spe_and_uint(f, fbZ_reg, fbZS_reg, 0x00ffffff);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* four 8-bit stencil values in the high-order bits */
+               spe_rotmi(f, fbS_reg, fbZS_reg, -24);
+            }
+            break;
+
+         case PIPE_FORMAT_Z24S8_UNORM: /* fall through */
+         case PIPE_FORMAT_Z24X8_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* shift by 8 to get the upper 24-bit Z values into fbZ_reg */
+               spe_rotmi(f, fbZ_reg, fbZS_reg, -8);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 24-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -8);
+            }
+            if (dsa->stencil[0].enabled) {
+               setup_optional_register(f, &fbS_reg_set, &fbS_reg);
+               /* 8-bit stencil in the low-order bits - mask them out */
+               spe_and_uint(f, fbS_reg, fbZS_reg, 0x000000ff);
+            }
+            break;
+
+         case PIPE_FORMAT_Z32_UNORM:
+            if (dsa->depth.enabled) {
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 32-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+            }
+            /* No stencil, so can't do anything there */
+            break;
+
+         case PIPE_FORMAT_Z16_UNORM:
+            if (dsa->depth.enabled) {
+               /* XXX Not sure this is correct, but it was here before, so we're
+                * going with it for now
+                */
+               setup_optional_register(f, &fbZ_reg_set, &fbZ_reg);
+               /* Copy over 4 32-bit values */
+               spe_move(f, fbZ_reg, fbZS_reg);
+
+               /* Incoming fragZ_reg value is a float in 0.0...1.0; convert
+                * to a 16-bit unsigned integer
+                */
+               spe_cfltu(f, fragZ_reg, fragZ_reg, 32);
+               spe_rotmi(f, fragZ_reg, fragZ_reg, -16);
+            }
+            /* No stencil */
+            break;
+
+         default:
+            ASSERT(0); /* invalid format */
       }
 
-
+      /* If stencil is enabled, use the stencil-specific code
+       * generator to generate both the stencil and depth (if needed)
+       * tests.  Otherwise, if only depth is enabled, generate
+       * a quick depth test.  The test generators themselves will
+       * report back whether the depth/stencil buffer has to be
+       * written back.
+       */
       if (dsa->stencil[0].enabled) {
-         /* Extract Stencil bit sfrom fbZS_reg into fbS_reg */
-         if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
-             zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            /* XXX extract with a shift */
-            ASSERT(0);
-         }
-         else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
-                  zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            /* XXX extract with a mask */
-            ASSERT(0);
-         }
-      }
-      else {
-         /* no stencil test, but set to zero so we don't OR-in garbage below */
-         spe_load_uint(f, fbS_reg, 0); /* XXX set to zero for now */
-      }
+         /* This will perform the stencil and depth tests, and update
+          * the mask_reg, fbZ_reg, and fbS_reg as required by the
+          * tests.
+          */
+         ASSERT(fbS_reg_set);
+         spe_comment(f, 0, "Perform stencil test");
 
-      if (dsa->stencil[0].enabled) {
-         /* XXX this may involve depth testing too */
-         // gen_stencil_test(dsa, f, ... );
-         ASSERT(0);
+         /* Note that fbZ_reg may not be set on entry, if stenciling
+          * is enabled but there's no Z-buffer.  The 
+          * gen_stencil_depth_test() function must ignore the
+          * fbZ_reg register if depth is not enabled.
+          */
+         write_depth_stencil = gen_stencil_depth_test(f, dsa, facing_reg, mask_reg, fragZ_reg, fbZ_reg, fbS_reg);
       }
       else if (dsa->depth.enabled) {
          int zmask_reg = spe_allocate_available_register(f);
-         gen_depth_test(dsa, f, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
+         ASSERT(fbZ_reg_set);
+         spe_comment(f, 0, "Perform depth test");
+         write_depth_stencil = gen_depth_test(f, dsa, mask_reg, fragZ_reg, fbZ_reg, zmask_reg);
          spe_release_register(f, zmask_reg);
       }
-
-      /* do we need to write Z and/or Stencil back into framebuffer? */
-      write_depth_stencil = (dsa->depth.writemask |
-                             dsa->stencil[0].write_mask |
-                             dsa->stencil[1].write_mask);
+      else {
+         write_depth_stencil = false;
+      }
 
       if (write_depth_stencil) {
          /* Merge latest Z and Stencil values into fbZS_reg.
           * fbZ_reg has four Z vals in bits [23..0] or bits [15..0].
           * fbS_reg has four 8-bit Z values in bits [7..0].
           */
+         spe_comment(f, 0, "Store quad's depth/stencil values in tile");
          if (zs_format == PIPE_FORMAT_S8Z24_UNORM ||
              zs_format == PIPE_FORMAT_X8Z24_UNORM) {
-            spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set && fbZ_reg_set) {
+               spe_shli(f, fbS_reg, fbS_reg, 24); /* fbS = fbS << 24 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else if (fbS_reg_set) {
+               spe_shli(f, fbZS_reg, fbS_reg, 24); /* fbZS = fbS << 24 */
+            }
+            else {
+               spe_move(f, fbZS_reg, fbZ_reg);
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z24S8_UNORM ||
                   zs_format == PIPE_FORMAT_Z24X8_UNORM) {
-            spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
-            spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            if (fbS_reg_set && fbZ_reg_set) {
+               spe_shli(f, fbZ_reg, fbZ_reg, 8); /* fbZ = fbZ << 8 */
+               spe_or(f, fbZS_reg, fbS_reg, fbZ_reg); /* fbZS = fbS | fbZ */
+            }
+            else if (fbS_reg_set) {
+               spe_move(f, fbZS_reg, fbS_reg);
+            }
+            else {
+               spe_shli(f, fbZS_reg, fbZ_reg, 8); /* fbZS = fbZ << 8 */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z32_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            if (fbZ_reg_set) {
+               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_Z16_UNORM) {
-            spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            if (fbZ_reg_set) {
+               spe_move(f, fbZS_reg, fbZ_reg); /* fbZS = fbZ */
+            }
          }
          else if (zs_format == PIPE_FORMAT_S8_UNORM) {
             ASSERT(0);   /* XXX to do */
@@ -1341,21 +2080,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
          spe_stqx(f, fbZS_reg, depth_tile_reg, quad_offset_reg);
       }
 
-      spe_release_register(f, fbZ_reg);
-      spe_release_register(f, fbS_reg);
+      release_optional_register(f, &fbZ_reg_set, fbZ_reg);
+      release_optional_register(f, &fbS_reg_set, fbS_reg);
    }
 
-
    /* Get framebuffer quad/colors.  We'll need these for blending,
     * color masking, and to obey the quad/pixel mask.
     * Load: fbRGBA_reg = memory[color_tile + quad_offset]
     * Note: if mask={~0,~0,~0,~0} and we're not blending or colormasking
     * we could skip this load.
     */
+   spe_comment(f, 0, "Fetch quad colors from tile");
    spe_lqx(f, fbRGBA_reg, color_tile_reg, quad_offset_reg);
 
-
    if (blend->blend_enable) {
+      spe_comment(f, 0, "Perform blending");
       gen_blend(blend, blend_color, f, color_format,
                 fragR_reg, fragG_reg, fragB_reg, fragA_reg, fbRGBA_reg);
    }
@@ -1369,19 +2108,21 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       int rgba_reg = spe_allocate_available_register(f);
 
       /* Pack four float colors as four 32-bit int colors */
+      spe_comment(f, 0, "Convert float quad colors to packed int framebuffer colors");
       gen_pack_colors(f, color_format,
                       fragR_reg, fragG_reg, fragB_reg, fragA_reg,
                       rgba_reg);
 
       if (blend->logicop_enable) {
+         spe_comment(f, 0, "Compute logic op");
          gen_logicop(blend, f, rgba_reg, fbRGBA_reg);
       }
 
       if (blend->colormask != PIPE_MASK_RGBA) {
+         spe_comment(f, 0, "Compute color mask");
          gen_colormask(f, blend->colormask, color_format, rgba_reg, fbRGBA_reg);
       }
 
-
       /* Mix fragment colors with framebuffer colors using the quad/pixel mask:
        * if (mask[i])
        *    rgba[i] = rgba[i];
@@ -1393,6 +2134,7 @@ cell_gen_fragment_function(struct cell_context *cell, struct spe_function *f)
       /* Store updated quad in tile:
        * memory[color_tile + quad_offset] = rgba_reg;
        */
+      spe_comment(f, 0, "Store quad colors into color tile");
       spe_stqx(f, rgba_reg, color_tile_reg, quad_offset_reg);
 
       spe_release_register(f, rgba_reg);
diff --git a/src/gallium/drivers/cell/ppu/cell_pipe_state.c b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
index 8c55b8e0933..2e3086c4fae 100644
--- a/src/gallium/drivers/cell/ppu/cell_pipe_state.c
+++ b/src/gallium/drivers/cell/ppu/cell_pipe_state.c
@@ -258,8 +258,6 @@ cell_set_sampler_textures(struct pipe_context *pipe,
    }
    cell->num_textures = num;
 
-   cell_update_texture_mapping(cell);
-
    cell->dirty |= CELL_NEW_TEXTURE;
 }
 
diff --git a/src/gallium/drivers/cell/ppu/cell_render.c b/src/gallium/drivers/cell/ppu/cell_render.c
index dd25ae880e5..79cb8df82fa 100644
--- a/src/gallium/drivers/cell/ppu/cell_render.c
+++ b/src/gallium/drivers/cell/ppu/cell_render.c
@@ -152,6 +152,7 @@ cell_flush_prim_buffer(struct cell_context *cell)
       struct cell_command_render *render = &cell_global.command[i].render;
       render->prim_type = PIPE_PRIM_TRIANGLES;
       render->num_verts = cell->prim_buffer.num_verts;
+      render->front_winding = cell->rasterizer->front_winding;
       render->vertex_size = cell->vertex_info->size * 4;
       render->xmin = cell->prim_buffer.xmin;
       render->ymin = cell->prim_buffer.ymin;
diff --git a/src/gallium/drivers/cell/ppu/cell_screen.c b/src/gallium/drivers/cell/ppu/cell_screen.c
index 47ba6fa2909..d2235579507 100644
--- a/src/gallium/drivers/cell/ppu/cell_screen.c
+++ b/src/gallium/drivers/cell/ppu/cell_screen.c
@@ -76,11 +76,11 @@ cell_get_param(struct pipe_screen *screen, int param)
    case PIPE_CAP_TEXTURE_SHADOW_MAP:
       return 10;
    case PIPE_CAP_MAX_TEXTURE_2D_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    case PIPE_CAP_MAX_TEXTURE_3D_LEVELS:
       return 8;  /* max 128x128x128 */
    case PIPE_CAP_MAX_TEXTURE_CUBE_LEVELS:
-      return 12; /* max 2Kx2K */
+      return CELL_MAX_TEXTURE_LEVELS;
    default:
       return 10;
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_state.h b/src/gallium/drivers/cell/ppu/cell_state.h
index a7771a55a31..b193170f9ce 100644
--- a/src/gallium/drivers/cell/ppu/cell_state.h
+++ b/src/gallium/drivers/cell/ppu/cell_state.h
@@ -44,8 +44,9 @@
 #define CELL_NEW_TEXTURE       0x800
 #define CELL_NEW_VERTEX        0x1000
 #define CELL_NEW_VS            0x2000
-#define CELL_NEW_CONSTANTS     0x4000
-#define CELL_NEW_VERTEX_INFO   0x8000
+#define CELL_NEW_VS_CONSTANTS  0x4000
+#define CELL_NEW_FS_CONSTANTS  0x8000
+#define CELL_NEW_VERTEX_INFO   0x10000
 
 
 extern void
diff --git a/src/gallium/drivers/cell/ppu/cell_state_emit.c b/src/gallium/drivers/cell/ppu/cell_state_emit.c
index f35893537bf..bb694aa1073 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_emit.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_emit.c
@@ -25,6 +25,7 @@
  * 
  **************************************************************************/
 
+#include "pipe/p_inlines.h"
 #include "util/u_memory.h"
 #include "cell_context.h"
 #include "cell_gen_fragment.h"
@@ -36,6 +37,79 @@
 #include "draw/draw_private.h"
 
 
+/**
+ * Find/create a cell_command_fragment_ops object corresponding to the
+ * current blend/stencil/z/colormask/etc. state.
+ */
+static struct cell_command_fragment_ops *
+lookup_fragment_ops(struct cell_context *cell)
+{
+   struct cell_fragment_ops_key key;
+   struct cell_command_fragment_ops *ops;
+
+   /*
+    * Build key
+    */
+   memset(&key, 0, sizeof(key));
+   key.blend = *cell->blend;
+   key.dsa = *cell->depth_stencil;
+
+   if (cell->framebuffer.cbufs[0])
+      key.color_format = cell->framebuffer.cbufs[0]->format;
+   else
+      key.color_format = PIPE_FORMAT_NONE;
+
+   if (cell->framebuffer.zsbuf)
+      key.zs_format = cell->framebuffer.zsbuf->format;
+   else
+      key.zs_format = PIPE_FORMAT_NONE;
+
+   /*
+    * Look up key in cache.
+    */
+   ops = (struct cell_command_fragment_ops *)
+      util_keymap_lookup(cell->fragment_ops_cache, &key);
+
+   /*
+    * If not found, create/save new fragment ops command.
+    */
+   if (!ops) {
+      struct spe_function spe_code;
+
+      if (0)
+         debug_printf("**** Create New Fragment Ops\n");
+
+      /* Prepare the buffer that will hold the generated code. */
+      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
+
+      /* generate new code */
+      cell_gen_fragment_function(cell, &spe_code);
+
+      /* alloc new fragment ops command */
+      ops = CALLOC_STRUCT(cell_command_fragment_ops);
+
+      /* populate the new cell_command_fragment_ops object */
+      ops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
+      memcpy(ops->code, spe_code.store, spe_code_size(&spe_code));
+      ops->dsa = *cell->depth_stencil;
+      ops->blend = *cell->blend;
+
+      /* insert cell_command_fragment_ops object into keymap/cache */
+      util_keymap_insert(cell->fragment_ops_cache, &key, ops, NULL);
+
+      /* release rtasm buffer */
+      spe_release_func(&spe_code);
+   }
+   else {
+      if (0)
+         debug_printf("**** Re-use Fragment Ops\n");
+   }
+
+   return ops;
+}
+
+
+
 static void
 emit_state_cmd(struct cell_context *cell, uint cmd,
                const void *state, uint state_size)
@@ -89,31 +163,31 @@ cell_emit_state(struct cell_context *cell)
       }
    }
 
+   if (cell->dirty & (CELL_NEW_FS_CONSTANTS)) {
+      const uint shader = PIPE_SHADER_FRAGMENT;
+      const uint num_const = cell->constants[shader].size / sizeof(float);
+      uint i, j;
+      float *buf = cell_batch_alloc(cell, 16 + num_const * sizeof(float));
+      uint64_t *ibuf = (uint64_t *) buf;
+      const float *constants = pipe_buffer_map(cell->pipe.screen,
+                                               cell->constants[shader].buffer,
+                                               PIPE_BUFFER_USAGE_CPU_READ);
+      ibuf[0] = CELL_CMD_STATE_FS_CONSTANTS;
+      ibuf[1] = num_const;
+      j = 4;
+      for (i = 0; i < num_const; i++) {
+         buf[j++] = constants[i];
+      }
+      pipe_buffer_unmap(cell->pipe.screen, cell->constants[shader].buffer);
+   }
+
    if (cell->dirty & (CELL_NEW_FRAMEBUFFER |
                       CELL_NEW_DEPTH_STENCIL |
                       CELL_NEW_BLEND)) {
-      /* XXX we don't want to always do codegen here.  We should have
-       * a hash/lookup table to cache previous results...
-       */
-      struct cell_command_fragment_ops *fops
-            = cell_batch_alloc(cell, sizeof(*fops));
-      struct spe_function spe_code;
-
-      /* Prepare the buffer that will hold the generated code. */
-      spe_init_func(&spe_code, SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-
-      /* generate new code */
-      cell_gen_fragment_function(cell, &spe_code);
-
-      /* put the new code into the batch buffer */
-      fops->opcode = CELL_CMD_STATE_FRAGMENT_OPS;
-      memcpy(&fops->code, spe_code.store,
-             SPU_MAX_FRAGMENT_OPS_INSTS * SPE_INST_SIZE);
-      fops->dsa = *cell->depth_stencil;
-      fops->blend = *cell->blend;
-
-      /* free codegen buffer */
-      spe_release_func(&spe_code);
+      struct cell_command_fragment_ops *fops, *fops_cmd;
+      fops_cmd = cell_batch_alloc(cell, sizeof(*fops_cmd));
+      fops = lookup_fragment_ops(cell);
+      memcpy(fops_cmd, fops, sizeof(*fops));
    }
 
    if (cell->dirty & CELL_NEW_SAMPLER) {
@@ -137,14 +211,24 @@ cell_emit_state(struct cell_context *cell)
          texture->opcode = CELL_CMD_STATE_TEXTURE;
          texture->unit = i;
          if (cell->texture[i]) {
-            texture->start = cell->texture[i]->tiled_data;
-            texture->width = cell->texture[i]->base.width[0];
-            texture->height = cell->texture[i]->base.height[0];
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = cell->texture[i]->tiled_data[level];
+               texture->width[level] = cell->texture[i]->base.width[level];
+               texture->height[level] = cell->texture[i]->base.height[level];
+               texture->depth[level] = cell->texture[i]->base.depth[level];
+            }
+            texture->target = cell->texture[i]->base.target;
          }
          else {
-            texture->start = NULL;
-            texture->width = 1;
-            texture->height = 1;
+            uint level;
+            for (level = 0; level < CELL_MAX_TEXTURE_LEVELS; level++) {
+               texture->start[level] = NULL;
+               texture->width[level] = 0;
+               texture->height[level] = 0;
+               texture->depth[level] = 0;
+            }
+            texture->target = 0;
          }
       }
    }
diff --git a/src/gallium/drivers/cell/ppu/cell_state_shader.c b/src/gallium/drivers/cell/ppu/cell_state_shader.c
index 3a0d066da2a..54a17eaf2b7 100644
--- a/src/gallium/drivers/cell/ppu/cell_state_shader.c
+++ b/src/gallium/drivers/cell/ppu/cell_state_shader.c
@@ -197,7 +197,10 @@ cell_set_constant_buffer(struct pipe_context *pipe,
                         buf->buffer);
    cell->constants[shader].size = buf->size;
 
-   cell->dirty |= CELL_NEW_CONSTANTS;
+   if (shader == PIPE_SHADER_VERTEX)
+      cell->dirty |= CELL_NEW_VS_CONSTANTS;
+   else if (shader == PIPE_SHADER_FRAGMENT)
+      cell->dirty |= CELL_NEW_FS_CONSTANTS;
 }
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.c b/src/gallium/drivers/cell/ppu/cell_texture.c
index b6590dfb86e..230e1925733 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.c
+++ b/src/gallium/drivers/cell/ppu/cell_texture.c
@@ -52,20 +52,22 @@ static unsigned minify( unsigned d )
 
 
 static void
-cell_texture_layout(struct cell_texture * spt)
+cell_texture_layout(struct cell_texture *ct)
 {
-   struct pipe_texture *pt = &spt->base;
+   struct pipe_texture *pt = &ct->base;
    unsigned level;
    unsigned width = pt->width[0];
    unsigned height = pt->height[0];
    unsigned depth = pt->depth[0];
 
-   spt->buffer_size = 0;
+   ct->buffer_size = 0;
 
    for ( level = 0 ; level <= pt->last_level ; level++ ) {
       unsigned size;
       unsigned w_tile, h_tile;
 
+      assert(level < CELL_MAX_TEXTURE_LEVELS);
+
       /* width, height, rounded up to tile size */
       w_tile = align(width, TILE_SIZE);
       h_tile = align(height, TILE_SIZE);
@@ -76,9 +78,9 @@ cell_texture_layout(struct cell_texture * spt)
       pt->nblocksx[level] = pf_get_nblocksx(&pt->block, w_tile);  
       pt->nblocksy[level] = pf_get_nblocksy(&pt->block, h_tile);  
 
-      spt->stride[level] = pt->nblocksx[level] * pt->block.size;
+      ct->stride[level] = pt->nblocksx[level] * pt->block.size;
 
-      spt->level_offset[level] = spt->buffer_size;
+      ct->level_offset[level] = ct->buffer_size;
 
       size = pt->nblocksx[level] * pt->nblocksy[level] * pt->block.size;
       if (pt->target == PIPE_TEXTURE_CUBE)
@@ -86,7 +88,7 @@ cell_texture_layout(struct cell_texture * spt)
       else
          size *= depth;
 
-      spt->buffer_size += size;
+      ct->buffer_size += size;
 
       width  = minify(width);
       height = minify(height);
@@ -100,26 +102,25 @@ cell_texture_create(struct pipe_screen *screen,
                     const struct pipe_texture *templat)
 {
    struct pipe_winsys *ws = screen->winsys;
-   struct cell_texture *spt = CALLOC_STRUCT(cell_texture);
-   if (!spt)
+   struct cell_texture *ct = CALLOC_STRUCT(cell_texture);
+   if (!ct)
       return NULL;
 
-   spt->base = *templat;
-   spt->base.refcount = 1;
-   spt->base.screen = screen;
+   ct->base = *templat;
+   ct->base.refcount = 1;
+   ct->base.screen = screen;
 
-   cell_texture_layout(spt);
+   cell_texture_layout(ct);
 
-   spt->buffer = ws->buffer_create(ws, 32,
-                                   PIPE_BUFFER_USAGE_PIXEL,
-                                   spt->buffer_size);
+   ct->buffer = ws->buffer_create(ws, 32, PIPE_BUFFER_USAGE_PIXEL,
+                                  ct->buffer_size);
 
-   if (!spt->buffer) {
-      FREE(spt);
+   if (!ct->buffer) {
+      FREE(ct);
       return NULL;
    }
 
-   return &spt->base;
+   return &ct->base;
 }
 
 
@@ -135,29 +136,116 @@ cell_texture_release(struct pipe_screen *screen,
        __FUNCTION__, (void *) *pt, (*pt)->refcount - 1);
    */
    if (--(*pt)->refcount <= 0) {
-      struct cell_texture *spt = cell_texture(*pt);
+      struct cell_texture *ct = cell_texture(*pt);
+      uint i;
 
       /*
-      DBG("%s deleting %p\n", __FUNCTION__, (void *) spt);
+      DBG("%s deleting %p\n", __FUNCTION__, (void *) ct);
       */
 
-      pipe_buffer_reference(screen, &spt->buffer, NULL);
+      pipe_buffer_reference(screen, &ct->buffer, NULL);
+
+      for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+         if (ct->tiled_data[i]) {
+            align_free(ct->tiled_data[i]);
+         }
+      }
 
-      FREE(spt);
+      FREE(ct);
    }
    *pt = NULL;
 }
 
 
-#if 0
+
+/**
+ * Convert image from linear layout to tiled layout.  4-byte pixels.
+ */
+static void
+twiddle_image_uint(uint w, uint h, uint tile_size, uint *dst,
+                   uint src_stride, const uint *src)
+{
+   const uint tile_size2 = tile_size * tile_size;
+   const uint h_t = (h + tile_size - 1) / tile_size;
+   const uint w_t = (w + tile_size - 1) / tile_size;
+
+   uint it, jt;  /* tile counters */
+   uint i, j;    /* intra-tile counters */
+
+   src_stride /= 4; /* convert from bytes to pixels */
+
+   /* loop over dest tiles */
+   for (it = 0; it < h_t; it++) {
+      for (jt = 0; jt < w_t; jt++) {
+         /* start of dest tile: */
+         uint *tdst = dst + (it * w_t + jt) * tile_size2;
+
+         /* compute size of this tile (may be smaller than tile_size) */
+         /* XXX note: a compiler bug was found here. That's why the code
+          * looks as it does.
+          */
+         uint tile_width = w - jt * tile_size;
+         tile_width = MIN2(tile_width, tile_size);
+         uint tile_height = h - it * tile_size;
+         tile_height = MIN2(tile_height, tile_size);
+
+         /* loop over texels in the tile */
+         for (i = 0; i < tile_height; i++) {
+            for (j = 0; j < tile_width; j++) {
+               const uint srci = it * tile_size + i;
+               const uint srcj = jt * tile_size + j;
+               ASSERT(srci < h);
+               ASSERT(srcj < w);
+               tdst[i * tile_size + j] = src[srci * src_stride + srcj];
+            }
+         }
+      }
+   }
+}
+
+
+/**
+ * Convert linear texture image data to tiled format for SPU usage.
+ */
 static void
-cell_texture_update(struct pipe_context *pipe, struct pipe_texture *texture,
-                    uint face, uint levelsMask)
+cell_twiddle_texture(struct pipe_screen *screen,
+                     struct pipe_surface *surface)
 {
-   /* XXX TO DO:  re-tile the texture data ... */
+   struct cell_texture *ct = cell_texture(surface->texture);
+   const uint level = surface->level;
+   const uint texWidth = ct->base.width[level];
+   const uint texHeight = ct->base.height[level];
+   const uint bufWidth = align(texWidth, TILE_SIZE);
+   const uint bufHeight = align(texHeight, TILE_SIZE);
+   const void *map = pipe_buffer_map(screen, surface->buffer,
+                                     PIPE_BUFFER_USAGE_CPU_READ);
+   const uint *src = (const uint *) ((const ubyte *) map + surface->offset);
+
+   switch (ct->base.format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      {
+         int numFaces = ct->base.target == PIPE_TEXTURE_CUBE ? 6 : 1;
+         int offset = bufWidth * bufHeight * 4 * surface->face;
+         uint *dst;
+
+         if (!ct->tiled_data[level]) {
+            ct->tiled_data[level] =
+               align_malloc(bufWidth * bufHeight * 4 * numFaces, 16);
+         }
+
+         dst = (uint *) ((ubyte *) ct->tiled_data[level] + offset);
 
+         twiddle_image_uint(texWidth, texHeight, TILE_SIZE, dst,
+                            surface->stride, src);
+      }
+      break;
+   default:
+      printf("Cell: twiddle unsupported texture format\n");
+      ;
+   }
+
+   pipe_buffer_unmap(screen, surface->buffer);
 }
-#endif
 
 
 static struct pipe_surface *
@@ -167,22 +255,22 @@ cell_get_tex_surface(struct pipe_screen *screen,
                      unsigned usage)
 {
    struct pipe_winsys *ws = screen->winsys;
-   struct cell_texture *spt = cell_texture(pt);
+   struct cell_texture *ct = cell_texture(pt);
    struct pipe_surface *ps;
 
    ps = ws->surface_alloc(ws);
    if (ps) {
       assert(ps->refcount);
       assert(ps->winsys);
-      winsys_buffer_reference(ws, &ps->buffer, spt->buffer);
+      winsys_buffer_reference(ws, &ps->buffer, ct->buffer);
       ps->format = pt->format;
       ps->block = pt->block;
       ps->width = pt->width[level];
       ps->height = pt->height[level];
       ps->nblocksx = pt->nblocksx[level];
       ps->nblocksy = pt->nblocksy[level];
-      ps->stride = spt->stride[level];
-      ps->offset = spt->level_offset[level];
+      ps->stride = ct->stride[level];
+      ps->offset = ct->level_offset[level];
       ps->usage = usage;
 
       /* XXX may need to override usage flags (see sp_texture.c) */
@@ -206,118 +294,12 @@ cell_get_tex_surface(struct pipe_screen *screen,
 }
 
 
-
-/**
- * Copy tile data from linear layout to tiled layout.
- * XXX this should be rolled into the future surface-creation code.
- * XXX also need "untile" code...
- */
-static void
-tile_copy_data(uint w, uint h, uint tile_size, uint *dst, const uint *src)
-{
-   const uint tile_size2 = tile_size * tile_size;
-   const uint h_t = h / tile_size, w_t = w / tile_size;
-
-   uint it, jt;  /* tile counters */
-   uint i, j;    /* intra-tile counters */
-
-   /* loop over dest tiles */
-   for (it = 0; it < h_t; it++) {
-      for (jt = 0; jt < w_t; jt++) {
-         /* start of dest tile: */
-         uint *tdst = dst + (it * w_t + jt) * tile_size2;
-         /* loop over texels in the tile */
-         for (i = 0; i < tile_size; i++) {
-            for (j = 0; j < tile_size; j++) {
-               const uint srci = it * tile_size + i;
-               const uint srcj = jt * tile_size + j;
-               *tdst++ = src[srci * w + srcj];
-            }
-         }
-      }
-   }
-}
-
-
-
-/**
- * Convert linear texture image data to tiled format for SPU usage.
- * XXX recast this in terms of pipe_surfaces (aka texture views).
- */
-static void
-cell_tile_texture(struct cell_context *cell,
-                  struct cell_texture *texture)
-{
-   struct pipe_screen *screen = cell->pipe.screen;
-   uint face = 0, level = 0, zslice = 0;
-   struct pipe_surface *surf;
-   const uint w = texture->base.width[0], h = texture->base.height[0];
-   const uint *src;
-
-   /* temporary restrictions: */
-   assert(w >= TILE_SIZE);
-   assert(h >= TILE_SIZE);
-   assert(w % TILE_SIZE == 0);
-   assert(h % TILE_SIZE == 0);
-
-   surf = screen->get_tex_surface(screen, &texture->base, face, level, zslice,
-                                  PIPE_BUFFER_USAGE_CPU_WRITE);
-   ASSERT(surf);
-
-   src = (const uint *) pipe_surface_map(surf, PIPE_BUFFER_USAGE_CPU_WRITE);
-
-   if (texture->tiled_data) {
-      align_free(texture->tiled_data);
-   }
-   texture->tiled_data = align_malloc(w * h * 4, 16);
-
-   tile_copy_data(w, h, TILE_SIZE, texture->tiled_data, src);
-
-   pipe_surface_unmap(surf);
-
-   pipe_surface_reference(&surf, NULL);
-}
-
-
-void
-cell_update_texture_mapping(struct cell_context *cell)
-{
-#if 0
-   uint face = 0, level = 0, zslice = 0;
-#endif
-   uint i;
-
-   for (i = 0; i < CELL_MAX_SAMPLERS; i++) {
-      if (cell->texture[i])
-         cell_tile_texture(cell, cell->texture[i]);
-   }
-
-#if 0
-   if (cell->tex_surf && cell->tex_map) {
-      pipe_surface_unmap(cell->tex_surf);
-      cell->tex_map = NULL;
-   }
-
-   /* XXX free old surface */
-
-   cell->tex_surf = cell_get_tex_surface(&cell->pipe,
-                                         &cell->texture[0]->base,
-                                         face, level, zslice);
-
-   cell->tex_map = pipe_surface_map(cell->tex_surf);
-#endif
-}
-
-
 static void 
 cell_tex_surface_release(struct pipe_screen *screen, 
                          struct pipe_surface **s)
 {
-   /* Effectively do the texture_update work here - if texture images
-    * needed post-processing to put them into hardware layout, this is
-    * where it would happen.  For softpipe, nothing to do.
-    */
-   assert ((*s)->texture);
+   /* XXX if done rendering to teximage, re-tile */
+
    pipe_texture_reference(&(*s)->texture, NULL); 
 
    screen->winsys->surface_release(screen->winsys, s);
@@ -325,9 +307,9 @@ cell_tex_surface_release(struct pipe_screen *screen,
 
 
 static void *
-cell_surface_map( struct pipe_screen *screen,
-                  struct pipe_surface *surface,
-                  unsigned flags )
+cell_surface_map(struct pipe_screen *screen,
+                 struct pipe_surface *surface,
+                 unsigned flags)
 {
    ubyte *map;
 
@@ -339,22 +321,8 @@ cell_surface_map( struct pipe_screen *screen,
    map = pipe_buffer_map( screen, surface->buffer, flags );
    if (map == NULL)
       return NULL;
-
-   /* May want to different things here depending on read/write nature
-    * of the map:
-    */
-   if (surface->texture &&
-       (flags & PIPE_BUFFER_USAGE_CPU_WRITE)) 
-   {
-      /* Do something to notify sharing contexts of a texture change.
-       * In softpipe, that would mean flushing the texture cache.
-       */
-#if 0
-      cell_screen(screen)->timestamp++;
-#endif
-   }
-   
-   return map + surface->offset;
+   else
+      return (void *) (map + surface->offset);
 }
 
 
@@ -362,17 +330,21 @@ static void
 cell_surface_unmap(struct pipe_screen *screen,
                    struct pipe_surface *surface)
 {
-   pipe_buffer_unmap( screen, surface->buffer );
-}
+   struct cell_texture *ct = cell_texture(surface->texture);
 
+   assert(ct);
 
-void
-cell_init_texture_functions(struct cell_context *cell)
-{
-   /*cell->pipe.texture_update = cell_texture_update;*/
+   if ((ct->base.tex_usage & PIPE_TEXTURE_USAGE_SAMPLER) &&
+       (surface->usage & PIPE_BUFFER_USAGE_CPU_WRITE)) {
+      /* convert from linear to tiled layout */
+      cell_twiddle_texture(screen, surface);
+   }
+
+   pipe_buffer_unmap( screen, surface->buffer );
 }
 
 
+
 void
 cell_init_screen_texture_funcs(struct pipe_screen *screen)
 {
diff --git a/src/gallium/drivers/cell/ppu/cell_texture.h b/src/gallium/drivers/cell/ppu/cell_texture.h
index 6d37e95ebce..a0757091b07 100644
--- a/src/gallium/drivers/cell/ppu/cell_texture.h
+++ b/src/gallium/drivers/cell/ppu/cell_texture.h
@@ -40,15 +40,15 @@ struct cell_texture
 {
    struct pipe_texture base;
 
-   unsigned long level_offset[PIPE_MAX_TEXTURE_LEVELS];
-   unsigned long stride[PIPE_MAX_TEXTURE_LEVELS];
+   unsigned long level_offset[CELL_MAX_TEXTURE_LEVELS];
+   unsigned long stride[CELL_MAX_TEXTURE_LEVELS];
 
    /* The data is held here:
     */
    struct pipe_buffer *buffer;
    unsigned long buffer_size;
 
-   void *tiled_data;  /* XXX this may be temporary */ /*ALIGN16*/
+   void *tiled_data[CELL_MAX_TEXTURE_LEVELS];  /* XXX this may be temporary */ /*ALIGN16*/
 };
 
 
@@ -62,14 +62,6 @@ cell_texture(struct pipe_texture *pt)
 
 
 extern void
-cell_update_texture_mapping(struct cell_context *cell);
-
-
-extern void
-cell_init_texture_functions(struct cell_context *cell);
-
-
-extern void
 cell_init_screen_texture_funcs(struct pipe_screen *screen);
 
 
diff --git a/src/gallium/drivers/cell/ppu/cell_vbuf.c b/src/gallium/drivers/cell/ppu/cell_vbuf.c
index aa63435b934..578ddf62dcb 100644
--- a/src/gallium/drivers/cell/ppu/cell_vbuf.c
+++ b/src/gallium/drivers/cell/ppu/cell_vbuf.c
@@ -214,6 +214,7 @@ cell_vbuf_draw(struct vbuf_render *vbr,
 
       render->opcode = CELL_CMD_RENDER;
       render->prim_type = cvbr->prim;
+      render->front_winding = cell->rasterizer->front_winding;
 
       render->num_indexes = nr_indices;
       render->min_index = min_index;
diff --git a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
index 566df7f59e3..18969005b02 100644
--- a/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
+++ b/src/gallium/drivers/cell/ppu/cell_vertex_fetch.c
@@ -73,8 +73,8 @@ emit_matrix_transpose(struct spe_function *p,
    int col3;
 
 
-   spe_lqd(p, shuf_hi, shuf_ptr, 3);
-   spe_lqd(p, shuf_lo, shuf_ptr, 4);
+   spe_lqd(p, shuf_hi, shuf_ptr, 3*16);
+   spe_lqd(p, shuf_lo, shuf_ptr, 4*16);
    spe_shufb(p, t1, row0, row2, shuf_hi);
    spe_shufb(p, t2, row0, row2, shuf_lo);
 
@@ -122,13 +122,13 @@ emit_matrix_transpose(struct spe_function *p,
     */
    switch (count) {
    case 4:
-      spe_stqd(p, col3, dest_ptr, 3);
+      spe_stqd(p, col3, dest_ptr, 3 * 16);
    case 3:
-      spe_stqd(p, col2, dest_ptr, 2);
+      spe_stqd(p, col2, dest_ptr, 2 * 16);
    case 2:
-      spe_stqd(p, col1, dest_ptr, 1);
+      spe_stqd(p, col1, dest_ptr, 1 * 16);
    case 1:
-      spe_stqd(p, col0, dest_ptr, 0);
+      spe_stqd(p, col0, dest_ptr, 0 * 16);
    }
 
 
@@ -166,17 +166,17 @@ emit_fetch(struct spe_function *p,
    float scale_signed = 0.0;
    float scale_unsigned = 0.0;
 
-   spe_lqd(p, v0, in_ptr, 0 + offset[0]);
-   spe_lqd(p, v1, in_ptr, 1 + offset[0]);
-   spe_lqd(p, v2, in_ptr, 2 + offset[0]);
-   spe_lqd(p, v3, in_ptr, 3 + offset[0]);
+   spe_lqd(p, v0, in_ptr, (0 + offset[0]) * 16);
+   spe_lqd(p, v1, in_ptr, (1 + offset[0]) * 16);
+   spe_lqd(p, v2, in_ptr, (2 + offset[0]) * 16);
+   spe_lqd(p, v3, in_ptr, (3 + offset[0]) * 16);
    offset[0] += 4;
    
    switch (bytes) {
    case 1:
       scale_signed = 1.0f / 127.0f;
       scale_unsigned = 1.0f / 255.0f;
-      spe_lqd(p, tmp, shuf_ptr, 1);
+      spe_lqd(p, tmp, shuf_ptr, 1 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -185,7 +185,7 @@ emit_fetch(struct spe_function *p,
    case 2:
       scale_signed = 1.0f / 32767.0f;
       scale_unsigned = 1.0f / 65535.0f;
-      spe_lqd(p, tmp, shuf_ptr, 2);
+      spe_lqd(p, tmp, shuf_ptr, 2 * 16);
       spe_shufb(p, v0, v0, v0, tmp);
       spe_shufb(p, v1, v1, v1, tmp);
       spe_shufb(p, v2, v2, v2, tmp);
@@ -241,11 +241,11 @@ emit_fetch(struct spe_function *p,
 
    switch (count) {
    case 1:
-      spe_stqd(p, float_zero, out_ptr, 1);
+      spe_stqd(p, float_zero, out_ptr, 1 * 16);
    case 2:
-      spe_stqd(p, float_zero, out_ptr, 2);
+      spe_stqd(p, float_zero, out_ptr, 2 * 16);
    case 3:
-      spe_stqd(p, float_one, out_ptr, 3);
+      spe_stqd(p, float_one, out_ptr, 3 * 16);
    }
 
    if (float_zero != -1) {
diff --git a/src/gallium/drivers/cell/spu/spu_colorpack.h b/src/gallium/drivers/cell/spu/spu_colorpack.h
index fd8dc6ded3e..d7ce0055248 100644
--- a/src/gallium/drivers/cell/spu/spu_colorpack.h
+++ b/src/gallium/drivers/cell/spu/spu_colorpack.h
@@ -31,6 +31,7 @@
 #define SPU_COLORPACK_H
 
 
+#include <transpose_matrix4x4.h>
 #include <spu_intrinsics.h>
 
 
@@ -84,10 +85,10 @@ spu_unpack_B8G8R8A8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             10, 10, 10, 10,
-                             5, 5, 5, 5,
+                             2, 2, 2, 2,
+                             1, 1, 1, 1,
                              0, 0, 0, 0,
-                             15, 15, 15, 15}) );
+                             3, 3, 3, 3}) );
    return spu_convtf(color_u4, 32);
 }
 
@@ -98,13 +99,47 @@ spu_unpack_A8R8G8B8(uint color)
    vector unsigned int color_u4 = spu_splats(color);
    color_u4 = spu_shuffle(color_u4, color_u4,
                           ((vector unsigned char) {
-                             5, 5, 5, 5,
-                             10, 10, 10, 10,
-                             15, 15, 15, 15,
+                             1, 1, 1, 1,
+                             2, 2, 2, 2,
+                             3, 3, 3, 3,
                              0, 0, 0, 0}) );
-
    return spu_convtf(color_u4, 32);
 }
 
 
+/**
+ * \param color_in - array of 32-bit packed ARGB colors
+ * \param color_out - returns float colors in RRRR, GGGG, BBBB, AAAA order
+ */
+static INLINE void
+spu_unpack_A8R8G8B8_transpose4(const vector unsigned int color_in[4],
+                               vector float color_out[4])
+{
+   vector unsigned int c0;
+
+   c0 = spu_shuffle(color_in[0], color_in[0],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[0] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[1], color_in[1],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[1] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[2], color_in[2],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[2] = spu_convtf(c0, 32);
+
+   c0 = spu_shuffle(color_in[3], color_in[3],
+                    ((vector unsigned char) {
+                       1, 1, 1, 1,  2, 2, 2, 2,  3, 3, 3, 3,  0, 0, 0, 0}) );
+   color_out[3] = spu_convtf(c0, 32);
+
+   _transpose_matrix4x4(color_out, color_out);
+}
+
+
+
 #endif /* SPU_COLORPACK_H */
diff --git a/src/gallium/drivers/cell/spu/spu_command.c b/src/gallium/drivers/cell/spu/spu_command.c
index ec9da5d8870..c28677ebf87 100644
--- a/src/gallium/drivers/cell/spu/spu_command.c
+++ b/src/gallium/drivers/cell/spu/spu_command.c
@@ -59,6 +59,14 @@ static unsigned char attribute_fetch_code_buffer[136 * PIPE_MAX_ATTRIBS]
 
 
 
+static INLINE int
+align(int value, int alignment)
+{
+   return (value + alignment - 1) & ~(alignment - 1);
+}
+
+
+
 /**
  * Tell the PPU that this SPU has finished copying a buffer to
  * local store and that it may be reused by the PPU.
@@ -231,6 +239,25 @@ cmd_state_fragment_program(const struct cell_command_fragment_program *fp)
 }
 
 
+/** Load fragment shader constants from the batch buffer; returns the next buffer position. */
+static uint
+cmd_state_fs_constants(const uint64_t *buffer, uint pos)
+{
+   const uint num_const = buffer[pos + 1];  /* number of floats that follow */
+   const float *constants = (const float *) &buffer[pos + 2];
+   uint i;
+
+   DEBUG_PRINTF("CMD_STATE_FS_CONSTANTS (%u)\n", num_const);
+   ASSERT(num_const <= sizeof(spu.constants) / sizeof(spu.constants[0]));
+   /* Expand each float to float[4] for SOA execution */
+   for (i = 0; i < num_const; i++) {
+      spu.constants[i] = spu_splats(constants[i]);
+   }
+
+   /* return new buffer pos (in 8-byte words); round up so an odd count is not cut short */
+   return pos + 2 + (num_const + 1) / 2;
+}
+
 static void
 cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 {
@@ -276,16 +303,96 @@ cmd_state_framebuffer(const struct cell_command_framebuffer *cmd)
 }
 
 
+/**
+ * The texture mask_s/t and scale_s/t fields depend on the texture size and
+ * sampler wrap modes, so recompute them for every level of the texture.
+ */
+static void
+update_tex_masks(struct spu_texture *texture,
+                 const struct pipe_sampler_state *sampler,
+                 uint unit)
+{
+   uint i;
+
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      int width = texture->level[i].width;
+      int height = texture->level[i].height;
+      /* size-1 acts as a wrap mask only for power-of-two sizes; ~0 makes the AND a no-op */
+      if (sampler->wrap_s == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_s = spu_splats(width - 1);
+      else
+         texture->level[i].mask_s = spu_splats(~0);
+
+      if (sampler->wrap_t == PIPE_TEX_WRAP_REPEAT)
+         texture->level[i].mask_t = spu_splats(height - 1);
+      else
+         texture->level[i].mask_t = spu_splats(~0);
+
+      if (sampler->normalized_coords) {
+         texture->level[i].scale_s = spu_splats((float) width);
+         texture->level[i].scale_t = spu_splats((float) height);
+      }
+      else {
+         texture->level[i].scale_s = spu_splats(1.0f);
+         texture->level[i].scale_t = spu_splats(1.0f);
+      }
+   }
+
+   /* XXX temporary hack: cube textures always get the cube sampling function */
+   if (texture->target == PIPE_TEXTURE_CUBE) {
+      spu.sample_texture4[unit] = sample_texture4_cube;
+   }
+}
+
+
 static void
 cmd_state_sampler(const struct cell_command_sampler *sampler)
 {
-   DEBUG_PRINTF("SAMPLER [%u]\n", sampler->unit);
+   const uint unit = sampler->unit;
+
+   DEBUG_PRINTF("SAMPLER [%u]\n", unit);
+
+   spu.sampler[unit] = sampler->state;
+   /* pick the sampling function used when minifying */
+   switch (spu.sampler[unit].min_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.min_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.min_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
+   }
+   /* pick the sampling function used when magnifying */
+   switch (spu.sampler[unit].mag_img_filter) {
+   case PIPE_TEX_FILTER_LINEAR:
+      spu.mag_sample_texture4[unit] = sample_texture4_bilinear;
+      break;
+   case PIPE_TEX_FILTER_ANISO:
+      /* fall-through, for now */
+   case PIPE_TEX_FILTER_NEAREST:
+      spu.mag_sample_texture4[unit] = sample_texture4_nearest;
+      break;
+   default:
+      ASSERT(0);
+   }
+   /* mipmapping on: go through the LOD sampler; off: use the mag function directly */
+   switch (spu.sampler[unit].min_mip_filter) {
+   case PIPE_TEX_MIPFILTER_NEAREST:
+   case PIPE_TEX_MIPFILTER_LINEAR:
+      spu.sample_texture4[unit] = sample_texture4_lod;
+      break;
+   case PIPE_TEX_MIPFILTER_NONE:
+      spu.sample_texture4[unit] = spu.mag_sample_texture4[unit];
+      break;
+   default:
+      ASSERT(0);
+   }
 
-   spu.sampler[sampler->unit] = sampler->state;
-   if (spu.sampler[sampler->unit].min_img_filter == PIPE_TEX_FILTER_LINEAR)
-      spu.sample_texture[sampler->unit] = sample_texture_bilinear;
-   else
-      spu.sample_texture[sampler->unit] = sample_texture_nearest;
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 }
 
 
@@ -293,24 +400,44 @@ static void
 cmd_state_texture(const struct cell_command_texture *texture)
 {
    const uint unit = texture->unit;
-   const uint width = texture->width;
-   const uint height = texture->height;
+   uint i;
+
+   /* Copy the per-level description of this unit's texture into SPU state. */
+
+   DEBUG_PRINTF("TEXTURE [%u]\n", texture->unit);
+
+   spu.texture[unit].max_level = 0;
+   spu.texture[unit].target = texture->target;
 
-   DEBUG_PRINTF("TEXTURE [%u] at %p  size %u x %u\n",
-             texture->unit, texture->start,
-             texture->width, texture->height);
+   for (i = 0; i < CELL_MAX_TEXTURE_LEVELS; i++) {
+      uint width = texture->width[i];
+      uint height = texture->height[i];
+      uint depth = texture->depth[i];
 
-   spu.texture[unit].start = texture->start;
-   spu.texture[unit].width = width;
-   spu.texture[unit].height = height;
+      DEBUG_PRINTF("  LEVEL %u: at %p  size[0] %u x %u\n", i,
+             texture->start[i], texture->width[i], texture->height[i]);
 
-   spu.texture[unit].tiles_per_row = width / TILE_SIZE;
+      spu.texture[unit].level[i].start = texture->start[i];
+      spu.texture[unit].level[i].width = width;
+      spu.texture[unit].level[i].height = height;
+      spu.texture[unit].level[i].depth = depth;
 
-   spu.texture[unit].tex_size = (vector float) { width, height, 0.0, 0.0};
-   spu.texture[unit].tex_size_mask = (vector unsigned int)
-         { width - 1, height - 1, 0, 0 };
-   spu.texture[unit].tex_size_x_mask = spu_splats(width - 1);
-   spu.texture[unit].tex_size_y_mask = spu_splats(height - 1);
+      spu.texture[unit].level[i].tiles_per_row =
+         (width + TILE_SIZE - 1) / TILE_SIZE;
+
+      spu.texture[unit].level[i].bytes_per_image =
+         4 * align(width, TILE_SIZE) * align(height, TILE_SIZE) * depth;
+
+      spu.texture[unit].level[i].max_s = spu_splats((int) width - 1);
+      spu.texture[unit].level[i].max_t = spu_splats((int) height - 1);
+      /* max_level ends up as the highest level index that has a start address */
+      if (texture->start[i])
+         spu.texture[unit].max_level = i;
+   }
+
+   /* Wrap masks and texcoord scale factors are derived from the level
+    * sizes stored above, so refresh them now. */
+   update_tex_masks(&spu.texture[unit], &spu.sampler[unit], unit);
 }
 
 
@@ -456,6 +583,9 @@ cmd_batch(uint opcode)
             pos += sizeof(*fp) / 8;
          }
          break;
+      case CELL_CMD_STATE_FS_CONSTANTS:
+         pos = cmd_state_fs_constants(buffer, pos);
+         break;
       case CELL_CMD_STATE_SAMPLER:
          {
             struct cell_command_sampler *sampler
diff --git a/src/gallium/drivers/cell/spu/spu_funcs.c b/src/gallium/drivers/cell/spu/spu_funcs.c
index b57ad3f3b81..5c3ee305d48 100644
--- a/src/gallium/drivers/cell/spu/spu_funcs.c
+++ b/src/gallium/drivers/cell/spu/spu_funcs.c
@@ -35,53 +35,96 @@
 
 #include <string.h>
 #include <libmisc.h>
-#include <cos8_v.h>
-#include <sin8_v.h>
+#include <math.h>
+#include <cos14_v.h>
+#include <sin14_v.h>
+#include <transpose_matrix4x4.h>
 
 #include "cell/common.h"
 #include "spu_main.h"
 #include "spu_funcs.h"
 
 
-#define M_PI 3.1415926
+/** Lets a function return four float vectors by value (used by spu_txp) */
+struct vec_4x4
+{
+   vector float v[4];
+};
 
 
 static vector float
 spu_cos(vector float x)
 {
-#if 0
-   static const float scale = 1.0 / (2.0 * M_PI);
-   x = x * spu_splats(scale); /* normalize */
-   return _cos8_v(x);
-#else
-   /* just pass-through to avoid trashing caller's stack */
-   return x;
-#endif
+   return _cos14_v(x);  /* from <cos14_v.h>; x is passed through unscaled */
 }
 
 static vector float
 spu_sin(vector float x)
 {
-#if 0
-   static const float scale = 1.0 / (2.0 * M_PI);
-   x = x * spu_splats(scale); /* normalize */
-   return _sin8_v(x);   /* 8-bit accuracy enough?? */
-#else
-   /* just pass-through to avoid trashing caller's stack */
-   return x;
-#endif
+   return _sin14_v(x);  /* from <sin14_v.h>; x is passed through unscaled */
 }
+
+static vector float
+spu_pow(vector float x, vector float y)
+{
+   float z0 = powf(spu_extract(x,0), spu_extract(y,0));  /* scalar powf, lane by lane */
+   float z1 = powf(spu_extract(x,1), spu_extract(y,1));
+   float z2 = powf(spu_extract(x,2), spu_extract(y,2));
+   float z3 = powf(spu_extract(x,3), spu_extract(y,3));
+   return (vector float) {z0, z1, z2, z3};  /* repack the four scalar results */
 }
 
+static vector float
+spu_exp2(vector float x)
+{
+   float z0 = powf(2.0f, spu_extract(x,0));  /* 2^x computed per lane with scalar powf */
+   float z1 = powf(2.0f, spu_extract(x,1));
+   float z2 = powf(2.0f, spu_extract(x,2));
+   float z3 = powf(2.0f, spu_extract(x,3));
+   return (vector float) {z0, z1, z2, z3};  /* repack the four scalar results */
+}
 
+static vector float
+spu_log2(vector float x)
+{
+   /*
+    * log_base_2(x) = log(x) / log(2), where log() is the natural log (logf below)
+    * 1.442695 = 1/log(2).
+    */
+   static const vector float k = {1.442695F, 1.442695F, 1.442695F, 1.442695F};
+   float z0 = logf(spu_extract(x,0));
+   float z1 = logf(spu_extract(x,1));
+   float z2 = logf(spu_extract(x,2));
+   float z3 = logf(spu_extract(x,3));
+   vector float v = (vector float) {z0, z1, z2, z3};
+   return spu_mul(v, k);  /* scale all four natural logs by 1/log(2) */
+}
+
+/** Sample texture 'unit' via its current sampler at level 0, face 0; colors are returned by value. */
+static struct vec_4x4
+spu_txp(vector float s, vector float t, vector float r, vector float q,
+        unsigned unit)
+{
+   struct vec_4x4 colors;
+   spu.sample_texture4[unit](s, t, r, q, unit, 0, 0, colors.v);
+   return colors;
+}
+
+
+/**
+ * Add named function to list of "exported" functions that will be
+ * made available to the PPU-hosted code generator.
+ */
 static void
-add_func(struct cell_spu_function_info *spu_functions,
-             const char *name, void *addr)
+export_func(struct cell_spu_function_info *spu_functions,
+            const char *name, void *addr)
 {
    uint n = spu_functions->num;
+   ASSERT(n < 16);  /* check for a free slot before names[n]/addrs[n] are written */
    ASSERT(strlen(name) < 16);
    strcpy(spu_functions->names[n], name);
    spu_functions->addrs[n] = (uint) addr;
    spu_functions->num++;
 }
 
 
@@ -99,8 +142,12 @@ return_function_info(void)
    ASSERT(sizeof(funcs) == 256); /* must be multiple of 16 bytes */
 
    funcs.num = 0;
-   add_func(&funcs, "spu_cos", &spu_cos);
-   add_func(&funcs, "spu_sin", &spu_sin);
+   export_func(&funcs, "spu_cos", &spu_cos);
+   export_func(&funcs, "spu_sin", &spu_sin);
+   export_func(&funcs, "spu_pow", &spu_pow);
+   export_func(&funcs, "spu_exp2", &spu_exp2);
+   export_func(&funcs, "spu_log2", &spu_log2);
+   export_func(&funcs, "spu_txp", &spu_txp);
 
    /* Send the function info back to the PPU / main memory */
    mfc_put((void *) &funcs,  /* src in local store */
diff --git a/src/gallium/drivers/cell/spu/spu_main.h b/src/gallium/drivers/cell/spu/spu_main.h
index 29a305232ec..eff43b870ca 100644
--- a/src/gallium/drivers/cell/spu/spu_main.h
+++ b/src/gallium/drivers/cell/spu/spu_main.h
@@ -41,6 +41,9 @@
 #define MAX_HEIGHT 1024
 
 
+#define CELL_MAX_CONSTANTS 32  /**< number of float[4] constants */
+
+
 /**
  * A tile is basically a TILE_SIZE x TILE_SIZE block of 4-byte pixels.
  * The data may be addressed through several different types.
@@ -61,8 +64,13 @@ typedef union {
 
 
 /** Function for sampling textures */
-typedef vector float (*spu_sample_texture_func)(uint unit,
-                                                vector float texcoord);
+typedef void (*spu_sample_texture4_func)(vector float s,
+                                         vector float t,
+                                         vector float r,
+                                         vector float q,
+                                         uint unit, uint level, uint face,
+                                         vector float colors[4]);  /* out: rrrr, gggg, bbbb, aaaa */
+
 
 /** Function for performing per-fragment ops */
 typedef void (*spu_fragment_ops_func)(uint x, uint y,
@@ -73,7 +81,8 @@ typedef void (*spu_fragment_ops_func)(uint x, uint y,
                                       vector float fragGreen,
                                       vector float fragBlue,
                                       vector float fragAlpha,
-                                      vector unsigned int mask);
+                                      vector unsigned int mask,
+                                      uint facing);
 
 /** Function for running fragment program */
 typedef void (*spu_fragment_program_func)(vector float *inputs,
@@ -98,15 +107,27 @@ struct spu_framebuffer
 } ALIGN16_ATTRIB;
 
 
-struct spu_texture
+/** Info for one level of a texture */
+struct spu_texture_level
 {
    void *start;
-   ushort width, height;
+   ushort width, height, depth;
    ushort tiles_per_row;
-   vector float tex_size;
-   vector unsigned int tex_size_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_x_mask; /**< == int(size - 1) */
-   vector unsigned int tex_size_y_mask; /**< == int(size - 1) */
+   uint bytes_per_image;  /**< bytes in one tiled image; used as the per-face stride */
+   /** texcoord scale factors (level size for normalized coords, else 1.0) */
+   vector float scale_s, scale_t, scale_r;
+   /** texcoord masks (if REPEAT then size-1, else ~0) */
+   vector signed int mask_s, mask_t, mask_r;
+   /** texcoord clamp limits (size - 1) */
+   vector signed int max_s, max_t, max_r;
+} ALIGN16_ATTRIB;
+
+/** Per-unit texture state: every level plus the texture target */
+struct spu_texture
+{
+   struct spu_texture_level level[CELL_MAX_TEXTURE_LEVELS];
+   uint max_level;  /**< highest level index that has a start address */
+   uint target;  /**< PIPE_TEXTURE_x */
 } ALIGN16_ATTRIB;
 
 
@@ -154,11 +175,12 @@ struct spu_global
    spu_fragment_program_func fragment_program;
 
    /** Current texture sampler function */
-   spu_sample_texture_func sample_texture[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func min_sample_texture4[CELL_MAX_SAMPLERS];
+   spu_sample_texture4_func mag_sample_texture4[CELL_MAX_SAMPLERS];
 
-   /** Fragment program constants (XXX preliminary/used) */
-#define MAX_CONSTANTS 32
-   vector float constants[MAX_CONSTANTS];
+   /** Fragment program constants */
+   vector float constants[4 * CELL_MAX_CONSTANTS];
 
 } ALIGN16_ATTRIB;
 
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
index f107764fb28..d252fa6dc18 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.c
@@ -57,7 +57,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragG,
                           vector float fragB,
                           vector float fragA,
-                          vector unsigned int mask)
+                          vector unsigned int mask,
+                          uint facing)
 {
    vector float frag_aos[4];
    unsigned int fbc0, fbc1, fbc2, fbc3 ; /* framebuffer/tile colors */
@@ -433,23 +434,23 @@ spu_fallback_fragment_ops(uint x, uint y,
       /* Form bitmask depending on color buffer format and colormask bits */
       switch (spu.fb.color_format) {
       case PIPE_FORMAT_A8R8G8B8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x00ff0000; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x0000ff00; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0x000000ff; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0xff000000; /* alpha */
          break;
       case PIPE_FORMAT_B8G8R8A8_UNORM:
-         if (spu.blend.colormask & (1<<0))
+         if (spu.blend.colormask & PIPE_MASK_R)
             cmask |= 0x0000ff00; /* red */
-         if (spu.blend.colormask & (1<<1))
+         if (spu.blend.colormask & PIPE_MASK_G)
             cmask |= 0x00ff0000; /* green */
-         if (spu.blend.colormask & (1<<2))
+         if (spu.blend.colormask & PIPE_MASK_B)
             cmask |= 0xff000000; /* blue */
-         if (spu.blend.colormask & (1<<3))
+         if (spu.blend.colormask & PIPE_MASK_A)
             cmask |= 0x000000ff; /* alpha */
          break;
       default:
diff --git a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
index f817abf0463..a61689c83ab 100644
--- a/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
+++ b/src/gallium/drivers/cell/spu/spu_per_fragment_op.h
@@ -38,7 +38,8 @@ spu_fallback_fragment_ops(uint x, uint y,
                           vector float fragGreen,
                           vector float fragBlue,
                           vector float fragAlpha,
-                          vector unsigned int mask);
+                          vector unsigned int mask,
+                          uint facing);
 
 
 #endif /* SPU_PER_FRAGMENT_OP */
diff --git a/src/gallium/drivers/cell/spu/spu_render.c b/src/gallium/drivers/cell/spu/spu_render.c
index 305dc988810..82dbeb26b76 100644
--- a/src/gallium/drivers/cell/spu/spu_render.c
+++ b/src/gallium/drivers/cell/spu/spu_render.c
@@ -279,7 +279,7 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
          v1 = (const float *) (vertices + indexes[j+1] * vertex_size);
          v2 = (const float *) (vertices + indexes[j+2] * vertex_size);
 
-         drawn += tri_draw(v0, v1, v2, tx, ty);
+         drawn += tri_draw(v0, v1, v2, tx, ty, render->front_winding);
       }
 
       //printf("SPU %u: drew %u of %u\n", spu.init.id, drawn, render->num_indexes/3);
@@ -297,5 +297,3 @@ cmd_render(const struct cell_command_render *render, uint *pos_incr)
       printf("SPU %u: RENDER done\n",
              spu.init.id);
 }
-
-
diff --git a/src/gallium/drivers/cell/spu/spu_texture.c b/src/gallium/drivers/cell/spu/spu_texture.c
index 117b8a36f80..42eb06a3625 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.c
+++ b/src/gallium/drivers/cell/spu/spu_texture.c
@@ -26,6 +26,8 @@
  **************************************************************************/
 
 
+#include <math.h>
+
 #include "pipe/p_compiler.h"
 #include "spu_main.h"
 #include "spu_texture.h"
@@ -40,37 +42,19 @@
 void
 invalidate_tex_cache(void)
 {
-   uint unit = 0;
-   uint bytes = 4 * spu.texture[unit].width
-      * spu.texture[unit].height;
-
-   spu_dcache_mark_dirty((unsigned) spu.texture[unit].start, bytes);
-}
+   uint lvl;  /* level sizes are counted in whole tiles, as get_four_texels() addresses them */
+   for (lvl = 0; lvl < CELL_MAX_TEXTURE_LEVELS; lvl++) {
+      uint unit = 0;  /* XXX only texture unit 0 is invalidated */
+      uint bytes = sizeof(tile_t) * spu.texture[unit].level[lvl].tiles_per_row
+         * ((spu.texture[unit].level[lvl].height + TILE_SIZE - 1) / TILE_SIZE);
 
+      if (spu.texture[unit].target == PIPE_TEXTURE_CUBE)
+         bytes *= 6;
+      else if (spu.texture[unit].target == PIPE_TEXTURE_3D)
+         bytes *= spu.texture[unit].level[lvl].depth;
 
-/**
- * XXX look into getting texels for all four pixels in a quad at once.
- */
-static uint
-get_texel(uint unit, vec_uint4 coordinate)
-{
-   /*
-    * XXX we could do the "/ TILE_SIZE" and "% TILE_SIZE" operations as
-    * SIMD since X and Y are already in a SIMD register.
-    */
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   ushort x = spu_extract(coordinate, 0);
-   ushort y = spu_extract(coordinate, 1);
-   unsigned tile_offset = sizeof(tile_t)
-      * ((y / TILE_SIZE * spu.texture[unit].tiles_per_row) + (x / TILE_SIZE));
-   ushort texel_offset = (ushort) 4
-      * (ushort) (((ushort) (y % TILE_SIZE) * (ushort) TILE_SIZE) + (x % TILE_SIZE));
-   vec_uint4 tmp;
-
-   spu_dcache_fetch_unaligned((qword *) & tmp,
-                              texture_ea + tile_offset + texel_offset,
-                              4);
-   return spu_extract(tmp, 0);
+      spu_dcache_mark_dirty((unsigned) spu.texture[unit].level[lvl].start, bytes);
+   }
 }
 
 
@@ -88,15 +72,17 @@ get_texel(uint unit, vec_uint4 coordinate)
  * a time.
  */
 static void
-get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
+get_four_texels(uint unit, uint level, uint face, vec_int4 x, vec_int4 y,
+                vec_uint4 *texels)
 {
-   const unsigned texture_ea = (uintptr_t) spu.texture[unit].start;
-   vec_uint4 tile_x = spu_rlmask(x, -5);
-   vec_uint4 tile_y = spu_rlmask(y, -5);
-   const qword offset_x = si_andi((qword) x, 0x1f);
-   const qword offset_y = si_andi((qword) y, 0x1f);
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   unsigned texture_ea = (uintptr_t) tlevel->start;
+   const vec_int4 tile_x = spu_rlmask(x, -5);  /* tile_x = x / 32 */
+   const vec_int4 tile_y = spu_rlmask(y, -5);  /* tile_y = y / 32 */
+   const qword offset_x = si_andi((qword) x, 0x1f); /* offset_x = x & 0x1f */
+   const qword offset_y = si_andi((qword) y, 0x1f); /* offset_y = y & 0x1f */
 
-   const qword tiles_per_row = (qword) spu_splats(spu.texture[unit].tiles_per_row);
+   const qword tiles_per_row = (qword) spu_splats(tlevel->tiles_per_row);
    const qword tile_size = (qword) spu_splats((unsigned) sizeof(tile_t));
 
    qword tile_offset = si_mpya((qword) tile_y, tiles_per_row, (qword) tile_x);
@@ -107,6 +93,8 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
    
    vec_uint4 offset = (vec_uint4) si_a(tile_offset, texel_offset);
    
+   texture_ea = texture_ea + face * tlevel->bytes_per_image;
+
    spu_dcache_fetch_unaligned((qword *) & texels[0],
                               texture_ea + spu_extract(offset, 0), 4);
    spu_dcache_fetch_unaligned((qword *) & texels[1],
@@ -118,83 +106,496 @@ get_four_texels(uint unit, vec_uint4 x, vec_uint4 y, vec_uint4 *texels)
 }
 
 
+/** Clamp each lane of vec to [0, max], with max given per lane */
+static INLINE vector signed int
+spu_clamp(vector signed int vec, vector signed int max)
+{
+   static const vector signed int zero = {0,0,0,0};
+   vector unsigned int c;
+   c = spu_cmpgt(vec, zero);    /* c = vec > zero ? ~0 : 0 */
+   vec = spu_sel(zero, vec, c);  /* keep vec where c is set, else 0 */
+   c = spu_cmpgt(vec, max);    /* c = vec > max ? ~0 : 0 */
+   vec = spu_sel(vec, max, c);  /* take max where c is set, else keep vec */
+   return vec;
+}
+
+
+
 /**
- * Get texture sample at texcoord.
+ * Do nearest texture sampling for four pixels (one per vector lane); r and q are unused.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
  */
-vector float
-sample_texture_nearest(uint unit, vector float texcoord)
+void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, uint level, uint face,
+                        vector float colors[4])
 {
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   vector unsigned int itc = spu_convtu(tc, 0);  /* convert to int */
-   itc = spu_and(itc, spu.texture[unit].tex_size_mask); /* mask (GL_REPEAT) */
-   uint texel = get_texel(unit, itc);
-   return spu_unpack_A8R8G8B8(texel);
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   vector float ss = spu_mul(s, tlevel->scale_s);
+   vector float tt = spu_mul(t, tlevel->scale_t);
+   vector signed int is = spu_convts(ss, 0);
+   vector signed int it = spu_convts(tt, 0);
+   vec_uint4 texels[4];
+
+   /* PIPE_TEX_WRAP_REPEAT: the mask is ~0 (AND is a no-op) for other wrap modes */
+   is = spu_and(is, tlevel->mask_s);
+   it = spu_and(it, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP: clamp to [0, max]; applied whatever the wrap mode */
+   is = spu_clamp(is, tlevel->max_s);
+   it = spu_clamp(it, tlevel->max_t);
+
+   get_four_texels(unit, level, face, is, it, texels);
+
+   /* convert four packed ARGB pixels to float RRRR,GGGG,BBBB,AAAA */
+   spu_unpack_A8R8G8B8_transpose4(texels, colors);
 }
 
 
-vector float
-sample_texture_bilinear(uint unit, vector float texcoord)
+/**
+ * Do bilinear texture sampling for four pixels (one per vector lane); r and q are unused.
+ * \param colors  returned colors in SOA format (rrrr, gggg, bbbb, aaaa).
+ */
+void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, uint level, uint face,
+                         vector float colors[4])
 {
-   static const vec_uint4 offset_x = {0, 0, 1, 1};
-   static const vec_uint4 offset_y = {0, 1, 0, 1};
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
 
-   vector float tc = spu_mul(texcoord, spu.texture[unit].tex_size);
-   tc = spu_add(tc, spu_splats(-0.5f));  /* half texel bias */
+   vector float ss = spu_madd(s, tlevel->scale_s, half);  /* s * scale - 0.5 */
+   vector float tt = spu_madd(t, tlevel->scale_t, half);  /* t * scale - 0.5 */
 
-   /* integer texcoords S,T: */
-   vec_uint4 itc = spu_convtu(tc, 0);  /* convert to int */
+   vector signed int is0 = spu_convts(ss, 0);
+   vector signed int it0 = spu_convts(tt, 0);
 
-   vec_uint4 texels[4];
-   
-   /* setup texcoords for quad:
-    *  +-----+-----+
-    *  |x0,y0|x1,y1|
-    *  +-----+-----+
-    *  |x2,y2|x3,y3|
-    *  +-----+-----+
-    */
-   vec_uint4 x = spu_splats(spu_extract(itc, 0));
-   vec_uint4 y = spu_splats(spu_extract(itc, 1));
-   x = spu_add(x, offset_x);
-   y = spu_add(y, offset_y);
+   /* neighbouring texel coords: is1 = is0 + 1, it1 = it0 + 1 */
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT: the mask is ~0 (AND is a no-op) for other wrap modes */
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
 
-   /* GL_REPEAT wrap mode: */
-   x = spu_and(x, spu.texture[unit].tex_size_x_mask);
-   y = spu_and(y, spu.texture[unit].tex_size_y_mask);
+   /* PIPE_TEX_WRAP_CLAMP: clamp to [0, max]; applied whatever the wrap mode */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
 
-   get_four_texels(unit, x, y, texels);
+   /* get packed int texels: four per corner, one for each pixel */
+   vector unsigned int texels[16];
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
 
-   /* integer A8R8G8B8 to float texel conversion */
-   vector float texel00 = spu_unpack_A8R8G8B8(spu_extract(texels[0], 0));
-   vector float texel01 = spu_unpack_A8R8G8B8(spu_extract(texels[1], 0));
-   vector float texel10 = spu_unpack_A8R8G8B8(spu_extract(texels[2], 0));
-   vector float texel11 = spu_unpack_A8R8G8B8(spu_extract(texels[3], 0));
+   /* XXX possibly rework following code to compute the weighted sample
+    * colors with integer arithmetic for fewer int->float conversions.
+    */
 
+   /* convert packed int texels to float colors; each group of 4 becomes RRRR,GGGG,BBBB,AAAA */
+   vector float ftexels[16];
+   spu_unpack_A8R8G8B8_transpose4(texels + 0, ftexels + 0);
+   spu_unpack_A8R8G8B8_transpose4(texels + 4, ftexels + 4);
+   spu_unpack_A8R8G8B8_transpose4(texels + 8, ftexels + 8);
+   spu_unpack_A8R8G8B8_transpose4(texels + 12, ftexels + 12);
 
    /* Compute weighting factors in [0,1]
     * Multiply texcoord by 1024, AND with 1023, convert back to float.
     */
-   vector float tc1024 = spu_mul(tc, spu_splats(1024.0f));
-   vector signed int itc1024 = spu_convts(tc1024, 0);
-   itc1024 = spu_and(itc1024, spu_splats((1 << 10) - 1));
-   vector float weight = spu_convtf(itc1024, 10);
-
-   /* smeared frac and 1-frac */
-   vector float sfrac = spu_splats(spu_extract(weight, 0));
-   vector float tfrac = spu_splats(spu_extract(weight, 1));
-   vector float sfrac1 = spu_sub(spu_splats(1.0f), sfrac);
-   vector float tfrac1 = spu_sub(spu_splats(1.0f), tfrac);
-
-   /* multiply the samples (colors) by the S/T weights */
-   texel00 = spu_mul(spu_mul(texel00, sfrac1), tfrac1);
-   texel10 = spu_mul(spu_mul(texel10, sfrac ), tfrac1);
-   texel01 = spu_mul(spu_mul(texel01, sfrac1), tfrac );
-   texel11 = spu_mul(spu_mul(texel11, sfrac ), tfrac );
-
-   /* compute sum of weighted samples */
-   vector float texel_sum = spu_add(texel00, texel01);
-   texel_sum = spu_add(texel_sum, texel10);
-   texel_sum = spu_add(texel_sum, texel11);
-
-   return texel_sum;
+   vector float ss1024 = spu_mul(ss, spu_splats(1024.0f));
+   vector signed int iss1024 = spu_convts(ss1024, 0);
+   iss1024 = spu_and(iss1024, 1023);
+   vector float sWeights0 = spu_convtf(iss1024, 10);
+
+   vector float tt1024 = spu_mul(tt, spu_splats(1024.0f));
+   vector signed int itt1024 = spu_convts(tt1024, 0);
+   itt1024 = spu_and(itt1024, 1023);
+   vector float tWeights0 = spu_convtf(itt1024, 10);
+
+   /* 1 - sWeight and 1 - tWeight */
+   vector float sWeights1 = spu_sub(spu_splats(1.0f), sWeights0);
+   vector float tWeights1 = spu_sub(spu_splats(1.0f), tWeights0);
+
+   /* reds, for four pixels */
+   ftexels[ 0] = spu_mul(ftexels[ 0], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 4] = spu_mul(ftexels[ 4], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 8] = spu_mul(ftexels[ 8], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[12] = spu_mul(ftexels[12], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[0] = spu_add(spu_add(ftexels[0], ftexels[4]),
+                       spu_add(ftexels[8], ftexels[12]));
+
+   /* greens, for four pixels */
+   ftexels[ 1] = spu_mul(ftexels[ 1], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 5] = spu_mul(ftexels[ 5], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[ 9] = spu_mul(ftexels[ 9], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[13] = spu_mul(ftexels[13], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[1] = spu_add(spu_add(ftexels[1], ftexels[5]),
+                       spu_add(ftexels[9], ftexels[13]));
+
+   /* blues, for four pixels */
+   ftexels[ 2] = spu_mul(ftexels[ 2], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 6] = spu_mul(ftexels[ 6], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[10] = spu_mul(ftexels[10], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[14] = spu_mul(ftexels[14], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[2] = spu_add(spu_add(ftexels[2], ftexels[6]),
+                       spu_add(ftexels[10], ftexels[14]));
+
+   /* alphas, for four pixels */
+   ftexels[ 3] = spu_mul(ftexels[ 3], spu_mul(sWeights1, tWeights1)); /*ul*/
+   ftexels[ 7] = spu_mul(ftexels[ 7], spu_mul(sWeights0, tWeights1)); /*ur*/
+   ftexels[11] = spu_mul(ftexels[11], spu_mul(sWeights1, tWeights0)); /*ll*/
+   ftexels[15] = spu_mul(ftexels[15], spu_mul(sWeights0, tWeights0)); /*lr*/
+   colors[3] = spu_add(spu_add(ftexels[3], ftexels[7]),
+                       spu_add(ftexels[11], ftexels[15]));
+}
+
+
+/**
+ * Transpose the 4x4 matrix of 32-bit words in mIn[0..3] into *mOut0..*mOut3.
+ * Adapted from /opt/cell/sdk/usr/spu/include/transpose_matrix4x4.h
+ */
+static INLINE void
+transpose(vector unsigned int *mOut0,
+          vector unsigned int *mOut1,
+          vector unsigned int *mOut2,
+          vector unsigned int *mOut3,
+          vector unsigned int *mIn)
+{
+  vector unsigned int abcd, efgh, ijkl, mnop;	/* input vectors */
+  vector unsigned int aeim, bfjn, cgko, dhlp;	/* output vectors */
+  vector unsigned int aibj, ckdl, emfn, gohp;	/* intermediate vectors */
+
+  vector unsigned char shufflehi = ((vector unsigned char) {
+					       0x00, 0x01, 0x02, 0x03,
+					       0x10, 0x11, 0x12, 0x13,
+					       0x04, 0x05, 0x06, 0x07,
+					       0x14, 0x15, 0x16, 0x17});
+  vector unsigned char shufflelo = ((vector unsigned char) {
+					       0x08, 0x09, 0x0A, 0x0B,
+					       0x18, 0x19, 0x1A, 0x1B,
+					       0x0C, 0x0D, 0x0E, 0x0F,
+					       0x1C, 0x1D, 0x1E, 0x1F});
+  abcd = *(mIn+0);
+  efgh = *(mIn+1);
+  ijkl = *(mIn+2);
+  mnop = *(mIn+3);
+
+  aibj = spu_shuffle(abcd, ijkl, shufflehi);
+  ckdl = spu_shuffle(abcd, ijkl, shufflelo);
+  emfn = spu_shuffle(efgh, mnop, shufflehi);
+  gohp = spu_shuffle(efgh, mnop, shufflelo);
+
+  aeim = spu_shuffle(aibj, emfn, shufflehi);
+  bfjn = spu_shuffle(aibj, emfn, shufflelo);
+  cgko = spu_shuffle(ckdl, gohp, shufflehi);
+  dhlp = spu_shuffle(ckdl, gohp, shufflelo);
+
+  *mOut0 = aeim;
+  *mOut1 = bfjn;
+  *mOut2 = cgko;
+  *mOut3 = dhlp;
+}
+
+
+/**
+ * Bilinear filtering, using int instead of float arithmetic
+ */
+void
+sample_texture4_bilinear_2(vector float s, vector float t,
+                           vector float r, vector float q,
+                           uint unit, uint level, uint face,
+                           vector float colors[4])
+{
+   const struct spu_texture_level *tlevel = &spu.texture[unit].level[level];
+   static const vector float half = {-0.5f, -0.5f, -0.5f, -0.5f};
+
+   /* Scale texcoords by size of texture, and add half pixel bias */
+   vector float ss = spu_madd(s, tlevel->scale_s, half);
+   vector float tt = spu_madd(t, tlevel->scale_t, half);
+
+   /* convert float coords to fixed-pt coords with 8 fraction bits */
+   vector signed int is = spu_convts(ss, 8);
+   vector signed int it = spu_convts(tt, 8);
+
+   /* compute integer texel weights in [0, 255] */
+   vector signed int sWeights0 = spu_and(is, 255);
+   vector signed int tWeights0 = spu_and(it, 255);
+   vector signed int sWeights1 = spu_sub(255, sWeights0);
+   vector signed int tWeights1 = spu_sub(255, tWeights0);
+
+   /* texel coords: is0 = is / 256, it0 = is / 256 */
+   vector signed int is0 = spu_rlmask(is, -8);
+   vector signed int it0 = spu_rlmask(it, -8);
+
+   /* texel coords: i1 = is0 + 1, it1 = it0 + 1 */
+   vector signed int is1 = spu_add(is0, 1);
+   vector signed int it1 = spu_add(it0, 1);
+
+   /* PIPE_TEX_WRAP_REPEAT */
+   is0 = spu_and(is0, tlevel->mask_s);
+   it0 = spu_and(it0, tlevel->mask_t);
+   is1 = spu_and(is1, tlevel->mask_s);
+   it1 = spu_and(it1, tlevel->mask_t);
+
+   /* PIPE_TEX_WRAP_CLAMP */
+   is0 = spu_clamp(is0, tlevel->max_s);
+   it0 = spu_clamp(it0, tlevel->max_t);
+   is1 = spu_clamp(is1, tlevel->max_s);
+   it1 = spu_clamp(it1, tlevel->max_t);
+
+   /* get packed int texels */
+   vector unsigned int texels[16];
+   get_four_texels(unit, level, face, is0, it0, texels + 0);  /* upper-left */
+   get_four_texels(unit, level, face, is1, it0, texels + 4);  /* upper-right */
+   get_four_texels(unit, level, face, is0, it1, texels + 8);  /* lower-left */
+   get_four_texels(unit, level, face, is1, it1, texels + 12); /* lower-right */
+
+   /* twiddle packed 32-bit BGRA pixels into RGBA as four unsigned ints */
+   {
+      static const unsigned char ZERO = 0x80;
+      int i;
+      for (i = 0; i < 16; i++) {
+         texels[i] = spu_shuffle(texels[i], texels[i],
+                                 ((vector unsigned char) {
+                                    ZERO, ZERO, ZERO, 1,
+                                    ZERO, ZERO, ZERO, 2,
+                                    ZERO, ZERO, ZERO, 3,
+                                    ZERO, ZERO, ZERO, 0}));
+      }
+   }
+
+   /* convert RGBA,RGBA,RGBA,RGBA to RRRR,GGGG,BBBB,AAAA */
+   vector unsigned int texel0, texel1, texel2, texel3, texel4, texel5, texel6, texel7,
+      texel8, texel9, texel10, texel11, texel12, texel13, texel14, texel15;
+   transpose(&texel0, &texel1, &texel2, &texel3, texels + 0);
+   transpose(&texel4, &texel5, &texel6, &texel7, texels + 4);
+   transpose(&texel8, &texel9, &texel10, &texel11, texels + 8);
+   transpose(&texel12, &texel13, &texel14, &texel15, texels + 12);
+
+   /* computed weighted colors */
+   vector unsigned int c0, c1, c2, c3, cSum;
+
+   /* red */
+   c0 = (vector unsigned int) si_mpyu((qword) texel0, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel4, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel8, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel12, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[0] = spu_convtf(cSum, 24);
+
+   /* green */
+   c0 = (vector unsigned int) si_mpyu((qword) texel1, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel5, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel9, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel13, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[1] = spu_convtf(cSum, 24);
+
+   /* blue */
+   c0 = (vector unsigned int) si_mpyu((qword) texel2, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel6, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel10, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel14, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[2] = spu_convtf(cSum, 24);
+
+   /* alpha */
+   c0 = (vector unsigned int) si_mpyu((qword) texel3, si_mpyu((qword) sWeights1, (qword) tWeights1)); /*ul*/
+   c1 = (vector unsigned int) si_mpyu((qword) texel7, si_mpyu((qword) sWeights0, (qword) tWeights1)); /*ur*/
+   c2 = (vector unsigned int) si_mpyu((qword) texel11, si_mpyu((qword) sWeights1, (qword) tWeights0)); /*ll*/
+   c3 = (vector unsigned int) si_mpyu((qword) texel15, si_mpyu((qword) sWeights0, (qword) tWeights0)); /*lr*/
+   cSum = spu_add(spu_add(c0, c1), spu_add(c2, c3));
+   colors[3] = spu_convtf(cSum, 24);
+}
+
+
+
+/**
+ * Compute level of detail factor (log2 of the texel-to-pixel scale) from a quad's texcoords.
+ */
+static float
+compute_lambda(uint unit, vector float s, vector float t)
+{
+   uint baseLevel = 0;
+   float width = spu.texture[unit].level[baseLevel].width;
+   float height = spu.texture[unit].level[baseLevel].height;  /* t derivatives scale by height, not width */
+   float dsdx = width * (spu_extract(s, 1) - spu_extract(s, 0));   /* element 1 - element 0: step in x */
+   float dsdy = width * (spu_extract(s, 2) - spu_extract(s, 0));   /* element 2 - element 0: step in y */
+   float dtdx = height * (spu_extract(t, 1) - spu_extract(t, 0));
+   float dtdy = height * (spu_extract(t, 2) - spu_extract(t, 0));
+   float x = dsdx * dsdx + dtdx * dtdx;
+   float y = dsdy * dsdy + dtdy * dtdy;
+   float rho = x > y ? x : y;
+   rho = sqrtf(rho);
+   float lambda = logf(rho) * 1.442695f;  /* ln(rho) / ln(2) = log2(rho) */
+   return lambda;
+}
+
+
+
+/**
+ * Texture sampling with level of detail selection; level_ignored is unused, face is passed on to the chosen sampler.
+ */
+void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, uint level_ignored, uint face,
+                    vector float colors[4])
+{
+   /*
+    * Note that we're computing a lambda/lod here that's used for all
+    * four pixels in the quad.
+    */
+   float lambda = compute_lambda(unit, s, t);
+
+   /* apply lod bias */
+   lambda += spu.sampler[unit].lod_bias;
+
+   /* clamp */
+   if (lambda < spu.sampler[unit].min_lod)
+      lambda = spu.sampler[unit].min_lod;
+   else if (lambda > spu.sampler[unit].max_lod)
+      lambda = spu.sampler[unit].max_lod;
+
+   if (lambda <= 0.0f) {
+      /* magnify: always sample the base level (0) of the requested face */
+      spu.mag_sample_texture4[unit](s, t, r, q, unit, 0, face, colors);
+   }
+   else {
+      /* minify: round lambda to the nearest mipmap level, limited to max_level */
+      int level = (int) (lambda + 0.5f);
+      if (level > (int) spu.texture[unit].max_level)
+         level = spu.texture[unit].max_level;
+      spu.min_sample_texture4[unit](s, t, r, q, unit, level, face, colors);
+      /* XXX to do: mipmap level interpolation */
+   }
+}
+
+
+/** Pick the cube face for direction (rx,ry,rz); face coords in [0,1] are returned via newS/newT.  XXX need a SIMD version of this */
+static unsigned
+choose_cube_face(float rx, float ry, float rz, float *newS, float *newT)
+{
+   /*
+      major axis
+      direction     target                             sc     tc    ma
+      ----------    -------------------------------    ---    ---   ---
+       +rx          TEXTURE_CUBE_MAP_POSITIVE_X_EXT    -rz    -ry   rx
+       -rx          TEXTURE_CUBE_MAP_NEGATIVE_X_EXT    +rz    -ry   rx
+       +ry          TEXTURE_CUBE_MAP_POSITIVE_Y_EXT    +rx    +rz   ry
+       -ry          TEXTURE_CUBE_MAP_NEGATIVE_Y_EXT    +rx    -rz   ry
+       +rz          TEXTURE_CUBE_MAP_POSITIVE_Z_EXT    +rx    -ry   rz
+       -rz          TEXTURE_CUBE_MAP_NEGATIVE_Z_EXT    -rx    -ry   rz
+   */
+   const float arx = fabsf(rx);
+   const float ary = fabsf(ry);
+   const float arz = fabsf(rz);
+   unsigned face;
+   float sc, tc, ma;
+
+   if (arx >= ary && arx >= arz) {  /* >= so ties never fall through to an axis with a smaller magnitude */
+      if (rx >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_X;
+         sc = -rz;
+         tc = -ry;
+         ma = arx;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_X;
+         sc = rz;
+         tc = -ry;
+         ma = arx;
+      }
+   }
+   else if (ary >= arx && ary >= arz) {
+      if (ry >= 0.0F) {
+         face = PIPE_TEX_FACE_POS_Y;
+         sc = rx;
+         tc = rz;
+         ma = ary;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Y;
+         sc = rx;
+         tc = -rz;
+         ma = ary;
+      }
+   }
+   else {
+      if (rz > 0.0F) {
+         face = PIPE_TEX_FACE_POS_Z;
+         sc = rx;
+         tc = -ry;
+         ma = arz;
+      }
+      else {
+         face = PIPE_TEX_FACE_NEG_Z;
+         sc = -rx;
+         tc = -ry;
+         ma = arz;
+      }
+   }
+
+   *newS = (sc / ma + 1.0F) * 0.5F;  /* map sc/ma from [-1,1] to [0,1] */
+   *newT = (tc / ma + 1.0F) * 0.5F;
+
+   return face;
+}
+
+
+/** Cube-map sampling: convert each (s,t,r) direction to a face plus 2D coords, then sample with the nearest filter. */
+void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level, uint face_ignored,
+                     vector float colors[4])
+{
+   static const vector float zero = {0.0f, 0.0f, 0.0f, 0.0f};
+   uint p, faces[4];
+   float newS[4], newT[4];
+
+   /* Compute cube face referenced by the four sets of texcoords.
+    * XXX we should SIMD-ize this.
+    */
+   for (p = 0; p < 4; p++) {      
+      float rx = spu_extract(s, p);
+      float ry = spu_extract(t, p);
+      float rz = spu_extract(r, p);
+      faces[p] = choose_cube_face(rx, ry, rz, &newS[p], &newT[p]);
+   }
+
+   if (faces[0] == faces[1] &&
+       faces[0] == faces[2] &&
+       faces[0] == faces[3]) {
+      /* GOOD!  All four texcoords refer to the same cube face: one sampler call covers the quad */
+      s = (vector float) {newS[0], newS[1], newS[2], newS[3]};
+      t = (vector float) {newT[0], newT[1], newT[2], newT[3]};
+      sample_texture4_nearest(s, t, zero, zero, unit, level, faces[0], colors);
+   }
+   else {
+      /* BAD!  The four texcoords refer to different faces: sample once per pixel and keep only element p */
+      for (p = 0; p < 4; p++) {      
+         vector float c[4];
+
+         sample_texture4_nearest(spu_splats(newS[p]), spu_splats(newT[p]),
+                                 zero, zero, unit, level, faces[p], c);
+
+         float red = spu_extract(c[0], p);
+         float green = spu_extract(c[1], p);
+         float blue = spu_extract(c[2], p);
+         float alpha = spu_extract(c[3], p);
+
+         colors[0] = spu_insert(red,   colors[0], p);
+         colors[1] = spu_insert(green, colors[1], p);
+         colors[2] = spu_insert(blue,  colors[2], p);
+         colors[3] = spu_insert(alpha, colors[3], p);
+      }
+   }
 }
diff --git a/src/gallium/drivers/cell/spu/spu_texture.h b/src/gallium/drivers/cell/spu/spu_texture.h
index f7c9738be88..387484c3ad3 100644
--- a/src/gallium/drivers/cell/spu/spu_texture.h
+++ b/src/gallium/drivers/cell/spu/spu_texture.h
@@ -36,12 +36,38 @@ extern void
 invalidate_tex_cache(void);
 
 
-extern vector float
-sample_texture_nearest(uint unit, vector float texcoord);
+extern void
+sample_texture4_nearest(vector float s, vector float t,
+                        vector float r, vector float q,
+                        uint unit, uint level, uint face,
+                        vector float colors[4]);
+
+
+extern void
+sample_texture4_bilinear(vector float s, vector float t,
+                         vector float r, vector float q,
+                         uint unit, uint level, uint face,
+                         vector float colors[4]);
+
+extern void
+sample_texture4_bilinear_2(vector float s, vector float t,
+                           vector float r, vector float q,
+                           uint unit, uint level, uint face,
+                           vector float colors[4]);
+
 
+extern void
+sample_texture4_lod(vector float s, vector float t,
+                    vector float r, vector float q,
+                    uint unit, uint level_ignored, uint face,  /* mipmap level is chosen from the texcoords */
+                    vector float colors[4]);
 
-extern vector float
-sample_texture_bilinear(uint unit, vector float texcoord);
+
+extern void
+sample_texture4_cube(vector float s, vector float t,
+                     vector float r, vector float q,
+                     uint unit, uint level, uint face_ignored,  /* face is derived from (s, t, r) */
+                     vector float colors[4]);
 
 
 #endif /* SPU_TEXTURE_H */
diff --git a/src/gallium/drivers/cell/spu/spu_tri.c b/src/gallium/drivers/cell/spu/spu_tri.c
index 0a8fb56a62c..03f094373df 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.c
+++ b/src/gallium/drivers/cell/spu/spu_tri.c
@@ -116,21 +116,15 @@ struct setup_stage {
    struct edge etop;
    struct edge emaj;
 
-   float oneoverarea;
+   float oneOverArea;
 
-   uint tx, ty;
+   uint facing;
+
+   uint tx, ty;  /**< position of current tile (x, y) */
 
    int cliprect_minx, cliprect_maxx, cliprect_miny, cliprect_maxy;
 
-#if 0
-   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#else
    struct interp_coef coef[PIPE_MAX_SHADER_INPUTS];
-#endif
-
-#if 0
-   struct quad_header quad; 
-#endif
 
    struct {
       int left[2];   /**< [0] = row0, [1] = row1 */
@@ -142,101 +136,61 @@ struct setup_stage {
 };
 
 
-
 static struct setup_stage setup;
 
 
-
-
-#if 0
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-#endif
-
-#if 0
-/**
- * Clip setup.quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip(struct setup_stage *setup)
-{
-   const struct pipe_scissor_state *cliprect = &setup.softpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-
-   if (setup.quad.x0 >= maxx ||
-       setup.quad.y0 >= maxy ||
-       setup.quad.x0 + 1 < minx ||
-       setup.quad.y0 + 1 < miny) {
-      /* totally clipped */
-      setup.quad.mask = 0x0;
-      return;
-   }
-   if (setup.quad.x0 < minx)
-      setup.quad.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (setup.quad.y0 < miny)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (setup.quad.x0 == maxx - 1)
-      setup.quad.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (setup.quad.y0 == maxy - 1)
-      setup.quad.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
-#endif
-
-#if 0
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad(struct setup_stage *setup)
-{
-   quad_clip(setup);
-   if (setup.quad.mask) {
-      struct softpipe_context *sp = setup.softpipe;
-      sp->quad.first->run(sp->quad.first, &setup.quad);
-   }
-}
-#endif
-
 /**
  * Evaluate attribute coefficients (plane equations) to compute
  * attribute values for the four fragments in a quad.
  * Eg: four colors will be computed (in AoS format).
  */
 static INLINE void
-eval_coeff(uint slot, float x, float y, vector float result[4])
+eval_coeff(uint slot, float x, float y, vector float w, vector float result[4])
 {
-   switch (spu.vertex_info.interp_mode[slot]) {
+   switch (spu.vertex_info.attrib[slot].interp_mode) {
    case INTERP_CONSTANT:
       result[QUAD_TOP_LEFT] =
       result[QUAD_TOP_RIGHT] =
       result[QUAD_BOTTOM_LEFT] =
       result[QUAD_BOTTOM_RIGHT] = setup.coef[slot].a0.v;
       break;
-
    case INTERP_LINEAR:
-      /* fall-through, for now */
-   default:
       {
-         register vector float dadx = setup.coef[slot].dadx.v;
-         register vector float dady = setup.coef[slot].dady.v;
-         register vector float topLeft
-            = spu_add(setup.coef[slot].a0.v,
-                      spu_add(spu_mul(spu_splats(x), dadx),
-                              spu_mul(spu_splats(y), dady)));
+         vector float dadx = setup.coef[slot].dadx.v;
+         vector float dady = setup.coef[slot].dady.v;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0.v,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
 
          result[QUAD_TOP_LEFT] = topLeft;
          result[QUAD_TOP_RIGHT] = spu_add(topLeft, dadx);
          result[QUAD_BOTTOM_LEFT] = spu_add(topLeft, dady);
          result[QUAD_BOTTOM_RIGHT] = spu_add(spu_add(topLeft, dadx), dady);
       }
+      break;
+   case INTERP_PERSPECTIVE:
+      {
+         vector float dadx = setup.coef[slot].dadx.v;
+         vector float dady = setup.coef[slot].dady.v;
+         vector float topLeft =
+            spu_add(setup.coef[slot].a0.v,
+                    spu_add(spu_mul(spu_splats(x), dadx),
+                            spu_mul(spu_splats(y), dady)));
+
+         vector float wInv = spu_re(w);  /* 1.0 / w; w holds one value per pixel (element 0..3 = base, +x, +y, +x+y) */
+         /* result[] is AoS (one pixel per vector), so scale each pixel by its own splatted 1/w element */
+         result[QUAD_TOP_LEFT] = spu_mul(topLeft, spu_splats(spu_extract(wInv, 0)));
+         result[QUAD_TOP_RIGHT] = spu_mul(spu_add(topLeft, dadx), spu_splats(spu_extract(wInv, 1)));
+         result[QUAD_BOTTOM_LEFT] = spu_mul(spu_add(topLeft, dady), spu_splats(spu_extract(wInv, 2)));
+         result[QUAD_BOTTOM_RIGHT] = spu_mul(spu_add(spu_add(topLeft, dadx), dady), spu_splats(spu_extract(wInv, 3)));
+      }
+      break;
+   case INTERP_POS:
+   case INTERP_NONE:
+      break;
+   default:
+      ASSERT(0);
    }
 }
 
@@ -246,14 +200,14 @@ eval_coeff(uint slot, float x, float y, vector float result[4])
  * XXX this will all be re-written someday.
  */
 static INLINE void
-eval_coeff_soa(uint slot, float x, float y, vector float result[4])
+eval_coeff_soa(uint slot, float x, float y, vector float w, vector float result[4])
 {
-   eval_coeff(slot, x, y, result);
+   eval_coeff(slot, x, y, w, result);
    _transpose_matrix4x4(result, result);
 }
 
 
-
+/** Evaluate coefficients to get Z for four pixels in a quad */
 static INLINE vector float
 eval_z(float x, float y)
 {
@@ -267,6 +221,20 @@ eval_z(float x, float y)
 }
 
 
+/** Evaluate the slot-0 plane equation (component 3) to get W for the four pixels in a quad */
+static INLINE vector float
+eval_w(float x, float y)
+{
+   const uint wSlot = 0;
+   const float stepX = setup.coef[wSlot].dadx.f[3];
+   const float stepY = setup.coef[wSlot].dady.f[3];
+   const float base = setup.coef[wSlot].a0.f[3] + x * stepX + y * stepY;
+   const vector float offsets = (vector float) { 0.0, stepX, stepY, stepX + stepY };
+   const vector float basev = spu_splats(base);
+   return spu_add(basev, offsets);
+}
+
+
 /**
  * Emit a quad (pass to next stage).  No clipping is done.
  * Note: about 1/5 to 1/7 of the time, mask is zero and this function
@@ -274,7 +242,7 @@ eval_z(float x, float y)
  * overall.
  */
 static INLINE void
-emit_quad( int x, int y, mask_t mask )
+emit_quad( int x, int y, mask_t mask)
 {
    /* If any bits in mask are set... */
    if (spu_extract(spu_orx(mask), 0)) {
@@ -284,84 +252,21 @@ emit_quad( int x, int y, mask_t mask )
       spu.cur_ctile_status = TILE_STATUS_DIRTY;
       spu.cur_ztile_status = TILE_STATUS_DIRTY;
 
-      if (spu.texture[0].start) {
-         /*
-          * Temporary texture mapping path
-          * This will go away when fragment programs support TEX inst.
-          */
-         const uint unit = 0;
-         vector float colors[4];
-         vector float texcoords[4];
-         eval_coeff(2, (float) x, (float) y, texcoords);
-
-         if (spu_extract(mask, 0))
-            colors[0] = spu.sample_texture[unit](unit, texcoords[0]);
-         if (spu_extract(mask, 1))
-            colors[1] = spu.sample_texture[unit](unit, texcoords[1]);
-         if (spu_extract(mask, 2))
-            colors[2] = spu.sample_texture[unit](unit, texcoords[2]);
-         if (spu_extract(mask, 3))
-            colors[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-
-         if (spu.texture[1].start) {
-            /* multi-texture mapping */
-            const uint unit = 1;
-            vector float colors1[4];
-
-            eval_coeff(2, (float) x, (float) y, texcoords);
-
-            if (spu_extract(mask, 0))
-               colors1[0] = spu.sample_texture[unit](unit, texcoords[0]);
-            if (spu_extract(mask, 1))
-               colors1[1] = spu.sample_texture[unit](unit, texcoords[1]);
-            if (spu_extract(mask, 2))
-               colors1[2] = spu.sample_texture[unit](unit, texcoords[2]);
-            if (spu_extract(mask, 3))
-               colors1[3] = spu.sample_texture[unit](unit, texcoords[3]);
-
-            /* hack: modulate first texture by second */
-            colors[0] = spu_mul(colors[0], colors1[0]);
-            colors[1] = spu_mul(colors[1], colors1[1]);
-            colors[2] = spu_mul(colors[2], colors1[2]);
-            colors[3] = spu_mul(colors[3], colors1[3]);
-         }
-
-         {
-            /* Convert fragment data from AoS to SoA format.
-             * I.e. (RGBA,RGBA,RGBA,RGBA) -> (RRRR,GGGG,BBBB,AAAA)
-             * This is temporary!
-             */
-            vector float soa_frag[4];
-            _transpose_matrix4x4(soa_frag, colors);
-
-            vector float fragZ = eval_z((float) x, (float) y);
-
-            /* Do all per-fragment/quad operations here, including:
-             * alpha test, z test, stencil test, blend and framebuffer writing.
-             */
-            spu.fragment_ops(ix, iy, &spu.ctile, &spu.ztile,
-                             fragZ,
-                             soa_frag[0], soa_frag[1],
-                             soa_frag[2], soa_frag[3],
-                             mask);
-         }
-
-      }
-      else {
+      {
          /*
           * Run fragment shader, execute per-fragment ops, update fb/tile.
           */
          vector float inputs[4*4], outputs[2*4];
          vector float fragZ = eval_z((float) x, (float) y);
+         vector float fragW = eval_w((float) x, (float) y);
 
          /* setup inputs */
 #if 0
-         eval_coeff_soa(1, (float) x, (float) y, inputs);
+         eval_coeff_soa(1, (float) x, (float) y, fragW, inputs);
 #else
          uint i;
          for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-            eval_coeff_soa(i+1, (float) x, (float) y, inputs + i * 4);
+            eval_coeff_soa(i+1, (float) x, (float) y, fragW, inputs + i * 4);
          }
 #endif
          ASSERT(spu.fragment_program);
@@ -379,7 +284,8 @@ emit_quad( int x, int y, mask_t mask )
                           outputs[0*4+1],
                           outputs[0*4+2],
                           outputs[0*4+3],
-                          mask);
+                          mask,
+                          setup.facing);
       }
    }
 }
@@ -389,7 +295,8 @@ emit_quad( int x, int y, mask_t mask )
  * Given an X or Y coordinate, return the block/quad coordinate that it
  * belongs to.
  */
-static INLINE int block( int x )
+static INLINE int
+block(int x)
 {
    return x & ~1;
 }
@@ -400,7 +307,8 @@ static INLINE int block( int x )
  * the triangle's bounds.
  * The mask is a uint4 vector and each element will be 0 or 0xffffffff.
  */
-static INLINE mask_t calculate_mask( int x )
+static INLINE mask_t
+calculate_mask(int x)
 {
    /* This is a little tricky.
     * Use & instead of && to avoid branches.
@@ -418,7 +326,8 @@ static INLINE mask_t calculate_mask( int x )
 /**
  * Render a horizontal span of quads
  */
-static void flush_spans( void )
+static void
+flush_spans(void)
 {
    int minleft, maxright;
    int x;
@@ -446,7 +355,6 @@ static void flush_spans( void )
       return;
    }
 
-
    /* OK, we're very likely to need the tile data now.
     * clear or finish waiting if needed.
     */
@@ -482,9 +390,7 @@ static void flush_spans( void )
     * calculate_mask() could be simplified a bit...
     */
    for (x = block(minleft); x <= block(maxright); x += 2) {
-#if 1
-      emit_quad( x, setup.span.y, calculate_mask( x ) );
-#endif
+      emit_quad( x, setup.span.y, calculate_mask( x ));
    }
 
    setup.span.y = 0;
@@ -493,8 +399,10 @@ static void flush_spans( void )
    setup.span.right[1] = 0;
 }
 
+
 #if DEBUG_VERTS
-static void print_vertex(const struct vertex_header *v)
+static void
+print_vertex(const struct vertex_header *v)
 {
    int i;
    fprintf(stderr, "Vertex: (%p)\n", v);
@@ -506,11 +414,11 @@ static void print_vertex(const struct vertex_header *v)
 #endif
 
 
-static boolean setup_sort_vertices(const struct vertex_header *v0,
-                                   const struct vertex_header *v1,
-                                   const struct vertex_header *v2)
+static boolean
+setup_sort_vertices(const struct vertex_header *v0,
+                    const struct vertex_header *v1,
+                    const struct vertex_header *v2)
 {
-
 #if DEBUG_VERTS
    fprintf(stderr, "Triangle:\n");
    print_vertex(v0);
@@ -599,13 +507,13 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
     * use the prim->det value because its sign is correct.
     */
    {
-      const float area = (setup.emaj.dx * setup.ebot.dy - 
-			    setup.ebot.dx * setup.emaj.dy);
+      const float area = (setup.emaj.dx * setup.ebot.dy -
+                          setup.ebot.dx * setup.emaj.dy);
 
-      setup.oneoverarea = 1.0f / area;
+      setup.oneOverArea = 1.0f / area;
       /*
       _mesa_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup.oneoverarea, area, prim->det );
+                   __FUNCTION__, setup.oneOverArea, area, prim->det );
       */
    }
 
@@ -628,7 +536,7 @@ static boolean setup_sort_vertices(const struct vertex_header *v0,
  * \param slot  which attribute slot 
  */
 static INLINE void
-const_coeff(uint slot)
+const_coeff4(uint slot)
 {
    setup.coef[slot].dadx.v = (vector float) {0.0, 0.0, 0.0, 0.0};
    setup.coef[slot].dady.v = (vector float) {0.0, 0.0, 0.0, 0.0};
@@ -637,58 +545,6 @@ const_coeff(uint slot)
 
 
 /**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static INLINE void
-tri_linear_coeff(uint slot, uint firstComp, uint lastComp)
-{
-   uint i;
-   const float *vmin_d = (float *) &setup.vmin->data[slot];
-   const float *vmid_d = (float *) &setup.vmid->data[slot];
-   const float *vmax_d = (float *) &setup.vmax->data[slot];
-   const float x = spu_extract(setup.vmin->data[0], 0) - 0.5f;
-   const float y = spu_extract(setup.vmin->data[0], 1) - 0.5f;
-
-   for (i = firstComp; i < lastComp; i++) {
-      float botda = vmid_d[i] - vmin_d[i];
-      float majda = vmax_d[i] - vmin_d[i];
-      float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-      float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-   
-      ASSERT(slot < PIPE_MAX_SHADER_INPUTS);
-
-      setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-      setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-
-      /* calculate a0 as the value which would be sampled for the
-       * fragment at (0,0), taking into account that we want to sample at
-       * pixel centers, in other words (0.5, 0.5).
-       *
-       * this is neat but unfortunately not a good way to do things for
-       * triangles with very large values of dadx or dady as it will
-       * result in the subtraction and re-addition from a0 of a very
-       * large number, which means we'll end up loosing a lot of the
-       * fractional bits and precision from a0.  the way to fix this is
-       * to define a0 as the sample at a pixel center somewhere near vmin
-       * instead - i'll switch to this later.
-       */
-      setup.coef[slot].a0.f[i] = (vmin_d[i] - 
-                                 (setup.coef[slot].dadx.f[i] * x + 
-                                  setup.coef[slot].dady.f[i] * y));
-   }
-
-   /*
-   _mesa_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-		slot, "xyzw"[i], 
-		setup.coef[slot].a0[i],
-		setup.coef[slot].dadx.f[i],
-		setup.coef[slot].dady.f[i]);
-   */
-}
-
-
-/**
  * As above, but interp setup all four vector components.
  */
 static INLINE void
@@ -708,8 +564,8 @@ tri_linear_coeff4(uint slot)
    vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
                             spu_mul(majda, spu_splats(setup.ebot.dx)));
 
-   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneoverarea));
-   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneoverarea));
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
 
    vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
    vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
@@ -718,8 +574,6 @@ tri_linear_coeff4(uint slot)
 }
 
 
-
-#if 0
 /**
  * Compute a0, dadx and dady for a perspective-corrected interpolant,
  * for a triangle.
@@ -728,82 +582,76 @@ tri_linear_coeff4(uint slot)
  * Later, when we compute the value at a particular fragment position we'll
  * divide the interpolated value by the interpolated W at that fragment.
  */
-static void tri_persp_coeff( unsigned slot,
-                             unsigned i )
+static void
+tri_persp_coeff4(uint slot)
 {
-   /* premultiply by 1/w:
-    */
-   float mina = setup.vmin->data[slot][i] * setup.vmin->data[0][3];
-   float mida = setup.vmid->data[slot][i] * setup.vmid->data[0][3];
-   float maxa = setup.vmax->data[slot][i] * setup.vmax->data[0][3];
-
-   float botda = mida - mina;
-   float majda = maxa - mina;
-   float a = setup.ebot.dy * majda - botda * setup.emaj.dy;
-   float b = setup.emaj.dx * botda - majda * setup.ebot.dx;
-      
-   /*
-   printf("tri persp %d,%d: %f %f %f\n", slot, i,
-          setup.vmin->data[slot][i],
-          setup.vmid->data[slot][i],
-          setup.vmax->data[slot][i]
-          );
-   */
+   const vector float xxxx = spu_splats(spu_extract(setup.vmin->data[0], 0) - 0.5f);
+   const vector float yyyy = spu_splats(spu_extract(setup.vmin->data[0], 1) - 0.5f);
+
+   const vector float vmin_w = spu_splats(spu_extract(setup.vmin->data[0], 3));
+   const vector float vmid_w = spu_splats(spu_extract(setup.vmid->data[0], 3));
+   const vector float vmax_w = spu_splats(spu_extract(setup.vmax->data[0], 3));
+
+   vector float vmin_d = setup.vmin->data[slot];
+   vector float vmid_d = setup.vmid->data[slot];
+   vector float vmax_d = setup.vmax->data[slot];
 
-   assert(slot < PIPE_MAX_SHADER_INPUTS);
-   assert(i <= 3);
+   vmin_d = spu_mul(vmin_d, vmin_w);
+   vmid_d = spu_mul(vmid_d, vmid_w);
+   vmax_d = spu_mul(vmax_d, vmax_w);
 
-   setup.coef[slot].dadx.f[i] = a * setup.oneoverarea;
-   setup.coef[slot].dady.f[i] = b * setup.oneoverarea;
-   setup.coef[slot].a0.f[i] = (mina - 
-			    (setup.coef[slot].dadx.f[i] * (setup.vmin->data[0][0] - 0.5f) + 
-			     setup.coef[slot].dady.f[i] * (setup.vmin->data[0][1] - 0.5f)));
+   vector float botda = vmid_d - vmin_d;
+   vector float majda = vmax_d - vmin_d;
+
+   vector float a = spu_sub(spu_mul(spu_splats(setup.ebot.dy), majda),
+                            spu_mul(botda, spu_splats(setup.emaj.dy)));
+   vector float b = spu_sub(spu_mul(spu_splats(setup.emaj.dx), botda),
+                            spu_mul(majda, spu_splats(setup.ebot.dx)));
+
+   setup.coef[slot].dadx.v = spu_mul(a, spu_splats(setup.oneOverArea));
+   setup.coef[slot].dady.v = spu_mul(b, spu_splats(setup.oneOverArea));
+
+   vector float tempx = spu_mul(setup.coef[slot].dadx.v, xxxx);
+   vector float tempy = spu_mul(setup.coef[slot].dady.v, yyyy);
+                         
+   setup.coef[slot].a0.v = spu_sub(vmin_d, spu_add(tempx, tempy));
 }
-#endif
+
 
 
 /**
  * Compute the setup.coef[] array dadx, dady, a0 values.
  * Must be called after setup.vmin,vmid,vmax,vprovoke are initialized.
  */
-static void setup_tri_coefficients(void)
+static void
+setup_tri_coefficients(void)
 {
-#if 1
    uint i;
 
    for (i = 0; i < spu.vertex_info.num_attribs; i++) {
-      switch (spu.vertex_info.interp_mode[i]) {
+      switch (spu.vertex_info.attrib[i].interp_mode) {
       case INTERP_NONE:
          break;
-      case INTERP_POS:
-         /*tri_linear_coeff(i, 2, 3);*/
-         /* XXX interp W if PERSPECTIVE... */
-         tri_linear_coeff4(i);
-         break;
       case INTERP_CONSTANT:
-         const_coeff(i);
+         const_coeff4(i);
          break;
+      case INTERP_POS:
+         /* fall-through */
       case INTERP_LINEAR:
          tri_linear_coeff4(i);
          break;
       case INTERP_PERSPECTIVE:
-         tri_linear_coeff4(i);  /* temporary */
+         tri_persp_coeff4(i);
          break;
       default:
          ASSERT(0);
       }
    }
-#else
-   ASSERT(spu.vertex_info.interp_mode[0] == INTERP_POS);
-   ASSERT(spu.vertex_info.interp_mode[1] == INTERP_LINEAR ||
-          spu.vertex_info.interp_mode[1] == INTERP_CONSTANT);
-   tri_linear_coeff(0, 2, 3);  /* slot 0, z */
-   tri_linear_coeff(1, 0, 4);  /* slot 1, color */
-#endif
 }
 
 
-static void setup_tri_edges(void)
+static void
+setup_tri_edges(void)
 {
    float vmin_x = spu_extract(setup.vmin->data[0], 0) + 0.5f;
    float vmid_x = spu_extract(setup.vmid->data[0], 0) + 0.5f;
@@ -833,9 +681,8 @@ static void setup_tri_edges(void)
  * Render the upper or lower half of a triangle.
  * Scissoring/cliprect is applied here too.
  */
-static void subtriangle( struct edge *eleft,
-			 struct edge *eright,
-			 unsigned lines )
+static void
+subtriangle(struct edge *eleft, struct edge *eright, unsigned lines)
 {
    const int minx = setup.cliprect_minx;
    const int maxx = setup.cliprect_maxx;
@@ -903,12 +750,27 @@ static void subtriangle( struct edge *eleft,
 }
 
 
+static float
+determinant(const float *v0, const float *v1, const float *v2)
+{
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0] - v2[0];
+   const float ey = v0[1] - v2[1];
+   const float fx = v1[0] - v2[0];
+   const float fy = v1[1] - v2[1];
+
+   /* det = cross(e,f).z */
+   return ex * fy - ey * fx;
+}
+
+
 /**
  * Draw triangle into tile at (tx, ty) (tile coords)
  * The tile data should have already been fetched.
  */
 boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
+tri_draw(const float *v0, const float *v1, const float *v2,
+         uint tx, uint ty, uint front_winding)
 {
    setup.tx = tx;
    setup.ty = ty;
@@ -919,6 +781,12 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.cliprect_maxx = (tx + 1) * TILE_SIZE;
    setup.cliprect_maxy = (ty + 1) * TILE_SIZE;
 
+   /* Before we sort vertices, determine the facing of the triangle,
+    * which will be needed for front/back-face stencil application
+    */
+   float det = determinant(v0, v1, v2);
+   setup.facing = (det > 0.0) ^ (front_winding == PIPE_WINDING_CW);
+
    if (!setup_sort_vertices((struct vertex_header *) v0,
                             (struct vertex_header *) v1,
                             (struct vertex_header *) v2)) {
@@ -932,19 +800,14 @@ tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty)
    setup.span.y_flags = 0;
    setup.span.right[0] = 0;
    setup.span.right[1] = 0;
-   /*   setup.span.z_mode = tri_z_mode( setup.ctx ); */
 
-   /*   init_constant_attribs( setup ); */
-      
-   if (setup.oneoverarea < 0.0) {
-      /* emaj on left:
-       */
+   if (setup.oneOverArea < 0.0) {
+      /* emaj on left */
       subtriangle( &setup.emaj, &setup.ebot, setup.ebot.lines );
       subtriangle( &setup.emaj, &setup.etop, setup.etop.lines );
    }
    else {
-      /* emaj on right:
-       */
+      /* emaj on right */
       subtriangle( &setup.ebot, &setup.emaj, setup.ebot.lines );
       subtriangle( &setup.etop, &setup.emaj, setup.etop.lines );
    }
diff --git a/src/gallium/drivers/cell/spu/spu_tri.h b/src/gallium/drivers/cell/spu/spu_tri.h
index aa694dd7c93..abc3d35160e 100644
--- a/src/gallium/drivers/cell/spu/spu_tri.h
+++ b/src/gallium/drivers/cell/spu/spu_tri.h
@@ -31,7 +31,7 @@
 
 
 extern boolean
-tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty);
+tri_draw(const float *v0, const float *v1, const float *v2, uint tx, uint ty, uint front_winding);
 
 
 #endif /* SPU_TRI_H */
diff --git a/src/gallium/drivers/i915simple/i915_prim_emit.c b/src/gallium/drivers/i915simple/i915_prim_emit.c
index d194c2fb158..8f1f58b2dd1 100644
--- a/src/gallium/drivers/i915simple/i915_prim_emit.c
+++ b/src/gallium/drivers/i915simple/i915_prim_emit.c
@@ -77,9 +77,9 @@ emit_hw_vertex( struct i915_context *i915,
    assert(!i915->dirty);
 
    for (i = 0; i < vinfo->num_attribs; i++) {
-      const uint j = vinfo->src_index[i];
+      const uint j = vinfo->attrib[i].src_index;
       const float *attrib = vertex->data[j];
-      switch (vinfo->emit[i]) {
+      switch (vinfo->attrib[i].emit) {
       case EMIT_1F:
          OUT_BATCH( fui(attrib[0]) );
          count++;
diff --git a/src/gallium/drivers/i915simple/i915_state_derived.c b/src/gallium/drivers/i915simple/i915_state_derived.c
index 488615067c5..178d4e8781d 100644
--- a/src/gallium/drivers/i915simple/i915_state_derived.c
+++ b/src/gallium/drivers/i915simple/i915_state_derived.c
@@ -88,12 +88,12 @@ static void calculate_vertex_layout( struct i915_context *i915 )
    if (needW) {
       draw_emit_vertex_attr(&vinfo, EMIT_4F, INTERP_LINEAR, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZW;
-      vinfo.emit[0] = EMIT_4F;
+      vinfo.attrib[0].emit = EMIT_4F;
    }
    else {
       draw_emit_vertex_attr(&vinfo, EMIT_3F, INTERP_LINEAR, src);
       vinfo.hwfmt[0] |= S4_VFMT_XYZ;
-      vinfo.emit[0] = EMIT_3F;
+      vinfo.attrib[0].emit = EMIT_3F;
    }
 
    /* hardware point size */
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
index 496ed43df26..0111469405f 100644
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ b/src/gallium/drivers/softpipe/sp_fs_sse.c
@@ -40,7 +40,7 @@
 #include "tgsi/tgsi_sse2.h"
 
 
-#ifdef PIPE_ARCH_X86
+#if defined(PIPE_ARCH_X86) && defined(PIPE_ARCH_SSE)
 
 #include "rtasm/rtasm_x86sse.h"
 
diff --git a/src/gallium/drivers/softpipe/sp_setup.c b/src/gallium/drivers/softpipe/sp_setup.c
index bc8263c33e3..13d80173937 100644
--- a/src/gallium/drivers/softpipe/sp_setup.c
+++ b/src/gallium/drivers/softpipe/sp_setup.c
@@ -773,10 +773,10 @@ static void setup_tri_coefficients( struct setup_context *setup )
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          for (j = 0; j < NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -1084,10 +1084,10 @@ setup_line_coefficients(struct setup_context *setup,
    /* setup interpolation for all the remaining attributes:
     */
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          for (j = 0; j < NUM_CHANNELS; j++)
             const_coeff(setup, &setup->coef[fragSlot], vertSlot, j);
@@ -1331,10 +1331,10 @@ setup_point( struct setup_context *setup,
    const_coeff(setup, &setup->posCoef, 0, 3);
 
    for (fragSlot = 0; fragSlot < spfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->src_index[fragSlot];
+      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
       uint j;
 
-      switch (vinfo->interp_mode[fragSlot]) {
+      switch (vinfo->attrib[fragSlot].interp_mode) {
       case INTERP_CONSTANT:
          /* fall-through */
       case INTERP_LINEAR:
diff --git a/src/gallium/include/pipe/p_config.h b/src/gallium/include/pipe/p_config.h
index af3746c0265..ef05547819d 100644
--- a/src/gallium/include/pipe/p_config.h
+++ b/src/gallium/include/pipe/p_config.h
@@ -85,6 +85,14 @@
 #define PIPE_ARCH_X86_64
 #endif
 
+#if defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)
+#if defined(PIPE_CC_GCC) && !defined(__SSE2__)
+/* #warning SSE2 support requires -msse -msse2 compiler options */
+#else
+#define PIPE_ARCH_SSE
+#endif
+#endif
+
 #if 0 /* FIXME */
 #define PIPE_ARCH_PPC
 #endif
diff --git a/src/mesa/drivers/common/descrip.mms b/src/mesa/drivers/common/descrip.mms
index 99a2ae6c37d..d5bbc69dfd5 100644
--- a/src/mesa/drivers/common/descrip.mms
+++ b/src/mesa/drivers/common/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  [email protected]
-# Last revision : 3 October 2007
+# Last revision : 29 September 2008
 
 .first
 	define gl [----.include.gl]
@@ -19,7 +19,8 @@ VPATH = RCS
 
 INCDIR = [----.include],[--.main],[--.glapi],[--.shader]
 LIBDIR = [----.lib]
-CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)/float=ieee/ieee=denorm
+CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)\
+	/float=ieee/ieee=denorm/warn=disable=(PTRMISMATCH)
 
 SOURCES = driverfuncs.c
 
diff --git a/src/mesa/drivers/dri/common/dri_util.c b/src/mesa/drivers/dri/common/dri_util.c
index ceedd914fb3..a16cb504c73 100644
--- a/src/mesa/drivers/dri/common/dri_util.c
+++ b/src/mesa/drivers/dri/common/dri_util.c
@@ -315,9 +315,6 @@ static void driSwapBuffers(__DRIdrawable *dPriv)
 {
     __DRIscreen *psp = dPriv->driScreenPriv;
 
-    if (!dPriv->numClipRects)
-        return;
-
     psp->DriverAPI.SwapBuffers(dPriv);
 
     driReportDamage(dPriv, dPriv->pClipRects, dPriv->numClipRects);
diff --git a/src/mesa/drivers/dri/i915/i830_vtbl.c b/src/mesa/drivers/dri/i915/i830_vtbl.c
index 0ab27704d52..773a8b4dd01 100644
--- a/src/mesa/drivers/dri/i915/i830_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i830_vtbl.c
@@ -566,6 +566,13 @@ i830_destroy_context(struct intel_context *intel)
    GLuint i;
    struct i830_context *i830 = i830_context(&intel->ctx);
 
+   intel_region_release(&i830->state.draw_region);
+   intel_region_release(&i830->state.depth_region);
+   intel_region_release(&i830->meta.draw_region);
+   intel_region_release(&i830->meta.depth_region);
+   intel_region_release(&i830->initial.draw_region);
+   intel_region_release(&i830->initial.depth_region);
+
    for (i = 0; i < I830_TEX_UNITS; i++) {
       if (i830->state.tex_buffer[i] != NULL) {
 	 dri_bo_unreference(i830->state.tex_buffer[i]);
diff --git a/src/mesa/drivers/dri/i915/i915_context.c b/src/mesa/drivers/dri/i915/i915_context.c
index efcac911aa6..e0ddc7fd613 100644
--- a/src/mesa/drivers/dri/i915/i915_context.c
+++ b/src/mesa/drivers/dri/i915/i915_context.c
@@ -56,8 +56,6 @@ static const struct dri_extension i915_extensions[] = {
    {"GL_ARB_shadow", NULL},
    {"GL_ARB_texture_non_power_of_two", NULL},
    {"GL_EXT_shadow_funcs", NULL},
-   /* ARB extn won't work if not enabled */
-   {"GL_SGIX_depth_texture", NULL},
    {NULL, NULL}
 };
 
diff --git a/src/mesa/drivers/dri/i915/i915_context.h b/src/mesa/drivers/dri/i915/i915_context.h
index c6958dd8d4b..a2376e50e15 100644
--- a/src/mesa/drivers/dri/i915/i915_context.h
+++ b/src/mesa/drivers/dri/i915/i915_context.h
@@ -125,6 +125,9 @@ struct i915_fragment_program
    GLboolean on_hardware;
    GLboolean error;             /* If program is malformed for any reason. */
 
+   /** Record of which phases R registers were last written in. */
+   GLuint register_phases[16];
+   GLuint indirections;
    GLuint nr_tex_indirect;
    GLuint nr_tex_insn;
    GLuint nr_alu_insn;
diff --git a/src/mesa/drivers/dri/i915/i915_program.c b/src/mesa/drivers/dri/i915/i915_program.c
index 49193297a8c..350da5e1697 100644
--- a/src/mesa/drivers/dri/i915/i915_program.c
+++ b/src/mesa/drivers/dri/i915/i915_program.c
@@ -190,6 +190,9 @@ i915_emit_arith(struct i915_fragment_program * p,
    *(p->csr++) = (A1_SRC0(src0) | A1_SRC1(src1));
    *(p->csr++) = (A2_SRC1(src1) | A2_SRC2(src2));
 
+   if (GET_UREG_TYPE(dest) == REG_TYPE_R)
+      p->register_phases[GET_UREG_NR(dest)] = p->nr_tex_indirect;
+
    p->nr_alu_insn++;
    return dest;
 }
@@ -237,10 +240,22 @@ GLuint i915_emit_texld( struct i915_fragment_program *p,
    else {
       assert(GET_UREG_TYPE(dest) != REG_TYPE_CONST);
       assert(dest = UREG(GET_UREG_TYPE(dest), GET_UREG_NR(dest)));
+      /* Can't use unsaved temps for coords, as the phase boundary would result
+       * in the contents becoming undefined.
+       */
+      assert(GET_UREG_TYPE(coord) != REG_TYPE_U);
+
+      /* Output register being oC or oD defines a phase boundary */
+      if (GET_UREG_TYPE(dest) == REG_TYPE_OC ||
+	  GET_UREG_TYPE(dest) == REG_TYPE_OD)
+	 p->nr_tex_indirect++;
 
-      if (GET_UREG_TYPE(coord) != REG_TYPE_T) {
+      /* Reading from an r# register whose contents depend on output of the
+       * current phase defines a phase boundary.
+       */
+      if (GET_UREG_TYPE(coord) == REG_TYPE_R &&
+	  p->register_phases[GET_UREG_NR(coord)] == p->nr_tex_indirect)
 	 p->nr_tex_indirect++;
-      }
 
       *(p->csr++) = (op | 
 		     T0_DEST( dest ) |
@@ -249,6 +264,9 @@ GLuint i915_emit_texld( struct i915_fragment_program *p,
       *(p->csr++) = T1_ADDRESS_REG( coord );
       *(p->csr++) = T2_MBZ;
 
+      if (GET_UREG_TYPE(dest) == REG_TYPE_R)
+	 p->register_phases[GET_UREG_NR(dest)] = p->nr_tex_indirect;
+
       p->nr_tex_insn++;
       return dest;
    }
@@ -413,7 +431,8 @@ i915_init_program(struct i915_context *i915, struct i915_fragment_program *p)
    p->on_hardware = 0;
    p->error = 0;
 
-   p->nr_tex_indirect = 1;      /* correct? */
+   memset(&p->register_phases, 0, sizeof(p->register_phases));
+   p->nr_tex_indirect = 1;
    p->nr_tex_insn = 0;
    p->nr_alu_insn = 0;
    p->nr_decl_insn = 0;
diff --git a/src/mesa/drivers/dri/i915/i915_texstate.c b/src/mesa/drivers/dri/i915/i915_texstate.c
index ae42b102db8..d1b0dcdf319 100644
--- a/src/mesa/drivers/dri/i915/i915_texstate.c
+++ b/src/mesa/drivers/dri/i915/i915_texstate.c
@@ -307,10 +307,21 @@ i915_update_tex_unit(struct intel_context *intel, GLuint unit, GLuint ss3)
    }
 
 
-   state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
-                                                tObj->_BorderChan[1],
-                                                tObj->_BorderChan[2],
-                                                tObj->_BorderChan[3]);
+   if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
+      /* GL specifies that the border color for depth textures is taken from
+       * the R channel, while the hardware uses A.  Replicate R into all the
+       * channels for safety.
+       */
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
+						   tObj->_BorderChan[0],
+						   tObj->_BorderChan[0],
+						   tObj->_BorderChan[0]);
+   } else {
+      state[I915_TEXREG_SS4] = INTEL_PACKCOLOR8888(tObj->_BorderChan[0],
+						   tObj->_BorderChan[1],
+						   tObj->_BorderChan[2],
+						   tObj->_BorderChan[3]);
+   }
 
 
    I915_ACTIVESTATE(i915, I915_UPLOAD_TEX(unit), GL_TRUE);
diff --git a/src/mesa/drivers/dri/i915/i915_vtbl.c b/src/mesa/drivers/dri/i915/i915_vtbl.c
index edbbe23e094..7431a9cf76d 100644
--- a/src/mesa/drivers/dri/i915/i915_vtbl.c
+++ b/src/mesa/drivers/dri/i915/i915_vtbl.c
@@ -490,6 +490,13 @@ i915_destroy_context(struct intel_context *intel)
    GLuint i;
    struct i915_context *i915 = i915_context(&intel->ctx);
 
+   intel_region_release(&i915->state.draw_region);
+   intel_region_release(&i915->state.depth_region);
+   intel_region_release(&i915->meta.draw_region);
+   intel_region_release(&i915->meta.depth_region);
+   intel_region_release(&i915->initial.draw_region);
+   intel_region_release(&i915->initial.depth_region);
+
    for (i = 0; i < I915_TEX_UNITS; i++) {
       if (i915->state.tex_buffer[i] != NULL) {
 	 dri_bo_unreference(i915->state.tex_buffer[i]);
diff --git a/src/mesa/drivers/dri/i965/Makefile b/src/mesa/drivers/dri/i965/Makefile
index c2d555cd0cf..005460f3547 100644
--- a/src/mesa/drivers/dri/i965/Makefile
+++ b/src/mesa/drivers/dri/i965/Makefile
@@ -18,8 +18,9 @@ DRIVER_SOURCES = \
 	intel_screen.c \
 	intel_span.c \
 	intel_pixel.c \
-	intel_pixel_copy.c \
 	intel_pixel_bitmap.c \
+	intel_pixel_copy.c \
+	intel_pixel_draw.c \
 	intel_state.c \
 	intel_tex.c \
 	intel_tex_copy.c \
@@ -51,6 +52,7 @@ DRIVER_SOURCES = \
 	brw_metaops.c \
 	brw_misc_state.c \
 	brw_program.c \
+	brw_queryobj.c \
 	brw_sf.c \
 	brw_sf_emit.c \
 	brw_sf_state.c \
diff --git a/src/mesa/drivers/dri/i965/brw_context.c b/src/mesa/drivers/dri/i965/brw_context.c
index 92629016d98..474158b484b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.c
+++ b/src/mesa/drivers/dri/i965/brw_context.c
@@ -33,6 +33,7 @@
 #include "main/imports.h"
 #include "main/api_noop.h"
 #include "main/vtxfmt.h"
+#include "main/simple_list.h"
 #include "shader/shader_api.h"
 
 #include "brw_context.h"
@@ -68,6 +69,7 @@ static void brwInitDriverFunctions( struct dd_function_table *functions )
 
    brwInitFragProgFuncs( functions );
    brwInitProgFuncs( functions );
+   brw_init_queryobj_functions(functions);
 }
 
 
@@ -150,9 +152,9 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
    ctx->VertexProgram._MaintainTnlProgram = GL_TRUE;
    ctx->FragmentProgram._MaintainTexEnvProgram = GL_TRUE;
 
-   brw_draw_init( brw );
+   make_empty_list(&brw->query.active_head);
 
-   brw_ProgramCacheInit( ctx );
+   brw_draw_init( brw );
 
    return GL_TRUE;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index b04487ecee9..1c6a0dede0b 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -130,7 +130,6 @@ struct brw_context;
 #define BRW_NEW_CONTEXT                 0x80
 #define BRW_NEW_WM_INPUT_DIMENSIONS     0x100
 #define BRW_NEW_INPUT_VARYING           0x200
-#define BRW_NEW_TNL_PROGRAM             0x400
 #define BRW_NEW_PSP                     0x800
 #define BRW_NEW_METAOPS                 0x1000
 #define BRW_NEW_FENCE                   0x2000
@@ -411,7 +410,22 @@ struct brw_tnl_cache {
    GLuint size, n_items;
 };
 
+struct brw_query_object {
+   struct gl_query_object Base;
 
+   /** Doubly linked list of active query objects in the context. */
+   struct brw_query_object *prev, *next;
+
+   /** Last query BO associated with this query. */
+   dri_bo *bo;
+   /** First index in bo with query data for this object. */
+   int first_index;
+   /** Last index in bo with query data for this object. */
+   int last_index;
+
+   /* Total count of pixels from previous BOs */
+   unsigned int count;
+};
 
 struct brw_context 
 {
@@ -488,10 +502,6 @@ struct brw_context
       GLboolean active;
    } metaops;
 
-   /* Track fixed function t&l in a vertex program:
-    */
-   struct gl_vertex_program *tnl_program;
-   struct brw_tnl_cache tnl_program_cache;
 
    /* Active vertex program: 
     */
@@ -631,7 +641,12 @@ struct brw_context
       dri_bo *vp_bo;
    } cc;
 
-   
+   struct {
+      struct brw_query_object active_head;
+      dri_bo *bo;
+      int index;
+      GLboolean active;
+   } query;
    /* Used to give every program string a unique id
     */
    GLuint program_id;
@@ -656,7 +671,13 @@ GLboolean brwCreateContext( const __GLcontextModes *mesaVis,
 			    __DRIcontextPrivate *driContextPriv,
 			    void *sharedContextPrivate);
 
-
+/*======================================================================
+ * brw_queryobj.c
+ */
+void brw_init_queryobj_functions(struct dd_function_table *functions);
+void brw_prepare_query_begin(struct brw_context *brw);
+void brw_emit_query_begin(struct brw_context *brw);
+void brw_emit_query_end(struct brw_context *brw);
 
 /*======================================================================
  * brw_state.c
diff --git a/src/mesa/drivers/dri/i965/brw_draw.c b/src/mesa/drivers/dri/i965/brw_draw.c
index 9a353fc7b66..6c71b4abcf0 100644
--- a/src/mesa/drivers/dri/i965/brw_draw.c
+++ b/src/mesa/drivers/dri/i965/brw_draw.c
@@ -382,7 +382,6 @@ void brw_draw_prims( GLcontext *ctx,
       return;
    }
 
-
    /* Make a first attempt at drawing:
     */
    retval = brw_try_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
@@ -395,6 +394,7 @@ void brw_draw_prims( GLcontext *ctx,
        _swsetup_Wakeup(ctx);
       _tnl_draw_prims(ctx, arrays, prim, nr_prims, ib, min_index, max_index);
    }
+
 }
 
 void brw_draw_init( struct brw_context *brw )
@@ -409,8 +409,18 @@ void brw_draw_init( struct brw_context *brw )
 
 void brw_draw_destroy( struct brw_context *brw )
 {
+   int i;
+
    if (brw->vb.upload.bo != NULL) {
       dri_bo_unreference(brw->vb.upload.bo);
       brw->vb.upload.bo = NULL;
    }
+
+   for (i = 0; i < VERT_ATTRIB_MAX; i++) {
+      dri_bo_unreference(brw->vb.inputs[i].bo);
+      brw->vb.inputs[i].bo = NULL;
+   }
+
+   dri_bo_unreference(brw->ib.bo);
+   brw->ib.bo = NULL;
 }
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 303eaac5cf9..7b88b5eaa1e 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -365,8 +365,10 @@ static void brw_prepare_vertices(struct brw_context *brw)
 	 if (i == 0) {
 	    /* Position array not properly enabled:
 	     */
-	    if (input->glarray->StrideB == 0)
-	      return;
+            if (input->glarray->StrideB == 0) {
+               intel->Fallback = 1;
+               return;
+            }
 
 	    interleave = input->glarray->StrideB;
 	    ptr = input->glarray->Ptr;
@@ -413,6 +415,8 @@ static void brw_prepare_vertices(struct brw_context *brw)
           copy_array_to_vbo_array(brw, upload[i], upload[i]->element_size);
       }
    }
+
+   brw_prepare_query_begin(brw);
 }
 
 static void brw_emit_vertices(struct brw_context *brw)
@@ -433,6 +437,7 @@ static void brw_emit_vertices(struct brw_context *brw)
       enabled[nr_enabled++] = input;
    }
 
+   brw_emit_query_begin(brw);
 
    /* Now emit VB and VEP state packets.
     *
diff --git a/src/mesa/drivers/dri/i965/brw_fallback.c b/src/mesa/drivers/dri/i965/brw_fallback.c
index 2f6b7febbde..4ea660a51a3 100644
--- a/src/mesa/drivers/dri/i965/brw_fallback.c
+++ b/src/mesa/drivers/dri/i965/brw_fallback.c
@@ -74,10 +74,7 @@ static GLboolean do_check_fallback(struct brw_context *brw)
       if (texUnit->_ReallyEnabled) {
 	 struct intel_texture_object *intelObj = intel_texture_object(texUnit->_Current);
 	 struct gl_texture_image *texImage = intelObj->base.Image[0][intelObj->firstLevel];
-	 if (texImage->Border ||
-         ((texImage->_BaseFormat == GL_DEPTH_COMPONENT) &&
-          ((texImage->TexObject->WrapS == GL_CLAMP_TO_BORDER) || 
-           (texImage->TexObject->WrapT == GL_CLAMP_TO_BORDER)))) {
+	 if (texImage->Border) {
 	    DBG("FALLBACK: texture border\n");
 	    return GL_TRUE;
 	 }
diff --git a/src/mesa/drivers/dri/i965/brw_queryobj.c b/src/mesa/drivers/dri/i965/brw_queryobj.c
new file mode 100644
index 00000000000..a1a1353dee7
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/brw_queryobj.c
@@ -0,0 +1,263 @@
+/*
+ * Copyright © 2008 Intel Corporation
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Eric Anholt <[email protected]>
+ *
+ */
+
+/** @file support for ARB_occlusion_query
+ *
+ * ARB_occlusion_query is implemented by using the PIPE_CONTROL command to
+ * stall execution on the completion of previous depth tests, and write the
+ * current PS_DEPTH_COUNT to a buffer object.
+ *
+ * We use before and after counts when drawing during a query so that
+ * we don't pick up other clients' query data in ours.  To reduce overhead,
+ * a single BO is used to record the query data for all active queries at
+ * once.  This also gives us a simple bound on how much batchbuffer space is
+ * required for handling queries, so that we can be sure that we won't
+ * have to emit a batchbuffer without getting the ending PS_DEPTH_COUNT.
+ */
+#include "main/simple_list.h"
+#include "main/imports.h"
+
+#include "brw_context.h"
+#include "intel_batchbuffer.h"
+#include "intel_reg.h"
+
+/** Waits on the query object's BO and totals the results for this query */
+static void
+brw_queryobj_get_results(struct brw_query_object *query)
+{
+   int i;
+   uint64_t *results;
+
+   if (query->bo == NULL)
+      return;
+
+   /* Map and count the pixels from the current query BO */
+   dri_bo_map(query->bo, GL_FALSE);
+   results = query->bo->virtual;
+   for (i = query->first_index; i <= query->last_index; i++) {
+      query->Base.Result += results[i * 2 + 1] - results[i * 2];
+   }
+   dri_bo_unmap(query->bo);
+
+   dri_bo_unreference(query->bo);
+   query->bo = NULL;
+}
+
+static struct gl_query_object *
+brw_new_query_object(GLcontext *ctx, GLuint id)
+{
+   struct brw_query_object *query;
+
+   query = _mesa_calloc(sizeof(struct brw_query_object));
+
+   query->Base.Id = id;
+   query->Base.Result = 0;
+   query->Base.Active = GL_FALSE;
+   query->Base.Ready = GL_TRUE;
+
+   return &query->Base;
+}
+
+static void
+brw_delete_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   dri_bo_unreference(query->bo);
+   _mesa_free(query);
+}
+
+static void
+brw_begin_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   /* Reset our driver's tracking of query state. */
+   dri_bo_unreference(query->bo);
+   query->bo = NULL;
+   query->first_index = -1;
+   query->last_index = -1;
+
+   insert_at_head(&brw->query.active_head, query);
+   intel->stats_wm++;
+}
+
+/**
+ * End the ARB_occlusion_query query on a query object.
+ */
+static void
+brw_end_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_context *brw = brw_context(ctx);
+   struct intel_context *intel = intel_context(ctx);
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   /* Flush the batchbuffer in case it has writes to our query BO.
+    * Have later queries write to a new query BO so that further rendering
+    * doesn't delay the collection of our results.
+    */
+   if (query->bo) {
+      brw_emit_query_end(brw);
+      intel_batchbuffer_flush(intel->batch);
+
+      dri_bo_unreference(brw->query.bo);
+      brw->query.bo = NULL;
+   }
+
+   remove_from_list(query);
+
+   intel->stats_wm--;
+}
+
+static void brw_wait_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   brw_queryobj_get_results(query);
+   query->Base.Ready = GL_TRUE;
+}
+
+static void brw_check_query(GLcontext *ctx, struct gl_query_object *q)
+{
+   /* XXX: Need to expose dri_bo_is_idle from bufmgr. */
+#if 0
+   struct brw_query_object *query = (struct brw_query_object *)q;
+
+   if (dri_bo_is_idle(query->bo)) {
+      brw_queryobj_get_results(query);
+      query->Base.Ready = GL_TRUE;
+   }
+#else
+   brw_wait_query(ctx, q);
+#endif
+}
+
+/** Called to set up the query BO and account for its aperture space */
+void
+brw_prepare_query_begin(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   dri_bo *aper_array[] = {
+      intel->batch->buf,
+      brw->query.bo,
+   };
+
+   /* Skip if we're not doing any queries. */
+   if (is_empty_list(&brw->query.active_head))
+      return;
+
+   /* Get a new query BO if we're going to need it. */
+   if (brw->query.bo == NULL ||
+       brw->query.index * 2 + 1 >= 4096 / sizeof(uint64_t)) {
+      dri_bo_unreference(brw->query.bo);
+      brw->query.bo = NULL;
+
+      brw->query.bo = dri_bo_alloc(intel->bufmgr, "query", 4096, 1);
+      brw->query.index = 0;
+   }
+
+   if (dri_bufmgr_check_aperture_space(aper_array, ARRAY_SIZE(aper_array)))
+      intel_batchbuffer_flush(intel->batch);
+}
+
+/** Called just before primitive drawing to get a beginning PS_DEPTH_COUNT. */
+void
+brw_emit_query_begin(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+   struct brw_query_object *query;
+
+   /* Skip if we're not doing any queries, or we've emitted the start. */
+   if (brw->query.active || is_empty_list(&brw->query.active_head))
+      return;
+
+   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+	     PIPE_CONTROL_DEPTH_STALL |
+	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   /* This object could be mapped cacheable, but we don't have an exposed
+    * mechanism to support that.  Since it's going uncached, tell GEM that
+    * we're writing to it.  The usual clflush should be all that's required
+    * to pick up the results.
+    */
+   OUT_RELOC(brw->query.bo,
+	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
+	     ((brw->query.index * 2) * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+
+   foreach(query, &brw->query.active_head) {
+      if (query->bo != brw->query.bo) {
+	 if (query->bo != NULL)
+	    brw_queryobj_get_results(query);
+	 dri_bo_reference(brw->query.bo);
+	 query->bo = brw->query.bo;
+	 query->first_index = brw->query.index;
+      }
+      query->last_index = brw->query.index;
+   }
+   brw->query.active = GL_TRUE;
+}
+
+/** Called at batchbuffer flush to get an ending PS_DEPTH_COUNT */
+void
+brw_emit_query_end(struct brw_context *brw)
+{
+   struct intel_context *intel = &brw->intel;
+
+   if (!brw->query.active)
+      return;
+
+   BEGIN_BATCH(4, IGNORE_CLIPRECTS);
+   OUT_BATCH(_3DSTATE_PIPE_CONTROL |
+	     PIPE_CONTROL_DEPTH_STALL |
+	     PIPE_CONTROL_WRITE_DEPTH_COUNT);
+   OUT_RELOC(brw->query.bo,
+	     I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
+	     PIPE_CONTROL_GLOBAL_GTT_WRITE |
+	     ((brw->query.index * 2 + 1) * sizeof(uint64_t)));
+   OUT_BATCH(0);
+   OUT_BATCH(0);
+   ADVANCE_BATCH();
+
+   brw->query.active = GL_FALSE;
+   brw->query.index++;
+}
+
+void brw_init_queryobj_functions(struct dd_function_table *functions)
+{
+   functions->NewQueryObject = brw_new_query_object;
+   functions->DeleteQuery = brw_delete_query;
+   functions->BeginQuery = brw_begin_query;
+   functions->EndQuery = brw_end_query;
+   functions->CheckQuery = brw_check_query;
+   functions->WaitQuery = brw_wait_query;
+}
diff --git a/src/mesa/drivers/dri/i965/brw_state.h b/src/mesa/drivers/dri/i965/brw_state.h
index 3ea6151ae95..4c04036ef08 100644
--- a/src/mesa/drivers/dri/i965/brw_state.h
+++ b/src/mesa/drivers/dri/i965/brw_state.h
@@ -74,7 +74,6 @@ const struct brw_tracked_state brw_wm_unit;
 const struct brw_tracked_state brw_psp_urb_cbs;
 
 const struct brw_tracked_state brw_active_vertprog;
-const struct brw_tracked_state brw_tnl_vertprog;
 const struct brw_tracked_state brw_pipe_control;
 
 const struct brw_tracked_state brw_clear_surface_cache;
diff --git a/src/mesa/drivers/dri/i965/brw_state_cache.c b/src/mesa/drivers/dri/i965/brw_state_cache.c
index 1318dea5940..d5b51664066 100644
--- a/src/mesa/drivers/dri/i965/brw_state_cache.c
+++ b/src/mesa/drivers/dri/i965/brw_state_cache.c
@@ -497,9 +497,10 @@ void brw_destroy_cache( struct brw_context *brw )
    GLuint i;
 
    brw_clear_cache(brw);
-   for (i = 0; i < BRW_MAX_CACHE; i++)
+   for (i = 0; i < BRW_MAX_CACHE; i++) {
+      dri_bo_unreference(brw->cache.last_bo[i]);
       free(brw->cache.name[i]);
-
+   }
    free(brw->cache.items);
    brw->cache.items = NULL;
    brw->cache.size = 0;
diff --git a/src/mesa/drivers/dri/i965/brw_state_upload.c b/src/mesa/drivers/dri/i965/brw_state_upload.c
index 7d4fd467b1c..b6a52843a81 100644
--- a/src/mesa/drivers/dri/i965/brw_state_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_state_upload.c
@@ -45,7 +45,6 @@ const struct brw_tracked_state *atoms[] =
 {
    &brw_check_fallback,
 
-   &brw_tnl_vertprog,
    &brw_active_vertprog,
    &brw_wm_input_sizes,
    &brw_vs_prog,
diff --git a/src/mesa/drivers/dri/i965/brw_vs.h b/src/mesa/drivers/dri/i965/brw_vs.h
index 41a33ffe387..22388ec99d0 100644
--- a/src/mesa/drivers/dri/i965/brw_vs.h
+++ b/src/mesa/drivers/dri/i965/brw_vs.h
@@ -80,8 +80,4 @@ struct brw_vs_compile {
 
 void brw_vs_emit( struct brw_vs_compile *c );
 
-
-void brw_ProgramCacheDestroy( GLcontext *ctx );
-void brw_ProgramCacheInit( GLcontext *ctx );
-
 #endif
diff --git a/src/mesa/drivers/dri/i965/brw_vs_tnl.c b/src/mesa/drivers/dri/i965/brw_vs_tnl.c
index 9b04f19112a..eacc289f1f1 100644
--- a/src/mesa/drivers/dri/i965/brw_vs_tnl.c
+++ b/src/mesa/drivers/dri/i965/brw_vs_tnl.c
@@ -33,1635 +33,15 @@
 #include "main/glheader.h"
 #include "main/macros.h"
 #include "main/enums.h"
-#include "shader/prog_parameter.h"
-#include "shader/prog_print.h"
-#include "shader/prog_statevars.h"
 #include "brw_vs.h"
 #include "brw_state.h"
 
 
-struct state_key {
-   unsigned light_global_enabled:1;
-   unsigned light_local_viewer:1;
-   unsigned light_twoside:1;
-   unsigned light_color_material:1;
-   unsigned light_color_material_mask:12;
-   unsigned light_material_mask:12;
-   unsigned normalize:1;
-   unsigned rescale_normals:1;
-   unsigned fog_source_is_depth:1;
-   unsigned tnl_do_vertex_fog:1;
-   unsigned separate_specular:1;
-   unsigned fog_option:2;
-   unsigned point_attenuated:1;
-   unsigned texture_enabled_global:1;
-   unsigned fragprog_inputs_read:12;
-
-   struct {
-      unsigned light_enabled:1;
-      unsigned light_eyepos3_is_zero:1;
-      unsigned light_spotcutoff_is_180:1;
-      unsigned light_attenuated:1;      
-      unsigned texunit_really_enabled:1;
-      unsigned texmat_enabled:1;
-      unsigned texgen_enabled:4;
-      unsigned texgen_mode0:4;
-      unsigned texgen_mode1:4;
-      unsigned texgen_mode2:4;
-      unsigned texgen_mode3:4;
-   } unit[8];
-};
-
-
-
-#define FOG_NONE   0
-#define FOG_LINEAR 1
-#define FOG_EXP    2
-#define FOG_EXP2   3
-
-static GLuint translate_fog_mode( GLenum mode )
-{
-   switch (mode) {
-   case GL_LINEAR: return FOG_LINEAR;
-   case GL_EXP: return FOG_EXP;
-   case GL_EXP2: return FOG_EXP2;
-   default: return FOG_NONE;
-   }
-}
-
-#define TXG_NONE           0
-#define TXG_OBJ_LINEAR     1
-#define TXG_EYE_LINEAR     2
-#define TXG_SPHERE_MAP     3
-#define TXG_REFLECTION_MAP 4
-#define TXG_NORMAL_MAP     5
-
-static GLuint translate_texgen( GLboolean enabled, GLenum mode )
-{
-   if (!enabled)
-      return TXG_NONE;
-
-   switch (mode) {
-   case GL_OBJECT_LINEAR: return TXG_OBJ_LINEAR;
-   case GL_EYE_LINEAR: return TXG_EYE_LINEAR;
-   case GL_SPHERE_MAP: return TXG_SPHERE_MAP;
-   case GL_REFLECTION_MAP_NV: return TXG_REFLECTION_MAP;
-   case GL_NORMAL_MAP_NV: return TXG_NORMAL_MAP;
-   default: return TXG_NONE;
-   }
-}
-
-static void make_state_key( GLcontext *ctx, struct state_key *key )
-{
-   struct brw_context *brw = brw_context(ctx);
-   const struct gl_fragment_program *fp = brw->fragment_program;
-   GLuint i;
-
-   /* This now relies on texenvprogram.c being active:
-    */
-   assert(fp);
-
-   memset(key, 0, sizeof(*key));
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   key->fragprog_inputs_read = fp->Base.InputsRead;
-
-   /* _NEW_LIGHT */
-   key->separate_specular = (brw->attribs.Light->Model.ColorControl ==
-			     GL_SEPARATE_SPECULAR_COLOR);
-
-   /* _NEW_LIGHT */
-   if (brw->attribs.Light->Enabled) {
-      key->light_global_enabled = 1;
-
-      if (brw->attribs.Light->Model.LocalViewer)
-	 key->light_local_viewer = 1;
-
-      if (brw->attribs.Light->Model.TwoSide)
-	 key->light_twoside = 1;
-
-      if (brw->attribs.Light->ColorMaterialEnabled) {
-	 key->light_color_material = 1;
-	 key->light_color_material_mask = brw->attribs.Light->ColorMaterialBitmask;
-      }
-
-      /* BRW_NEW_INPUT_VARYING */
-
-      /* For these programs, material values are stuffed into the
-       * generic slots:
-       */
-      for (i = 0 ; i < MAT_ATTRIB_MAX ; i++) 
-	 if (brw->vb.info.varying & (1<<(VERT_ATTRIB_GENERIC0 + i))) 
-	    key->light_material_mask |= 1<<i;
-
-      for (i = 0; i < MAX_LIGHTS; i++) {
-	 struct gl_light *light = &brw->attribs.Light->Light[i];
-
-	 if (light->Enabled) {
-	    key->unit[i].light_enabled = 1;
-
-	    if (light->EyePosition[3] == 0.0)
-	       key->unit[i].light_eyepos3_is_zero = 1;
-	    
-	    if (light->SpotCutoff == 180.0)
-	       key->unit[i].light_spotcutoff_is_180 = 1;
-
-	    if (light->ConstantAttenuation != 1.0 ||
-		light->LinearAttenuation != 0.0 ||
-		light->QuadraticAttenuation != 0.0)
-	       key->unit[i].light_attenuated = 1;
-	 }
-      }
-   }
-
-   /* _NEW_TRANSFORM */
-   if (brw->attribs.Transform->Normalize)
-      key->normalize = 1;
-
-   if (brw->attribs.Transform->RescaleNormals)
-      key->rescale_normals = 1;
-
-   /* BRW_NEW_FRAGMENT_PROGRAM */
-   key->fog_option = translate_fog_mode(fp->FogOption);
-   if (key->fog_option)
-      key->fragprog_inputs_read |= FRAG_BIT_FOGC;
-   
-   /* _NEW_FOG */
-   if (brw->attribs.Fog->FogCoordinateSource == GL_FRAGMENT_DEPTH_EXT)
-      key->fog_source_is_depth = 1;
-   
-   /* _NEW_HINT, ??? */
-   if (1)
-      key->tnl_do_vertex_fog = 1;
-
-   /* _NEW_POINT */
-   if (brw->attribs.Point->_Attenuated)
-      key->point_attenuated = 1;
-
-   /* _NEW_TEXTURE */
-   if (brw->attribs.Texture->_TexGenEnabled ||
-       brw->attribs.Texture->_TexMatEnabled ||
-       brw->attribs.Texture->_EnabledUnits)
-      key->texture_enabled_global = 1;
-      
-   for (i = 0; i < MAX_TEXTURE_UNITS; i++) {
-      struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[i];
-
-      if (texUnit->_ReallyEnabled)
- 	 key->unit[i].texunit_really_enabled = 1;
-
-      if (brw->attribs.Texture->_TexMatEnabled & ENABLE_TEXMAT(i))      
-	 key->unit[i].texmat_enabled = 1;
-      
-      if (texUnit->TexGenEnabled) {
-	 key->unit[i].texgen_enabled = 1;
-      
-	 key->unit[i].texgen_mode0 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<0),
-			      texUnit->GenModeS );
-	 key->unit[i].texgen_mode1 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<1),
-			      texUnit->GenModeT );
-	 key->unit[i].texgen_mode2 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<2),
-			      texUnit->GenModeR );
-	 key->unit[i].texgen_mode3 = 
-	    translate_texgen( texUnit->TexGenEnabled & (1<<3),
-			      texUnit->GenModeQ );
-      }
-   }
-}
-
-
-   
-/* Very useful debugging tool - produces annotated listing of
- * generated program with line/function references for each
- * instruction back into this file:
- */
-#define DISASSEM 0
-
-/* Should be tunable by the driver - do we want to do matrix
- * multiplications with DP4's or with MUL/MAD's?  SSE works better
- * with the latter, drivers may differ.
- */
-#define PREFER_DP4 1
-
-
-/* Use uregs to represent registers internally, translate to Mesa's
- * expected formats on emit.  
- *
- * NOTE: These are passed by value extensively in this file rather
- * than as usual by pointer reference.  If this disturbs you, try
- * remembering they are just 32bits in size.
- *
- * GCC is smart enough to deal with these dword-sized structures in
- * much the same way as if I had defined them as dwords and was using
- * macros to access and set the fields.  This is much nicer and easier
- * to evolve.
- */
-struct ureg {
-   GLuint file:4;
-   GLint idx:8;      /* relative addressing may be negative */
-   GLuint negate:1;
-   GLuint swz:12;
-   GLuint pad:7;
-};
-
-
-struct tnl_program {
-   const struct state_key *state;
-   struct gl_vertex_program *program;
-   
-   GLuint nr_instructions;
-   GLuint temp_in_use;
-   GLuint temp_reserved;
-   
-   struct ureg eye_position;
-   struct ureg eye_position_normalized;
-   struct ureg eye_normal;
-   struct ureg identity;
-
-   GLuint materials;
-   GLuint color_materials;
-};
-
-
-const static struct ureg undef = { 
-   PROGRAM_UNDEFINED,
-   ~0,
-   0,
-   0,
-   0
-};
-
-/* Local shorthand:
- */
-#define X    SWIZZLE_X
-#define Y    SWIZZLE_Y
-#define Z    SWIZZLE_Z
-#define W    SWIZZLE_W
-
-
-/* Construct a ureg:
- */
-static struct ureg make_ureg(GLuint file, GLint idx)
-{
-   struct ureg reg;
-   reg.file = file;
-   reg.idx = idx;
-   reg.negate = 0;
-   reg.swz = SWIZZLE_NOOP;
-   reg.pad = 0;
-   return reg;
-}
-
-
-
-static struct ureg ureg_negate( struct ureg reg )
-{
-   reg.negate ^= 1;
-   return reg;
-} 
-
-
-static struct ureg swizzle( struct ureg reg, int x, int y, int z, int w )
-{
-   reg.swz = MAKE_SWIZZLE4(GET_SWZ(reg.swz, x),
-			   GET_SWZ(reg.swz, y),
-			   GET_SWZ(reg.swz, z),
-			   GET_SWZ(reg.swz, w));
-
-   return reg;
-}
-
-static struct ureg swizzle1( struct ureg reg, int x )
-{
-   return swizzle(reg, x, x, x, x);
-}
-
-static struct ureg get_temp( struct tnl_program *p )
-{
-   int bit = ffs( ~p->temp_in_use );
-   if (!bit) {
-      fprintf(stderr, "%s: out of temporaries\n", __FILE__);
-      assert(0);
-   }
-
-   if (bit > p->program->Base.NumTemporaries)
-      p->program->Base.NumTemporaries = bit;
-
-   p->temp_in_use |= 1<<(bit-1);
-   return make_ureg(PROGRAM_TEMPORARY, bit-1);
-}
-
-static struct ureg reserve_temp( struct tnl_program *p )
-{
-   struct ureg temp = get_temp( p );
-   p->temp_reserved |= 1<<temp.idx;
-   return temp;
-}
-
-static void release_temp( struct tnl_program *p, struct ureg reg )
-{
-   if (reg.file == PROGRAM_TEMPORARY) {
-      p->temp_in_use &= ~(1<<reg.idx);
-      p->temp_in_use |= p->temp_reserved; /* can't release reserved temps */
-   }
-}
-
-static void release_temps( struct tnl_program *p )
-{
-   p->temp_in_use = p->temp_reserved;
-}
-
-
-
-static struct ureg register_input( struct tnl_program *p, GLuint input )
-{
-   assert(input < 32);
-
-   p->program->Base.InputsRead |= (1<<input);
-   return make_ureg(PROGRAM_INPUT, input);
-}
-
-static struct ureg register_output( struct tnl_program *p, GLuint output )
-{
-   p->program->Base.OutputsWritten |= (1<<output);
-   return make_ureg(PROGRAM_OUTPUT, output);
-}
-
-static struct ureg register_const4f( struct tnl_program *p, 
-			      GLfloat s0,
-			      GLfloat s1,
-			      GLfloat s2,
-			      GLfloat s3)
-{
-   GLfloat values[4];
-   GLint idx;
-   GLuint swizzle;
-   values[0] = s0;
-   values[1] = s1;
-   values[2] = s2;
-   values[3] = s3;
-   idx = _mesa_add_unnamed_constant( p->program->Base.Parameters, values, 4,
-                                     &swizzle);
-   assert(swizzle == SWIZZLE_NOOP); /* Need to handle swizzle in reg setup */
-   return make_ureg(PROGRAM_STATE_VAR, idx);
-}
-
-#define register_const1f(p, s0)         register_const4f(p, s0, 0, 0, 1)
-#define register_scalar_const(p, s0)    register_const4f(p, s0, s0, s0, s0)
-#define register_const2f(p, s0, s1)     register_const4f(p, s0, s1, 0, 1)
-#define register_const3f(p, s0, s1, s2) register_const4f(p, s0, s1, s2, 1)
-
-static GLboolean is_undef( struct ureg reg )
-{
-   return reg.file == PROGRAM_UNDEFINED;
-}
-
-static struct ureg get_identity_param( struct tnl_program *p )
-{
-   if (is_undef(p->identity)) 
-      p->identity = register_const4f(p, 0,0,0,1);
-
-   return p->identity;
-}
-
-static struct ureg register_param5( struct tnl_program *p, 
-                                    GLint s0,
-                                    GLint s1,
-                                    GLint s2,
-                                    GLint s3,
-                                    GLint s4)
-{
-   gl_state_index tokens[STATE_LENGTH];
-   GLint idx;
-   tokens[0] = s0;
-   tokens[1] = s1;
-   tokens[2] = s2;
-   tokens[3] = s3;
-   tokens[4] = s4;
-   idx = _mesa_add_state_reference( p->program->Base.Parameters, tokens );
-   return make_ureg(PROGRAM_STATE_VAR, idx);
-}
-
-
-#define register_param1(p,s0)          register_param5(p,s0,0,0,0,0)
-#define register_param2(p,s0,s1)       register_param5(p,s0,s1,0,0,0)
-#define register_param3(p,s0,s1,s2)    register_param5(p,s0,s1,s2,0,0)
-#define register_param4(p,s0,s1,s2,s3) register_param5(p,s0,s1,s2,s3,0)
-
-
-static void register_matrix_param5( struct tnl_program *p,
-				    GLint s0, /* matrix name */
-				    GLint s1, /* texture matrix number */
-				    GLint s2, /* first row */
-				    GLint s3, /* last row */
-				    GLint s4, /* modifier */
-				    struct ureg *matrix )
-{
-   GLint i;
-
-   /* This is a bit sad as the support is there to pull the whole
-    * matrix out in one go:
-    */
-   for (i = 0; i <= s3 - s2; i++) 
-      matrix[i] = register_param5( p, s0, s1, i, i, s4 );
-}
-
-
-static void emit_arg( struct prog_src_register *src,
-		      struct ureg reg )
-{
-   src->File = reg.file;
-   src->Index = reg.idx;
-   src->Swizzle = reg.swz;
-   src->RelAddr = 0;
-   src->NegateBase = reg.negate;
-   src->Abs = 0;
-   src->NegateAbs = 0;
-}
-
-static void emit_dst( struct prog_dst_register *dst,
-		      struct ureg reg, GLuint mask )
-{
-   dst->File = reg.file;
-   dst->Index = reg.idx;
-   /* allow zero as a shorthand for xyzw */
-   dst->WriteMask = mask ? mask : WRITEMASK_XYZW; 
-   dst->CondMask = 0;
-   dst->CondSwizzle = 0;
-   dst->CondSrc = 0;
-   dst->pad = 0;
-}
-
-static void debug_insn( struct prog_instruction *inst, const char *fn,
-			GLuint line )
-{
-   if (DISASSEM) {
-      static const char *last_fn;
-   
-      if (fn != last_fn) {
-	 last_fn = fn;
-	 _mesa_printf("%s:\n", fn);
-      }
-	 
-      _mesa_printf("%d:\t", line);
-      _mesa_print_instruction(inst);
-   }
-}
-
-
-static void emit_op3fn(struct tnl_program *p,
-		       GLuint op,
-		       struct ureg dest,
-		       GLuint mask,
-		       struct ureg src0,
-		       struct ureg src1,
-		       struct ureg src2,
-		       const char *fn,
-		       GLuint line)
-{
-   GLuint nr = p->program->Base.NumInstructions++;
-      
-   if (nr >= p->nr_instructions) {
-      int new_nr_instructions = p->nr_instructions * 2;
-
-      p->program->Base.Instructions = 
-	 _mesa_realloc(p->program->Base.Instructions,
-		       sizeof(struct prog_instruction) * p->nr_instructions,
-		       sizeof(struct prog_instruction) * new_nr_instructions);
-      p->nr_instructions = new_nr_instructions;
-   }
-
-   {      
-      struct prog_instruction *inst = &p->program->Base.Instructions[nr];
-      memset(inst, 0, sizeof(*inst));
-      inst->Opcode = op; 
-      inst->StringPos = 0;
-      inst->Data = 0;
-   
-      emit_arg( &inst->SrcReg[0], src0 );
-      emit_arg( &inst->SrcReg[1], src1 );
-      emit_arg( &inst->SrcReg[2], src2 );   
-
-      emit_dst( &inst->DstReg, dest, mask );
-
-      debug_insn(inst, fn, line);
-   }
-}
-
-   
-
-#define emit_op3(p, op, dst, mask, src0, src1, src2) \
-   emit_op3fn(p, op, dst, mask, src0, src1, src2, __FUNCTION__, __LINE__)
-
-#define emit_op2(p, op, dst, mask, src0, src1) \
-    emit_op3fn(p, op, dst, mask, src0, src1, undef, __FUNCTION__, __LINE__)
-
-#define emit_op1(p, op, dst, mask, src0) \
-    emit_op3fn(p, op, dst, mask, src0, undef, undef, __FUNCTION__, __LINE__)
-
-
-static struct ureg make_temp( struct tnl_program *p, struct ureg reg )
-{
-   if (reg.file == PROGRAM_TEMPORARY && 
-       !(p->temp_reserved & (1<<reg.idx)))
-      return reg;
-   else {
-      struct ureg temp = get_temp(p);
-      emit_op1(p, OPCODE_MOV, temp, 0, reg);
-      return temp;
-   }
-}
-
-
-/* Currently no tracking performed of input/output/register size or
- * active elements.  Could be used to reduce these operations, as
- * could the matrix type.
- */
-static void emit_matrix_transform_vec4( struct tnl_program *p,
-					struct ureg dest,
-					const struct ureg *mat,
-					struct ureg src)
-{
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_X, src, mat[0]);
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_Y, src, mat[1]);
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_Z, src, mat[2]);
-   emit_op2(p, OPCODE_DP4, dest, WRITEMASK_W, src, mat[3]);
-}
-
-/* This version is much easier to implement if writemasks are not
- * supported natively on the target or (like SSE), the target doesn't
- * have a clean/obvious dotproduct implementation.
- */
-static void emit_transpose_matrix_transform_vec4( struct tnl_program *p,
-						  struct ureg dest,
-						  const struct ureg *mat,
-						  struct ureg src)
-{
-   struct ureg tmp;
-
-   if (dest.file != PROGRAM_TEMPORARY)
-      tmp = get_temp(p);
-   else
-      tmp = dest;
-
-   emit_op2(p, OPCODE_MUL, tmp, 0, swizzle1(src,X), mat[0]);
-   emit_op3(p, OPCODE_MAD, tmp, 0, swizzle1(src,Y), mat[1], tmp);
-   emit_op3(p, OPCODE_MAD, tmp, 0, swizzle1(src,Z), mat[2], tmp);
-   emit_op3(p, OPCODE_MAD, dest, 0, swizzle1(src,W), mat[3], tmp);
-
-   if (dest.file != PROGRAM_TEMPORARY)
-      release_temp(p, tmp);
-}
-
-static void emit_matrix_transform_vec3( struct tnl_program *p,
-					struct ureg dest,
-					const struct ureg *mat,
-					struct ureg src)
-{
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_X, src, mat[0]);
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_Y, src, mat[1]);
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_Z, src, mat[2]);
-}
-
-
-static void emit_normalize_vec3( struct tnl_program *p,
-				 struct ureg dest,
-				 struct ureg src )
-{
-   emit_op2(p, OPCODE_DP3, dest, WRITEMASK_W, src, src);
-   emit_op1(p, OPCODE_RSQ, dest, WRITEMASK_W, swizzle1(dest,W));
-   emit_op2(p, OPCODE_MUL, dest, WRITEMASK_XYZ, src, swizzle1(dest,W));
-}
-
-static void emit_passthrough( struct tnl_program *p, 
-			      GLuint input,
-			      GLuint output )
-{
-   struct ureg out = register_output(p, output);
-   emit_op1(p, OPCODE_MOV, out, 0, register_input(p, input)); 
-}
-
-static struct ureg get_eye_position( struct tnl_program *p )
-{
-   if (is_undef(p->eye_position)) {
-      struct ureg pos = register_input( p, VERT_ATTRIB_POS ); 
-      struct ureg modelview[4];
-
-      p->eye_position = reserve_temp(p);
-
-      if (PREFER_DP4) {
-	 register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 3, 
-				 0, modelview );
-
-	 emit_matrix_transform_vec4(p, p->eye_position, modelview, pos);
-      }
-      else {
-	 register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 3, 
-				 STATE_MATRIX_TRANSPOSE, modelview );
-
-	 emit_transpose_matrix_transform_vec4(p, p->eye_position, modelview, pos);
-      }
-   }
-   
-   return p->eye_position;
-}
-
-
-#if 0
-static struct ureg get_eye_z( struct tnl_program *p )
-{
-   if (!is_undef(p->eye_position)) {
-      return swizzle1(p->eye_position, Z);
-   }
-   else if (!is_undef(p->eye_z)) {
-      struct ureg pos = register_input( p, BRW_ATTRIB_POS ); 
-      struct ureg modelview2;
-
-      p->eye_z = reserve_temp(p);
-
-      register_matrix_param6( p, STATE_MATRIX, STATE_MODELVIEW, 0, 2, 1, 
-			      STATE_MATRIX, &modelview2 );
-
-      emit_matrix_transform_vec4(p, p->eye_position, modelview, pos);
-      emit_op2(p, OPCODE_DP4, p->eye_z, WRITEMASK_Z, pos, modelview2);
-   }
-   
-   return swizzle1(p->eye_z, Z)
-}
-#endif
-
-
-
-static struct ureg get_eye_position_normalized( struct tnl_program *p )
-{
-   if (is_undef(p->eye_position_normalized)) {
-      struct ureg eye = get_eye_position(p);
-      p->eye_position_normalized = reserve_temp(p);
-      emit_normalize_vec3(p, p->eye_position_normalized, eye);
-   }
-   
-   return p->eye_position_normalized;
-}
-
-
-static struct ureg get_eye_normal( struct tnl_program *p )
-{
-   if (is_undef(p->eye_normal)) {
-      struct ureg normal = register_input(p, VERT_ATTRIB_NORMAL );
-      struct ureg mvinv[3];
-
-      register_matrix_param5( p, STATE_MODELVIEW_MATRIX, 0, 0, 2,
-			      STATE_MATRIX_INVTRANS, mvinv );
-
-      p->eye_normal = reserve_temp(p);
-
-      /* Transform to eye space:
-       */
-      emit_matrix_transform_vec3( p, p->eye_normal, mvinv, normal );
-
-      /* Normalize/Rescale:
-       */
-      if (p->state->normalize) {
-	 emit_normalize_vec3( p, p->eye_normal, p->eye_normal );
-      }
-      else if (p->state->rescale_normals) {
-	 struct ureg rescale = register_param2(p, STATE_INTERNAL,
-					       STATE_NORMAL_SCALE);
-
-	 emit_op2( p, OPCODE_MUL, p->eye_normal, 0, p->eye_normal, 
-		   swizzle1(rescale, X));
-      }
-   }
-
-   return p->eye_normal;
-}
-
-
-
-static void build_hpos( struct tnl_program *p )
-{
-   struct ureg pos = register_input( p, VERT_ATTRIB_POS ); 
-   struct ureg hpos = register_output( p, VERT_RESULT_HPOS );
-   struct ureg mvp[4];
-
-   if (PREFER_DP4) {
-      register_matrix_param5( p, STATE_MVP_MATRIX, 0, 0, 3, 
-			      0, mvp );
-      emit_matrix_transform_vec4( p, hpos, mvp, pos );
-   }
-   else {
-      register_matrix_param5( p, STATE_MVP_MATRIX, 0, 0, 3, 
-			      STATE_MATRIX_TRANSPOSE, mvp );
-      emit_transpose_matrix_transform_vec4( p, hpos, mvp, pos );
-   }
-}
-
-
-static GLuint material_attrib( GLuint side, GLuint property )
-{
-   return (property - STATE_AMBIENT) * 2 + side;
-}
-
-/* Get a bitmask of which material values vary on a per-vertex basis.
- */
-static void set_material_flags( struct tnl_program *p )
-{
-   p->color_materials = 0;
-   p->materials = 0;
-
-   if (p->state->light_color_material) {
-      p->materials = 
-	 p->color_materials = p->state->light_color_material_mask;
-   }
-
-   p->materials |= p->state->light_material_mask;
-}
-
-
-static struct ureg get_material( struct tnl_program *p, GLuint side, 
-				 GLuint property )
-{
-   GLuint attrib = material_attrib(side, property);
-
-   if (p->color_materials & (1<<attrib))
-      return register_input(p, VERT_ATTRIB_COLOR0);
-   else if (p->materials & (1<<attrib)) 
-      return register_input( p, attrib + _TNL_ATTRIB_MAT_FRONT_AMBIENT );
-   else
-      return register_param3( p, STATE_MATERIAL, side, property );
-}
-
-#define SCENE_COLOR_BITS(side) ((MAT_BIT_FRONT_EMISSION | \
-				 MAT_BIT_FRONT_AMBIENT | \
-				 MAT_BIT_FRONT_DIFFUSE) << (side))
-
-/* Either return a precalculated constant value or emit code to
- * calculate these values dynamically in the case where material calls
- * are present between begin/end pairs.
- *
- * Probably want to shift this to the program compilation phase - if
- * we always emitted the calculation here, a smart compiler could
- * detect that it was constant (given a certain set of inputs), and
- * lift it out of the main loop.  That way the programs created here
- * would be independent of the vertex_buffer details.
- */
-static struct ureg get_scenecolor( struct tnl_program *p, GLuint side )
-{
-   if (p->materials & SCENE_COLOR_BITS(side)) {
-      struct ureg lm_ambient = register_param1(p, STATE_LIGHTMODEL_AMBIENT);
-      struct ureg material_emission = get_material(p, side, STATE_EMISSION);
-      struct ureg material_ambient = get_material(p, side, STATE_AMBIENT);
-      struct ureg material_diffuse = get_material(p, side, STATE_DIFFUSE);
-      struct ureg tmp = make_temp(p, material_diffuse);
-      emit_op3(p, OPCODE_MAD, tmp,  WRITEMASK_XYZ, lm_ambient, 
-	       material_ambient, material_emission);
-      return tmp;
-   }
-   else
-      return register_param2( p, STATE_LIGHTMODEL_SCENECOLOR, side );
-}
-
-
-static struct ureg get_lightprod( struct tnl_program *p, GLuint light, 
-				  GLuint side, GLuint property )
-{
-   GLuint attrib = material_attrib(side, property);
-   if (p->materials & (1<<attrib)) {
-      struct ureg light_value = 
-	 register_param3(p, STATE_LIGHT, light, property);
-      struct ureg material_value = get_material(p, side, property);
-      struct ureg tmp = get_temp(p);
-      emit_op2(p, OPCODE_MUL, tmp,  0, light_value, material_value);
-      return tmp;
-   }
-   else
-      return register_param4(p, STATE_LIGHTPROD, light, side, property);
-}
-
-static struct ureg calculate_light_attenuation( struct tnl_program *p,
-						GLuint i, 
-						struct ureg VPpli,
-						struct ureg dist )
-{
-   struct ureg attenuation = register_param3(p, STATE_LIGHT, i,
-					     STATE_ATTENUATION);
-   struct ureg att = get_temp(p);
-
-   /* Calculate spot attenuation:
-    */
-   if (!p->state->unit[i].light_spotcutoff_is_180) {
-      struct ureg spot_dir_norm = register_param3(p, STATE_INTERNAL,
-						  STATE_LIGHT_SPOT_DIR_NORMALIZED, i);
-      struct ureg spot = get_temp(p);
-      struct ureg slt = get_temp(p);
-
-      emit_op2(p, OPCODE_DP3, spot, 0, ureg_negate(VPpli), spot_dir_norm);
-      emit_op2(p, OPCODE_SLT, slt, 0, swizzle1(spot_dir_norm,W), spot);
-      emit_op2(p, OPCODE_POW, spot, 0, spot, swizzle1(attenuation, W));
-      emit_op2(p, OPCODE_MUL, att, 0, slt, spot);
-
-      release_temp(p, spot);
-      release_temp(p, slt);
-   }
-
-   /* Calculate distance attenuation:
-    */
-   if (p->state->unit[i].light_attenuated) {
-
-      /* 1/d,d,d,1/d */
-      emit_op1(p, OPCODE_RCP, dist, WRITEMASK_YZ, dist); 
-      /* 1,d,d*d,1/d */
-      emit_op2(p, OPCODE_MUL, dist, WRITEMASK_XZ, dist, swizzle1(dist,Y)); 
-      /* 1/dist-atten */
-      emit_op2(p, OPCODE_DP3, dist, 0, attenuation, dist); 
-
-      if (!p->state->unit[i].light_spotcutoff_is_180) {
-	 /* dist-atten */
-	 emit_op1(p, OPCODE_RCP, dist, 0, dist); 
-	 /* spot-atten * dist-atten */
-	 emit_op2(p, OPCODE_MUL, att, 0, dist, att);	
-      } else {
-	 /* dist-atten */
-	 emit_op1(p, OPCODE_RCP, att, 0, dist); 
-      }
-   }
-
-   return att;
-}
-						
-
-
-
-
-/* Need to add some addtional parameters to allow lighting in object
- * space - STATE_SPOT_DIRECTION and STATE_HALF_VECTOR implicitly assume eye
- * space lighting.
- */
-static void build_lighting( struct tnl_program *p )
-{
-   const GLboolean twoside = p->state->light_twoside;
-   const GLboolean separate = p->state->separate_specular;
-   GLuint nr_lights = 0, count = 0;
-   struct ureg normal = get_eye_normal(p);
-   struct ureg lit = get_temp(p);
-   struct ureg dots = get_temp(p);
-   struct ureg _col0 = undef, _col1 = undef;
-   struct ureg _bfc0 = undef, _bfc1 = undef;
-   GLuint i;
-
-   for (i = 0; i < MAX_LIGHTS; i++) 
-      if (p->state->unit[i].light_enabled)
-	 nr_lights++;
-   
-   set_material_flags(p);
-
-   {
-      struct ureg shininess = get_material(p, 0, STATE_SHININESS);
-      emit_op1(p, OPCODE_MOV, dots,  WRITEMASK_W, swizzle1(shininess,X));
-      release_temp(p, shininess);
-
-      _col0 = make_temp(p, get_scenecolor(p, 0));
-      if (separate)
-	 _col1 = make_temp(p, get_identity_param(p));
-      else
-	 _col1 = _col0;
-
-   }
-
-   if (twoside) {
-      struct ureg shininess = get_material(p, 1, STATE_SHININESS);
-      emit_op1(p, OPCODE_MOV, dots, WRITEMASK_Z, 
-	       ureg_negate(swizzle1(shininess,X)));
-      release_temp(p, shininess);
-
-      _bfc0 = make_temp(p, get_scenecolor(p, 1));
-      if (separate)
-	 _bfc1 = make_temp(p, get_identity_param(p));
-      else
-	 _bfc1 = _bfc0;
-   }
-
-
-   /* If no lights, still need to emit the scenecolor.
-    */
-   /* KW: changed to do this always - v1.17 "Fix lighting alpha result"? 
-    */
-   if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-   {
-      struct ureg res0 = register_output( p, VERT_RESULT_COL0 );
-      emit_op1(p, OPCODE_MOV, res0, 0, _col0);
-
-      if (twoside) {
-	 struct ureg res0 = register_output( p, VERT_RESULT_BFC0 );
-	 emit_op1(p, OPCODE_MOV, res0, 0, _bfc0);
-      }
-   }
-
-   if (separate && (p->state->fragprog_inputs_read & FRAG_BIT_COL1)) {
-
-      struct ureg res1 = register_output( p, VERT_RESULT_COL1 );
-      emit_op1(p, OPCODE_MOV, res1, 0, _col1);
-      
-      if (twoside) {
-	 struct ureg res1 = register_output( p, VERT_RESULT_BFC1 );
-	 emit_op1(p, OPCODE_MOV, res1, 0, _bfc1);
-      }
-   }
-      
-   if (nr_lights == 0) {
-      release_temps(p);
-      return;
-   }
-
-
-   for (i = 0; i < MAX_LIGHTS; i++) {
-      if (p->state->unit[i].light_enabled) {
-	 struct ureg half = undef;
-	 struct ureg att = undef, VPpli = undef;
-	  
-	 count++;
-
-	 if (p->state->unit[i].light_eyepos3_is_zero) {
-	    /* Can used precomputed constants in this case.
-	     * Attenuation never applies to infinite lights.
-	     */
-	    VPpli = register_param3(p, STATE_LIGHT, i, 
-				    STATE_LIGHT_POSITION_NORMALIZED);
-            if (p->state->light_local_viewer) {
-                struct ureg eye_hat = get_eye_position_normalized(p);
-                half = get_temp(p);
-                emit_op2(p, OPCODE_SUB, half, 0, VPpli, eye_hat);
-                emit_normalize_vec3(p, half, half);
-            } else {
-                half = register_param3(p, STATE_LIGHT, i, STATE_HALF_VECTOR);
-            }
-	 } 
-	 else {
-	    struct ureg Ppli = register_param3(p, STATE_LIGHT, i, 
-					       STATE_POSITION); 
-	    struct ureg V = get_eye_position(p);
-	    struct ureg dist = get_temp(p);
-	    struct ureg tmpPpli = get_temp(p);
-
-	    VPpli = get_temp(p); 
-	    half = get_temp(p);
-
-	    /* In homogeneous object coordinates
-	     */
-	    emit_op1(p, OPCODE_RCP, dist, 0, swizzle1(Ppli, W));
-	    emit_op2(p, OPCODE_MUL, tmpPpli, 0, Ppli, dist);
- 
-	    /* Calulate VPpli vector
-	     */
-	    emit_op2(p, OPCODE_SUB, VPpli, 0, tmpPpli, V); 
-
-	    /* Normalize VPpli.  The dist value also used in
-	     * attenuation below.
-	     */
-	    emit_op2(p, OPCODE_DP3, dist, 0, VPpli, VPpli);
-	    emit_op1(p, OPCODE_RSQ, dist, 0, dist);
-	    emit_op2(p, OPCODE_MUL, VPpli, 0, VPpli, dist);
-
-
-	    /* Calculate  attenuation:
-	     */ 
-	    if (!p->state->unit[i].light_spotcutoff_is_180 ||
-		p->state->unit[i].light_attenuated) {
-	       att = calculate_light_attenuation(p, i, VPpli, dist);
-	    }
-	 
-      
-	    /* Calculate viewer direction, or use infinite viewer:
-	     */
-	    if (p->state->light_local_viewer) {
-	       struct ureg eye_hat = get_eye_position_normalized(p);
-	       emit_op2(p, OPCODE_SUB, half, 0, VPpli, eye_hat);
-	    }
-	    else {
-	       struct ureg z_dir = swizzle(get_identity_param(p),X,Y,W,Z); 
-	       emit_op2(p, OPCODE_ADD, half, 0, VPpli, z_dir);
-	    }
-
-	    emit_normalize_vec3(p, half, half);
-
-	    release_temp(p, dist);
-	    release_temp(p, tmpPpli);
-	 }
-
-	 /* Calculate dot products:
-	  */
-	 emit_op2(p, OPCODE_DP3, dots, WRITEMASK_X, normal, VPpli);
-	 emit_op2(p, OPCODE_DP3, dots, WRITEMASK_Y, normal, half);
-
-	
-	 /* Front face lighting:
-	  */
-	 {
-	    struct ureg ambient = get_lightprod(p, i, 0, STATE_AMBIENT);
-	    struct ureg diffuse = get_lightprod(p, i, 0, STATE_DIFFUSE);
-	    struct ureg specular = get_lightprod(p, i, 0, STATE_SPECULAR);
-	    struct ureg res0, res1;
-	    GLuint mask0, mask1;
-
-	    emit_op1(p, OPCODE_LIT, lit, 0, dots);
-   
-	    if (!is_undef(att)) 
-	       emit_op2(p, OPCODE_MUL, lit, 0, lit, att);
-
-
-	    mask0 = 0;
-	    mask1 = 0;
-	    res0 = _col0;
-	    res1 = _col1;
-	    
-	    if (count == nr_lights) {
-	       if (separate) {
-		  mask0 = WRITEMASK_XYZ;
-		  mask1 = WRITEMASK_XYZ;
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res0 = register_output( p, VERT_RESULT_COL0 );
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL1)
-		     res1 = register_output( p, VERT_RESULT_COL1 );
-	       }
-	       else {
-		  mask1 = WRITEMASK_XYZ;
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res1 = register_output( p, VERT_RESULT_COL0 );
-	       }
-	    } 
-
-	    emit_op3(p, OPCODE_MAD, _col0, 0, swizzle1(lit,X), ambient, _col0);
-	    emit_op3(p, OPCODE_MAD, res0, mask0, swizzle1(lit,Y), diffuse, _col0);
-	    emit_op3(p, OPCODE_MAD, res1, mask1, swizzle1(lit,Z), specular, _col1);
-      
-	    release_temp(p, ambient);
-	    release_temp(p, diffuse);
-	    release_temp(p, specular);
-	 }
-
-	 /* Back face lighting:
-	  */
-	 if (twoside) {
-	    struct ureg ambient = get_lightprod(p, i, 1, STATE_AMBIENT);
-	    struct ureg diffuse = get_lightprod(p, i, 1, STATE_DIFFUSE);
-	    struct ureg specular = get_lightprod(p, i, 1, STATE_SPECULAR);
-	    struct ureg res0, res1;
-	    GLuint mask0, mask1;
-	       
-	    emit_op1(p, OPCODE_LIT, lit, 0, ureg_negate(swizzle(dots,X,Y,W,Z)));
-
-	    if (!is_undef(att)) 
-	       emit_op2(p, OPCODE_MUL, lit, 0, lit, att);
-
-	    mask0 = 0;
-	    mask1 = 0;
-	    res0 = _bfc0;
-	    res1 = _bfc1;
-
-	    if (count == nr_lights) {
-	       if (separate) {
-		  mask0 = WRITEMASK_XYZ;
-		  mask1 = WRITEMASK_XYZ;
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res0 = register_output( p, VERT_RESULT_BFC0 );
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL1)
-		     res1 = register_output( p, VERT_RESULT_BFC1 );
-	       }
-	       else {
-		  mask1 = WRITEMASK_XYZ;
-
-		  if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-		     res1 = register_output( p, VERT_RESULT_BFC0 );
-	       }
-	    }
-
-	    emit_op3(p, OPCODE_MAD, _bfc0, 0, swizzle1(lit,X), ambient, _bfc0);
-	    emit_op3(p, OPCODE_MAD, res0, mask0, swizzle1(lit,Y), diffuse, _bfc0);
-	    emit_op3(p, OPCODE_MAD, res1, mask1, swizzle1(lit,Z), specular, _bfc1);
-
-	    release_temp(p, ambient);
-	    release_temp(p, diffuse);
-	    release_temp(p, specular);
-	 }
-
-	 release_temp(p, half);
-	 release_temp(p, VPpli);
-	 release_temp(p, att);
-      }
-   }
-
-   release_temps( p );
-}
-
-
-static void build_fog( struct tnl_program *p )
-{
-   struct ureg fog = register_output(p, VERT_RESULT_FOGC);
-   struct ureg input;
-   GLuint useabs = p->state->fog_source_is_depth && p->state->fog_option &&
-		   (p->state->fog_option != FOG_EXP2);
-
-   if (p->state->fog_source_is_depth) {
-      input = swizzle1(get_eye_position(p), Z);
-   }
-   else {
-      input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X);
-      if (p->state->fog_option &&
-	  p->state->tnl_do_vertex_fog)
-	  input = swizzle1(register_input(p, VERT_ATTRIB_FOG), X);
-      else
-	  input = register_input(p, VERT_ATTRIB_FOG);
-   }
-
-   if (p->state->fog_option &&
-       p->state->tnl_do_vertex_fog) {
-      struct ureg params = register_param2(p, STATE_INTERNAL,
-					   STATE_FOG_PARAMS_OPTIMIZED);
-      struct ureg tmp = get_temp(p);
-      struct ureg id = get_identity_param(p);
-
-      emit_op1(p, OPCODE_MOV, fog, 0, id);
-
-      if (useabs) {
-	 emit_op1(p, OPCODE_ABS, tmp, 0, input);
-      }
-
-      switch (p->state->fog_option) {
-      case FOG_LINEAR: {
-	 emit_op3(p, OPCODE_MAD, tmp, 0, useabs ? tmp : input,
-			swizzle1(params,X), swizzle1(params,Y));
-	 emit_op2(p, OPCODE_MAX, tmp, 0, tmp, swizzle1(id,X)); /* saturate */
-	 emit_op2(p, OPCODE_MIN, fog, WRITEMASK_X, tmp, swizzle1(id,W));
-	 break;
-      }
-      case FOG_EXP:
-	 emit_op2(p, OPCODE_MUL, tmp, 0, useabs ? tmp : input,
-			swizzle1(params,Z));
-	 emit_op1(p, OPCODE_EX2, fog, WRITEMASK_X, ureg_negate(tmp));
-	 break;
-      case FOG_EXP2:
-	 emit_op2(p, OPCODE_MUL, tmp, 0, input, swizzle1(params,W));
-	 emit_op2(p, OPCODE_MUL, tmp, 0, tmp, tmp);
-	 emit_op1(p, OPCODE_EX2, fog, WRITEMASK_X, ureg_negate(tmp));
-	 break;
-      }
-
-      release_temp(p, tmp);
-   }
-   else {
-      /* results = incoming fog coords (compute fog per-fragment later) 
-       *
-       * KW:  Is it really necessary to do anything in this case?
-       */
-      emit_op1(p, useabs ? OPCODE_ABS : OPCODE_MOV, fog, 0, input);
-   }
-}
- 
-static void build_reflect_texgen( struct tnl_program *p,
-				  struct ureg dest,
-				  GLuint writemask )
-{
-   struct ureg normal = get_eye_normal(p);
-   struct ureg eye_hat = get_eye_position_normalized(p);
-   struct ureg tmp = get_temp(p);
-
-   /* n.u */
-   emit_op2(p, OPCODE_DP3, tmp, 0, normal, eye_hat); 
-   /* 2n.u */
-   emit_op2(p, OPCODE_ADD, tmp, 0, tmp, tmp); 
-   /* (-2n.u)n + u */
-   emit_op3(p, OPCODE_MAD, dest, writemask, ureg_negate(tmp), normal, eye_hat);
-
-   release_temp(p, tmp);
-}
-
-static void build_sphere_texgen( struct tnl_program *p,
-				 struct ureg dest,
-				 GLuint writemask )
-{
-   struct ureg normal = get_eye_normal(p);
-   struct ureg eye_hat = get_eye_position_normalized(p);
-   struct ureg tmp = get_temp(p);
-   struct ureg half = register_scalar_const(p, .5);
-   struct ureg r = get_temp(p);
-   struct ureg inv_m = get_temp(p);
-   struct ureg id = get_identity_param(p);
-
-   /* Could share the above calculations, but it would be
-    * a fairly odd state for someone to set (both sphere and
-    * reflection active for different texture coordinate
-    * components.  Of course - if two texture units enable
-    * reflect and/or sphere, things start to tilt in favour
-    * of seperating this out:
-    */
-
-   /* n.u */
-   emit_op2(p, OPCODE_DP3, tmp, 0, normal, eye_hat); 
-   /* 2n.u */
-   emit_op2(p, OPCODE_ADD, tmp, 0, tmp, tmp); 
-   /* (-2n.u)n + u */
-   emit_op3(p, OPCODE_MAD, r, 0, ureg_negate(tmp), normal, eye_hat); 
-   /* r + 0,0,1 */
-   emit_op2(p, OPCODE_ADD, tmp, 0, r, swizzle(id,X,Y,W,Z)); 
-   /* rx^2 + ry^2 + (rz+1)^2 */
-   emit_op2(p, OPCODE_DP3, tmp, 0, tmp, tmp); 
-   /* 2/m */
-   emit_op1(p, OPCODE_RSQ, tmp, 0, tmp); 
-   /* 1/m */
-   emit_op2(p, OPCODE_MUL, inv_m, 0, tmp, half); 
-   /* r/m + 1/2 */
-   emit_op3(p, OPCODE_MAD, dest, writemask, r, inv_m, half); 
-	       
-   release_temp(p, tmp);
-   release_temp(p, r);
-   release_temp(p, inv_m);
-}
-
-
-static void build_texture_transform( struct tnl_program *p )
-{
-   GLuint i, j;
-
-   for (i = 0; i < MAX_TEXTURE_UNITS; i++) {
-
-      if (!(p->state->fragprog_inputs_read & (FRAG_BIT_TEX0<<i)))
-	 continue;
-							     
-      if (p->state->unit[i].texgen_enabled || 
-	  p->state->unit[i].texmat_enabled) {
-	 
-	 GLuint texmat_enabled = p->state->unit[i].texmat_enabled;
-	 struct ureg out = register_output(p, VERT_RESULT_TEX0 + i);
-	 struct ureg out_texgen = undef;
-
-	 if (p->state->unit[i].texgen_enabled) {
-	    GLuint copy_mask = 0;
-	    GLuint sphere_mask = 0;
-	    GLuint reflect_mask = 0;
-	    GLuint normal_mask = 0;
-	    GLuint modes[4];
-	 
-	    if (texmat_enabled) 
-	       out_texgen = get_temp(p);
-	    else
-	       out_texgen = out;
-
-	    modes[0] = p->state->unit[i].texgen_mode0;
-	    modes[1] = p->state->unit[i].texgen_mode1;
-	    modes[2] = p->state->unit[i].texgen_mode2;
-	    modes[3] = p->state->unit[i].texgen_mode3;
-
-	    for (j = 0; j < 4; j++) {
-	       switch (modes[j]) {
-	       case TXG_OBJ_LINEAR: {
-		  struct ureg obj = register_input(p, VERT_ATTRIB_POS);
-		  struct ureg plane = 
-		     register_param3(p, STATE_TEXGEN, i,
-				     STATE_TEXGEN_OBJECT_S + j);
-
-		  emit_op2(p, OPCODE_DP4, out_texgen, WRITEMASK_X << j, 
-			   obj, plane );
-		  break;
-	       }
-	       case TXG_EYE_LINEAR: {
-		  struct ureg eye = get_eye_position(p);
-		  struct ureg plane = 
-		     register_param3(p, STATE_TEXGEN, i, 
-				     STATE_TEXGEN_EYE_S + j);
-
-		  emit_op2(p, OPCODE_DP4, out_texgen, WRITEMASK_X << j, 
-			   eye, plane );
-		  break;
-	       }
-	       case TXG_SPHERE_MAP: 
-		  sphere_mask |= WRITEMASK_X << j;
-		  break;
-	       case TXG_REFLECTION_MAP:
-		  reflect_mask |= WRITEMASK_X << j;
-		  break;
-	       case TXG_NORMAL_MAP: 
-		  normal_mask |= WRITEMASK_X << j;
-		  break;
-	       case TXG_NONE:
-		  copy_mask |= WRITEMASK_X << j;
-	       }
-
-	    }
-
-	 
-	    if (sphere_mask) {
-	       build_sphere_texgen(p, out_texgen, sphere_mask);
-	    }
-
-	    if (reflect_mask) {
-	       build_reflect_texgen(p, out_texgen, reflect_mask);
-	    }
-
-	    if (normal_mask) {
-	       struct ureg normal = get_eye_normal(p);
-	       emit_op1(p, OPCODE_MOV, out_texgen, normal_mask, normal );
-	    }
-
-	    if (copy_mask) {
-	       struct ureg in = register_input(p, VERT_ATTRIB_TEX0+i);
-	       emit_op1(p, OPCODE_MOV, out_texgen, copy_mask, in );
-	    }
-	 }
-
-	 if (texmat_enabled) {
-	    struct ureg texmat[4];
-	    struct ureg in = (!is_undef(out_texgen) ? 
-			      out_texgen : 
-			      register_input(p, VERT_ATTRIB_TEX0+i));
-	    if (PREFER_DP4) {
-	       register_matrix_param5( p, STATE_TEXTURE_MATRIX, i, 0, 3,
-				       0, texmat );
-	       emit_matrix_transform_vec4( p, out, texmat, in );
-	    }
-	    else {
-	       register_matrix_param5( p, STATE_TEXTURE_MATRIX, i, 0, 3,
-				       STATE_MATRIX_TRANSPOSE, texmat );
-	       emit_transpose_matrix_transform_vec4( p, out, texmat, in );
-	    }
-	 }
-
-	 release_temps(p);
-      } 
-      else {
-	 emit_passthrough(p, VERT_ATTRIB_TEX0+i, VERT_RESULT_TEX0+i);
-      }
-   }
-}
-
-
-/* Seems like it could be tighter:
- */
-static void build_pointsize( struct tnl_program *p )
-{
-   struct ureg eye = get_eye_position(p);
-   struct ureg state_size = register_param1(p, STATE_POINT_SIZE);
-   struct ureg state_attenuation = register_param1(p, STATE_POINT_ATTENUATION);
-   struct ureg out = register_output(p, VERT_RESULT_PSIZ);
-   struct ureg ut = get_temp(p);
-
-   /* 1, Z, Z * Z, 1 */      
-   emit_op1(p, OPCODE_MOV, ut, WRITEMASK_XW, swizzle1(get_identity_param(p), W));
-   emit_op1(p, OPCODE_ABS, ut, WRITEMASK_YZ, swizzle1(eye, Z));
-   emit_op2(p, OPCODE_MUL, ut, WRITEMASK_Z, ut, ut);
-
-
-   /* p1 +  p2 * dist + p3 * dist * dist, 0 */
-   emit_op2(p, OPCODE_DP3, ut, WRITEMASK_X, ut, state_attenuation);
-
-   /* 1 / sqrt(factor) */
-   emit_op1(p, OPCODE_RSQ, ut, WRITEMASK_X, ut ); 
-
-   /* ut = pointSize / factor */
-   emit_op2(p, OPCODE_MUL, ut, WRITEMASK_X, ut, state_size); 
-
-   /* Clamp to min/max - state_size.[yz]
-    */
-   emit_op2(p, OPCODE_MAX, ut, WRITEMASK_X, ut, swizzle1(state_size, Y)); 
-   emit_op2(p, OPCODE_MIN, out, 0, swizzle1(ut, X), swizzle1(state_size, Z)); 
-   
-   release_temp(p, ut);
-}
-
-static void build_tnl_program( struct tnl_program *p )
-{  
-   /* Emit the program, starting with modelviewproject:
-    */
-   build_hpos(p);
-
-   /* Lighting calculations:
-    */
-   if (p->state->fragprog_inputs_read & (FRAG_BIT_COL0|FRAG_BIT_COL1)) {
-      if (p->state->light_global_enabled)
-	 build_lighting(p);
-      else {
-	 if (p->state->fragprog_inputs_read & FRAG_BIT_COL0)
-	    emit_passthrough(p, VERT_ATTRIB_COLOR0, VERT_RESULT_COL0);
-
-	 if (p->state->fragprog_inputs_read & FRAG_BIT_COL1)
-	    emit_passthrough(p, VERT_ATTRIB_COLOR1, VERT_RESULT_COL1);
-      }
-   }
-
-   if ((p->state->fragprog_inputs_read & FRAG_BIT_FOGC) ||
-       p->state->fog_option != FOG_NONE)
-      build_fog(p);
-
-   if (p->state->fragprog_inputs_read & FRAG_BITS_TEX_ANY)
-      build_texture_transform(p);
-
-   if (p->state->point_attenuated)
-      build_pointsize(p);
-
-   /* Finish up:
-    */
-   emit_op1(p, OPCODE_END, undef, 0, undef);
-
-   /* Disassemble:
-    */
-   if (DISASSEM) {
-      _mesa_printf ("\n");
-   }
-}
-
-
-static void build_new_tnl_program( const struct state_key *key,
-				   struct gl_vertex_program *program,
-				   GLuint max_temps)
-{
-   struct tnl_program p;
-
-   _mesa_memset(&p, 0, sizeof(p));
-   p.state = key;
-   p.program = program;
-   p.eye_position = undef;
-   p.eye_position_normalized = undef;
-   p.eye_normal = undef;
-   p.identity = undef;
-   p.temp_in_use = 0;
-   p.nr_instructions = 16;
-   
-   if (max_temps >= sizeof(int) * 8)
-      p.temp_reserved = 0;
-   else
-      p.temp_reserved = ~((1<<max_temps)-1);
-
-   p.program->Base.Instructions = 
-      _mesa_malloc(sizeof(struct prog_instruction) * p.nr_instructions);
-   p.program->Base.String = 0;
-   p.program->Base.NumInstructions =
-   p.program->Base.NumTemporaries =
-   p.program->Base.NumParameters =
-   p.program->Base.NumAttributes = p.program->Base.NumAddressRegs = 0;
-   p.program->Base.Parameters = _mesa_new_parameter_list();
-   p.program->Base.InputsRead = 0;
-   p.program->Base.OutputsWritten = 0;
-
-   build_tnl_program( &p );
-}
-
-static void *search_cache( struct brw_tnl_cache *cache,
-			   GLuint hash,
-			   const void *key,
-			   GLuint keysize)
-{
-   struct brw_tnl_cache_item *c;
-
-   for (c = cache->items[hash % cache->size]; c; c = c->next) {
-      if (c->hash == hash && memcmp(c->key, key, keysize) == 0)
-	 return c->data;
-   }
-
-   return NULL;
-}
-
-static void rehash( struct brw_tnl_cache *cache )
-{
-   struct brw_tnl_cache_item **items;
-   struct brw_tnl_cache_item *c, *next;
-   GLuint size, i;
-
-   size = cache->size * 3;
-   items = (struct brw_tnl_cache_item**) _mesa_malloc(size * sizeof(*items));
-   _mesa_memset(items, 0, size * sizeof(*items));
-
-   for (i = 0; i < cache->size; i++)
-      for (c = cache->items[i]; c; c = next) {
-	 next = c->next;
-	 c->next = items[c->hash % size];
-	 items[c->hash % size] = c;
-      }
-
-   FREE(cache->items);
-   cache->items = items;
-   cache->size = size;
-}
-
-static void cache_item( struct brw_tnl_cache *cache,
-			GLuint hash,
-			const struct state_key *key,
-			void *data )
-{
-   struct brw_tnl_cache_item *c = MALLOC(sizeof(*c));
-   c->hash = hash;
-
-   c->key = malloc(sizeof(*key));
-   memcpy(c->key, key, sizeof(*key));
-
-   c->data = data;
-
-   if (++cache->n_items > cache->size * 1.5)
-      rehash(cache);
-
-   c->next = cache->items[hash % cache->size];
-   cache->items[hash % cache->size] = c;
-}
-
-
-static GLuint hash_key( struct state_key *key )
-{
-   GLuint *ikey = (GLuint *)key;
-   GLuint hash = 0, i;
-
-   /* I'm sure this can be improved on, but speed is important:
-    */
-   for (i = 0; i < sizeof(*key)/sizeof(GLuint); i++)
-      hash += ikey[i];
-
-   return hash;
-}
-
-static void prepare_tnl_program( struct brw_context *brw )
-{
-   GLcontext *ctx = &brw->intel.ctx;
-   struct state_key key;
-   GLuint hash;
-   struct gl_vertex_program *old = brw->tnl_program;
-
-   /* _NEW_PROGRAM */
-   if (brw->attribs.VertexProgram->_Current) 
-      return;
-      
-   /* Grab all the relevent state and put it in a single structure:
-    */
-   make_state_key(ctx, &key);
-   hash = hash_key(&key);
-
-   /* Look for an already-prepared program for this state:
-    */
-   brw->tnl_program = (struct gl_vertex_program *)
-      search_cache( &brw->tnl_program_cache, hash, &key, sizeof(key) );
-   
-   /* OK, we'll have to build a new one:
-    */
-   if (!brw->tnl_program) {
-      brw->tnl_program = (struct gl_vertex_program *)
-	 ctx->Driver.NewProgram(ctx, GL_VERTEX_PROGRAM_ARB, 0); 
-
-      build_new_tnl_program( &key, brw->tnl_program, 
-/* 			     ctx->Const.MaxVertexProgramTemps  */
-			     32
-	 );
-
-      if (ctx->Driver.ProgramStringNotify)
-	 ctx->Driver.ProgramStringNotify( ctx, GL_VERTEX_PROGRAM_ARB, 
-					  &brw->tnl_program->Base );
-
-      cache_item( &brw->tnl_program_cache, 
-		  hash, &key, brw->tnl_program );
-   }
-
-   if (old != brw->tnl_program)
-      brw->state.dirty.brw |= BRW_NEW_TNL_PROGRAM;
-   return;
-}
-
-/* Note: See brw_draw.c - the vertex program must not rely on
- * brw->primitive or brw->reduced_prim.
- */
-const struct brw_tracked_state brw_tnl_vertprog = {
-   .dirty = {
-      .mesa = (_NEW_PROGRAM | 
-	       _NEW_LIGHT | 
-	       _NEW_TRANSFORM | 
-	       _NEW_FOG | 
-	       _NEW_HINT | 
-	       _NEW_POINT | 
-	       _NEW_TEXTURE |
-          _NEW_TEXTURE_MATRIX),
-      .brw = (BRW_NEW_FRAGMENT_PROGRAM | 
-	      BRW_NEW_INPUT_VARYING),
-      .cache = 0
-   },
-   .prepare = prepare_tnl_program
-};
-
-
-
-
 static void prepare_active_vertprog( struct brw_context *brw )
 {
    const struct gl_vertex_program *prev = brw->vertex_program;
 
-   /* NEW_PROGRAM */
-   if (brw->attribs.VertexProgram->_Current) {
-      brw->vertex_program = brw->attribs.VertexProgram->_Current;
-   }
-   else {
-      /* BRW_NEW_TNL_PROGRAM */
-      brw->vertex_program = brw->tnl_program;
-   }
+   brw->vertex_program = brw->attribs.VertexProgram->_Current;
 
    if (brw->vertex_program != prev) 
       brw->state.dirty.brw |= BRW_NEW_VERTEX_PROGRAM;
@@ -1672,37 +52,8 @@ static void prepare_active_vertprog( struct brw_context *brw )
 const struct brw_tracked_state brw_active_vertprog = {
    .dirty = {
       .mesa = _NEW_PROGRAM,
-      .brw = BRW_NEW_TNL_PROGRAM,
+      .brw = 0,
       .cache = 0
    },
    .prepare = prepare_active_vertprog
 };
-
-
-void brw_ProgramCacheInit( GLcontext *ctx )
-{
-   struct brw_context *brw = brw_context(ctx);
-
-   brw->tnl_program_cache.size = 17;
-   brw->tnl_program_cache.n_items = 0;
-   brw->tnl_program_cache.items = (struct brw_tnl_cache_item **)
-      _mesa_calloc(brw->tnl_program_cache.size * 
-		   sizeof(struct brw_tnl_cache_item));
-}
-
-void brw_ProgramCacheDestroy( GLcontext *ctx )
-{
-   struct brw_context *brw = brw_context(ctx);
-   struct brw_tnl_cache_item *c, *next;
-   GLuint i;
-
-   for (i = 0; i < brw->tnl_program_cache.size; i++)
-      for (c = brw->tnl_program_cache.items[i]; c; c = next) {
-	 next = c->next;
-	 FREE(c->key);
-	 FREE(c->data);
-	 FREE(c);
-      }
-
-   FREE(brw->tnl_program_cache.items);
-}
diff --git a/src/mesa/drivers/dri/i965/brw_vtbl.c b/src/mesa/drivers/dri/i965/brw_vtbl.c
index 2a03fc59f3e..a64e437860f 100644
--- a/src/mesa/drivers/dri/i965/brw_vtbl.c
+++ b/src/mesa/drivers/dri/i965/brw_vtbl.c
@@ -51,6 +51,12 @@
 #include "brw_vs.h"
 #include <stdarg.h>
 
+static void
+dri_bo_release(dri_bo **bo)
+{
+   dri_bo_unreference(*bo);
+   *bo = NULL;
+}
 
 /* called from intelDestroyContext()
  */
@@ -58,13 +64,40 @@ static void brw_destroy_context( struct intel_context *intel )
 {
    GLcontext *ctx = &intel->ctx;
    struct brw_context *brw = brw_context(&intel->ctx);
+   int i;
 
    brw_destroy_metaops(brw);
    brw_destroy_state(brw);
    brw_draw_destroy( brw );
 
-   brw_ProgramCacheDestroy( ctx );
    brw_FrameBufferTexDestroy( brw );
+
+   for (i = 0; i < brw->state.nr_draw_regions; i++)
+       intel_region_release(&brw->state.draw_regions[i]);
+   brw->state.nr_draw_regions = 0;
+   intel_region_release(&brw->state.depth_region);
+
+   dri_bo_release(&brw->curbe.curbe_bo);
+   dri_bo_release(&brw->vs.prog_bo);
+   dri_bo_release(&brw->vs.state_bo);
+   dri_bo_release(&brw->gs.prog_bo);
+   dri_bo_release(&brw->gs.state_bo);
+   dri_bo_release(&brw->clip.prog_bo);
+   dri_bo_release(&brw->clip.state_bo);
+   dri_bo_release(&brw->clip.vp_bo);
+   dri_bo_release(&brw->sf.prog_bo);
+   dri_bo_release(&brw->sf.state_bo);
+   dri_bo_release(&brw->sf.vp_bo);
+   for (i = 0; i < BRW_MAX_TEX_UNIT; i++)
+      dri_bo_release(&brw->wm.sdc_bo[i]);
+   dri_bo_release(&brw->wm.bind_bo);
+   for (i = 0; i < BRW_WM_MAX_SURF; i++)
+      dri_bo_release(&brw->wm.surf_bo[i]);
+   dri_bo_release(&brw->wm.prog_bo);
+   dri_bo_release(&brw->wm.state_bo);
+   dri_bo_release(&brw->cc.prog_bo);
+   dri_bo_release(&brw->cc.state_bo);
+   dri_bo_release(&brw->cc.vp_bo);
 }
 
 /* called from intelDrawBuffer()
@@ -87,6 +120,15 @@ static void brw_set_draw_region( struct intel_context *intel,
    brw->state.nr_draw_regions = num_regions;
 }
 
+/* called from intel_batchbuffer_flush and children before sending a
+ * batchbuffer off.
+ */
+static void brw_finish_batch(struct intel_context *intel)
+{
+   struct brw_context *brw = brw_context(&intel->ctx);
+
+   brw_emit_query_end(brw);
+}
 
 /* called from intelFlushBatchLocked
  */
@@ -185,6 +227,7 @@ void brwInitVtbl( struct brw_context *brw )
    brw->intel.vtbl.note_fence = brw_note_fence; 
    brw->intel.vtbl.note_unlock = brw_note_unlock; 
    brw->intel.vtbl.new_batch = brw_new_batch;
+   brw->intel.vtbl.finish_batch = brw_finish_batch;
    brw->intel.vtbl.destroy = brw_destroy_context;
    brw->intel.vtbl.set_draw_region = brw_set_draw_region;
    brw->intel.vtbl.flush_cmd = brw_flush_cmd;
diff --git a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
index e1db31ec08c..f12ef47a7d7 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_sampler_state.c
@@ -229,6 +229,9 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
 	 struct wm_sampler_entry *entry = &key->sampler[unit];
 	 struct gl_texture_unit *texUnit = &brw->attribs.Texture->Unit[unit];
 	 struct gl_texture_object *texObj = texUnit->_Current;
+	 struct intel_texture_object *intelObj = intel_texture_object(texObj);
+	 struct gl_texture_image *firstImage =
+	    texObj->Image[0][intelObj->firstLevel];
 
 	 entry->wrap_r = texObj->WrapR;
 	 entry->wrap_s = texObj->WrapS;
@@ -244,8 +247,22 @@ brw_wm_sampler_populate_key(struct brw_context *brw,
     entry->comparefunc = texObj->CompareFunc;
 
 	 dri_bo_unreference(brw->wm.sdc_bo[unit]);
-	 brw->wm.sdc_bo[unit] = upload_default_color(brw, texObj->BorderColor);
-
+	 if (firstImage->_BaseFormat == GL_DEPTH_COMPONENT) {
+	    float bordercolor[4] = {
+	       texObj->BorderColor[0],
+	       texObj->BorderColor[0],
+	       texObj->BorderColor[0],
+	       texObj->BorderColor[0]
+	    };
+	    /* The GL spec says the border color for depth textures is taken from the
+	     * R channel, while the hardware uses A.  Spam R into all the
+	     * channels for safety.
+	     */
+	    brw->wm.sdc_bo[unit] = upload_default_color(brw, bordercolor);
+	 } else {
+	    brw->wm.sdc_bo[unit] = upload_default_color(brw,
+							texObj->BorderColor);
+	 }
 	 key->sampler_count = unit + 1;
       }
    }
@@ -304,7 +321,7 @@ static void upload_wm_samplers( struct brw_context *brw )
 	    continue;
 
 	 dri_bo_emit_reloc(brw->wm.sampler_bo,
-			   I915_GEM_DOMAIN_INSTRUCTION, 0,
+			   I915_GEM_DOMAIN_SAMPLER, 0,
 			   0,
 			   i * sizeof(struct brw_sampler_state) +
 			   offsetof(struct brw_sampler_state, ss2),
diff --git a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
index 47127c04210..3790b50c976 100644
--- a/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
+++ b/src/mesa/drivers/dri/i965/brw_wm_surface_state.c
@@ -274,7 +274,6 @@ brw_update_texture_surface( GLcontext *ctx, GLuint unit )
    key.width = firstImage->Width;
    key.height = firstImage->Height;
    key.cpp = intelObj->mt->cpp;
-   key.depth = firstImage->Depth;
    key.tiling = intelObj->mt->region->tiling;
 
    dri_bo_unreference(brw->wm.surf_bo[unit + MAX_DRAW_BUFFERS]);
diff --git a/src/mesa/drivers/dri/i965/intel_pixel_draw.c b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
new file mode 120000
index 00000000000..8431a24edfc
--- /dev/null
+++ b/src/mesa/drivers/dri/i965/intel_pixel_draw.c
@@ -0,0 +1 @@
+../intel/intel_pixel_draw.c
+\ No newline at end of file
diff --git a/src/mesa/drivers/dri/intel/intel_buffers.c b/src/mesa/drivers/dri/intel/intel_buffers.c
index defa5b173d9..f5eaf765f38 100644
--- a/src/mesa/drivers/dri/intel/intel_buffers.c
+++ b/src/mesa/drivers/dri/intel/intel_buffers.c
@@ -135,6 +135,14 @@ intel_readbuf_region(struct intel_context *intel)
 static void
 intelSetRenderbufferClipRects(struct intel_context *intel)
 {
+   /* If the batch loops over cliprects and the drawbuffer size no longer
+    * matches fboRect, flush before changing the cliprects it references.
+    */
+   if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+       (intel->fboRect.x2 != intel->ctx.DrawBuffer->Width ||
+	intel->fboRect.y2 != intel->ctx.DrawBuffer->Height))
+      intel_batchbuffer_flush(intel->batch);
+
    assert(intel->ctx.DrawBuffer->Width > 0);
    assert(intel->ctx.DrawBuffer->Height > 0);
    intel->fboRect.x1 = 0;
@@ -160,6 +168,12 @@ intelSetFrontClipRects(struct intel_context *intel)
    if (!dPriv)
       return;
 
+   /* If the batch contents require looping over cliprects, flush them before
+    * we go changing which cliprects get referenced when that happens.
+    */
+   if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+       intel->pClipRects != dPriv->pClipRects)
+      intel_batchbuffer_flush(intel->batch);
    intel->numClipRects = dPriv->numClipRects;
    intel->pClipRects = dPriv->pClipRects;
    intel->drawX = dPriv->x;
@@ -183,6 +197,10 @@ intelSetBackClipRects(struct intel_context *intel)
 
    if (intel_fb->pf_active || dPriv->numBackClipRects == 0) {
       /* use the front clip rects */
+      if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+	  intel->pClipRects != dPriv->pClipRects)
+	 intel_batchbuffer_flush(intel->batch);
+
       intel->numClipRects = dPriv->numClipRects;
       intel->pClipRects = dPriv->pClipRects;
       intel->drawX = dPriv->x;
@@ -190,6 +208,10 @@ intelSetBackClipRects(struct intel_context *intel)
    }
    else {
       /* use the back clip rects */
+      if (intel->batch->cliprect_mode == LOOP_CLIPRECTS &&
+	  intel->pClipRects != dPriv->pBackClipRects)
+	 intel_batchbuffer_flush(intel->batch);
+
       intel->numClipRects = dPriv->numBackClipRects;
       intel->pClipRects = dPriv->pBackClipRects;
       intel->drawX = dPriv->backX;
@@ -900,12 +922,6 @@ intel_draw_buffer(GLcontext * ctx, struct gl_framebuffer *fb)
    if (fb->Name)
       intel_validate_paired_depth_stencil(ctx, fb);
 
-   /* If the batch contents require looping over cliprects, flush them before
-    * we go changing which cliprects get referenced when that happens.
-    */
-   if (intel->batch->cliprect_mode == LOOP_CLIPRECTS)
-      intel_batchbuffer_flush(intel->batch);
-
    /*
     * How many color buffers are we drawing into?
     */
diff --git a/src/mesa/drivers/dri/intel/intel_context.c b/src/mesa/drivers/dri/intel/intel_context.c
index 57e574447a6..2b3a9b9d371 100644
--- a/src/mesa/drivers/dri/intel/intel_context.c
+++ b/src/mesa/drivers/dri/intel/intel_context.c
@@ -68,14 +68,15 @@
 int INTEL_DEBUG = (0);
 #endif
 
-#define need_GL_NV_point_sprite
 #define need_GL_ARB_multisample
+#define need_GL_ARB_occlusion_query
 #define need_GL_ARB_point_parameters
+#define need_GL_ARB_shader_objects
 #define need_GL_ARB_texture_compression
 #define need_GL_ARB_vertex_buffer_object
 #define need_GL_ARB_vertex_program
+#define need_GL_ARB_vertex_shader
 #define need_GL_ARB_window_pos
-#define need_GL_ARB_occlusion_query
 #define need_GL_EXT_blend_color
 #define need_GL_EXT_blend_equation_separate
 #define need_GL_EXT_blend_func_separate
@@ -84,14 +85,13 @@ int INTEL_DEBUG = (0);
 #define need_GL_EXT_fog_coord
 #define need_GL_EXT_framebuffer_object
 #define need_GL_EXT_multi_draw_arrays
+#define need_GL_EXT_point_parameters
 #define need_GL_EXT_secondary_color
-#define need_GL_NV_vertex_program
 #define need_GL_ATI_separate_stencil
-#define need_GL_EXT_point_parameters
+#define need_GL_NV_point_sprite
+#define need_GL_NV_vertex_program
 #define need_GL_VERSION_2_0
 #define need_GL_VERSION_2_1
-#define need_GL_ARB_shader_objects
-#define need_GL_ARB_vertex_shader
 
 #include "extension_helper.h"
 
@@ -344,88 +344,82 @@ intel_viewport(GLcontext *ctx, GLint x, GLint y, GLsizei w, GLsizei h)
  * i965_dri.
  */
 static const struct dri_extension card_extensions[] = {
-   {"GL_ARB_multisample", GL_ARB_multisample_functions},
-   {"GL_ARB_multitexture", NULL},
-   {"GL_ARB_point_parameters", GL_ARB_point_parameters_functions},
-   {"GL_NV_point_sprite", GL_NV_point_sprite_functions},
-   {"GL_ARB_texture_border_clamp", NULL},
-   {"GL_ARB_texture_compression", GL_ARB_texture_compression_functions},
-   {"GL_ARB_texture_cube_map", NULL},
-   {"GL_ARB_texture_env_add", NULL},
-   {"GL_ARB_texture_env_combine", NULL},
-   {"GL_ARB_texture_env_crossbar", NULL},
-   {"GL_ARB_texture_env_dot3", NULL},
-   {"GL_ARB_texture_mirrored_repeat", NULL},
-   {"GL_ARB_texture_non_power_of_two",   NULL },
-   {"GL_ARB_texture_rectangle", NULL},
-   {"GL_NV_texture_rectangle", NULL},
-   {"GL_EXT_texture_rectangle", NULL},
-   {"GL_ARB_point_parameters", NULL}, 
-   {"GL_ARB_vertex_buffer_object", GL_ARB_vertex_buffer_object_functions},
-   {"GL_ARB_vertex_program", GL_ARB_vertex_program_functions},
-   {"GL_ARB_window_pos", GL_ARB_window_pos_functions},
-   {"GL_EXT_blend_color", GL_EXT_blend_color_functions},
-   {"GL_EXT_blend_equation_separate",
-    GL_EXT_blend_equation_separate_functions},
-   {"GL_EXT_blend_func_separate", GL_EXT_blend_func_separate_functions},
-   {"GL_EXT_blend_minmax", GL_EXT_blend_minmax_functions},
-   {"GL_EXT_blend_logic_op", NULL},
-   {"GL_EXT_blend_subtract", NULL},
-   {"GL_EXT_cull_vertex", GL_EXT_cull_vertex_functions},
-   {"GL_EXT_fog_coord", GL_EXT_fog_coord_functions},
-   {"GL_EXT_multi_draw_arrays", GL_EXT_multi_draw_arrays_functions},
-   {"GL_ATI_separate_stencil", GL_ATI_separate_stencil_functions},
-#if 1                           /* XXX FBO temporary? */
-   {"GL_EXT_packed_depth_stencil", NULL},
-#endif
-   {"GL_EXT_secondary_color", GL_EXT_secondary_color_functions},
-   {"GL_EXT_stencil_wrap", NULL},
-   {"GL_EXT_texture_edge_clamp", NULL},
-   {"GL_EXT_texture_env_combine", NULL},
-   {"GL_EXT_texture_env_dot3", NULL},
-   {"GL_EXT_texture_filter_anisotropic", NULL},
-   {"GL_EXT_texture_lod_bias", NULL},
-   {"GL_3DFX_texture_compression_FXT1", NULL},
-   {"GL_APPLE_client_storage", NULL},
-   {"GL_MESA_pack_invert", NULL},
-   {"GL_MESA_ycbcr_texture", NULL},
-   {"GL_NV_blend_square", NULL},
-   {"GL_NV_vertex_program", GL_NV_vertex_program_functions},
-   {"GL_NV_vertex_program1_1", NULL},
-   { "GL_SGIS_generate_mipmap", NULL },
-   {NULL, NULL}
+   { "GL_ARB_multisample",                GL_ARB_multisample_functions },
+   { "GL_ARB_multitexture",               NULL },
+   { "GL_ARB_point_parameters",           GL_ARB_point_parameters_functions },
+   { "GL_ARB_texture_border_clamp",       NULL },
+   { "GL_ARB_texture_compression",        GL_ARB_texture_compression_functions },
+   { "GL_ARB_texture_cube_map",           NULL },
+   { "GL_ARB_texture_env_add",            NULL },
+   { "GL_ARB_texture_env_combine",        NULL },
+   { "GL_ARB_texture_env_crossbar",       NULL },
+   { "GL_ARB_texture_env_dot3",           NULL },
+   { "GL_ARB_texture_mirrored_repeat",    NULL },
+   { "GL_ARB_texture_rectangle",          NULL },
+   { "GL_ARB_vertex_buffer_object",       GL_ARB_vertex_buffer_object_functions },
+   { "GL_ARB_vertex_program",             GL_ARB_vertex_program_functions },
+   { "GL_ARB_window_pos",                 GL_ARB_window_pos_functions },
+   { "GL_EXT_blend_color",                GL_EXT_blend_color_functions },
+   { "GL_EXT_blend_equation_separate",    GL_EXT_blend_equation_separate_functions },
+   { "GL_EXT_blend_func_separate",        GL_EXT_blend_func_separate_functions },
+   { "GL_EXT_blend_minmax",               GL_EXT_blend_minmax_functions },
+   { "GL_EXT_blend_logic_op",             NULL },
+   { "GL_EXT_blend_subtract",             NULL },
+   { "GL_EXT_cull_vertex",                GL_EXT_cull_vertex_functions },
+   { "GL_EXT_fog_coord",                  GL_EXT_fog_coord_functions },
+   { "GL_EXT_multi_draw_arrays",          GL_EXT_multi_draw_arrays_functions },
+   { "GL_EXT_packed_depth_stencil",       NULL },
+   { "GL_EXT_secondary_color",            GL_EXT_secondary_color_functions },
+   { "GL_EXT_stencil_wrap",               NULL },
+   { "GL_EXT_texture_edge_clamp",         NULL },
+   { "GL_EXT_texture_env_combine",        NULL },
+   { "GL_EXT_texture_env_dot3",           NULL },
+   { "GL_EXT_texture_filter_anisotropic", NULL },
+   { "GL_EXT_texture_lod_bias",           NULL },
+   { "GL_3DFX_texture_compression_FXT1",  NULL },
+   { "GL_APPLE_client_storage",           NULL },
+   { "GL_MESA_pack_invert",               NULL },
+   { "GL_MESA_ycbcr_texture",             NULL },
+   { "GL_NV_blend_square",                NULL },
+   { "GL_NV_point_sprite",                GL_NV_point_sprite_functions },
+   { "GL_NV_vertex_program",              GL_NV_vertex_program_functions },
+   { "GL_NV_vertex_program1_1",           NULL },
+   { "GL_SGIS_generate_mipmap",           NULL },
+   { NULL, NULL }
 };
 
 static const struct dri_extension brw_extensions[] = {
-   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions},
-   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions},
-   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions},
-   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions},
-   { "GL_ARB_point_sprite", 		  NULL},
-   { "GL_ARB_fragment_shader",            NULL },
-   { "GL_ARB_draw_buffers",               NULL },
    { "GL_ARB_depth_texture",              NULL },
+   { "GL_ARB_draw_buffers",               NULL },
    { "GL_ARB_fragment_program",           NULL },
+   { "GL_ARB_fragment_program_shadow",    NULL },
+   { "GL_ARB_fragment_shader",            NULL },
+   { "GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions },
+   { "GL_ARB_point_sprite", 		  NULL },
+   { "GL_ARB_shader_objects",             GL_ARB_shader_objects_functions },
+   { "GL_ARB_shading_language_100",       GL_VERSION_2_0_functions },
+#if 0
+   /* Support for GLSL 1.20 is currently broken in core Mesa.
+    */
+   { "GL_ARB_shading_language_120",       GL_VERSION_2_1_functions },
+#endif
    { "GL_ARB_shadow",                     NULL },
+   { "GL_ARB_texture_non_power_of_two",   NULL },
+   { "GL_ARB_vertex_shader",              GL_ARB_vertex_shader_functions },
    { "GL_EXT_shadow_funcs",               NULL },
-   { "GL_ARB_fragment_program_shadow",    NULL },
-   /* ARB extn won't work if not enabled */
-   { "GL_SGIX_depth_texture",             NULL },
-   { "GL_EXT_texture_sRGB",		  NULL},
+   { "GL_EXT_texture_sRGB",		  NULL },
+   { "GL_ATI_separate_stencil",           GL_ATI_separate_stencil_functions },
    { NULL,                                NULL }
 };
 
-#ifdef I915_MMIO_READ
-static const struct dri_extension arb_oc_extensions[] = {
-   {"GL_ARB_occlusion_query",            GL_ARB_occlusion_query_functions},
-   {NULL, NULL}
+static const struct dri_extension arb_oq_extensions[] = {
+   { NULL, NULL }
 };
-#endif
 
 static const struct dri_extension ttm_extensions[] = {
-   {"GL_EXT_framebuffer_object", GL_EXT_framebuffer_object_functions},
-   {"GL_ARB_pixel_buffer_object", NULL},
-   {NULL, NULL}
+   { "GL_ARB_pixel_buffer_object",        NULL },
+   { "GL_EXT_framebuffer_object",         GL_EXT_framebuffer_object_functions },
+   { NULL, NULL }
 };
 
 /**
@@ -445,13 +439,6 @@ void intelInitExtensions(GLcontext *ctx, GLboolean enable_imaging)
    if (intel == NULL || intel->ttm)
       driInitExtensions(ctx, ttm_extensions, GL_FALSE);
 
-#ifdef I915_MMIO_READ
-   if (intel == NULL || 
-       (IS_965(intel->intelScreen->deviceID) && 
-	intel->intelScreen->drmMinor >= 8))
-      driInitExtensions(ctx, arb_oc_extensions, GL_FALSE);
-#endif
-
    if (intel == NULL || IS_965(intel->intelScreen->deviceID))
       driInitExtensions(ctx, brw_extensions, GL_FALSE);
 }
@@ -548,39 +535,6 @@ intelFinish(GLcontext * ctx)
    }
 }
 
-#ifdef I915_MMIO_READ
-static void
-intelBeginQuery(GLcontext *ctx, struct gl_query_object *q)
-{
-	struct intel_context *intel = intel_context( ctx );
-	struct drm_i915_mmio io = {
-		.read_write = I915_MMIO_READ,
-		.reg = MMIO_REGS_PS_DEPTH_COUNT,
-		.data = &q->Result 
-	};
-	intel->stats_wm++;
-	intelFinish(&intel->ctx);
-	drmCommandWrite(intel->driFd, DRM_I915_MMIO, &io, sizeof(io));
-}
-
-static void
-intelEndQuery(GLcontext *ctx, struct gl_query_object *q)
-{
-	struct intel_context *intel = intel_context( ctx );
-	GLuint64EXT tmp;	
-	struct drm_i915_mmio io = {
-		.read_write = I915_MMIO_READ,
-		.reg = MMIO_REGS_PS_DEPTH_COUNT,
-		.data = &tmp
-	};
-	intelFinish(&intel->ctx);
-	drmCommandWrite(intel->driFd, DRM_I915_MMIO, &io, sizeof(io));
-	q->Result = tmp - q->Result;
-	q->Ready = GL_TRUE;
-	intel->stats_wm--;
-}
-#endif
-
 void
 intelInitDriverFunctions(struct dd_function_table *functions)
 {
@@ -597,11 +551,6 @@ intelInitDriverFunctions(struct dd_function_table *functions)
    functions->CopyConvolutionFilter1D = _swrast_CopyConvolutionFilter1D;
    functions->CopyConvolutionFilter2D = _swrast_CopyConvolutionFilter2D;
 
-#ifdef I915_MMIO_READ
-   functions->BeginQuery = intelBeginQuery;
-   functions->EndQuery = intelEndQuery;
-#endif
-
    intelInitTextureFuncs(functions);
    intelInitStateFuncs(functions);
    intelInitBufferFuncs(functions);
@@ -810,7 +759,12 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
       intel->Fallback = 0;      /* don't call _swrast_Flush later */
 
       intel_batchbuffer_free(intel->batch);
+      intel->batch = NULL;
+
       free(intel->prim.vb);
+      intel->prim.vb = NULL;
+      dri_bo_unreference(intel->prim.vb_bo);
+      intel->prim.vb_bo = NULL;
 
       if (release_texture_heaps) {
          /* This share group is about to go away, free our private
@@ -820,6 +774,13 @@ intelDestroyContext(__DRIcontextPrivate * driContextPriv)
             fprintf(stderr, "do something to free texture heaps\n");
       }
 
+      intel_region_release(&intel->front_region);
+      intel_region_release(&intel->back_region);
+      intel_region_release(&intel->third_region);
+      intel_region_release(&intel->depth_region);
+
+      driDestroyOptionCache(&intel->optionCache);
+
       /* free the Mesa context */
       _mesa_free_context_data(&intel->ctx);
    }
diff --git a/src/mesa/drivers/dri/intel/intel_context.h b/src/mesa/drivers/dri/intel/intel_context.h
index 4af4cb9c96f..554159ac441 100644
--- a/src/mesa/drivers/dri/intel/intel_context.h
+++ b/src/mesa/drivers/dri/intel/intel_context.h
@@ -226,7 +226,6 @@ struct intel_context
    GLenum reduced_primitive;
    GLuint vertex_size;
    GLubyte *verts;              /* points to tnl->clipspace.vertex_buf */
-   struct intel_region *draw_region;
 
    /* Fallback rasterization functions 
     */
diff --git a/src/mesa/drivers/dri/intel/intel_pixel.c b/src/mesa/drivers/dri/intel/intel_pixel.c
index f39fac13cf5..b267ffd890c 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel.c
@@ -181,9 +181,9 @@ intelInitPixelFuncs(struct dd_function_table *functions)
    if (!getenv("INTEL_NO_BLIT")) {
       functions->Bitmap = intelBitmap;
       functions->CopyPixels = intelCopyPixels;
+      functions->DrawPixels = intelDrawPixels;
 #ifdef I915
       functions->ReadPixels = intelReadPixels;
-      functions->DrawPixels = intelDrawPixels;
 #endif
    }
 }
diff --git a/src/mesa/drivers/dri/intel/intel_pixel_draw.c b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
index be213e7b961..b60dad74601 100644
--- a/src/mesa/drivers/dri/intel/intel_pixel_draw.c
+++ b/src/mesa/drivers/dri/intel/intel_pixel_draw.c
@@ -51,7 +51,6 @@
 #include "intel_regions.h"
 #include "intel_pixel.h"
 #include "intel_buffer_objects.h"
-#include "intel_tris.h"
 
 
 static GLboolean
diff --git a/src/mesa/drivers/dri/intel/intel_reg.h b/src/mesa/drivers/dri/intel/intel_reg.h
index 96af7e1a030..c21f4080935 100644
--- a/src/mesa/drivers/dri/intel/intel_reg.h
+++ b/src/mesa/drivers/dri/intel/intel_reg.h
@@ -45,6 +45,25 @@
 #define I1_LOAD_S(n)                      (1<<(4+n))
 
 /** @{
+ *
+ * PIPE_CONTROL operation, a combination of MI_FLUSH and register write with
+ * additional flushing control.
+ */
+#define _3DSTATE_PIPE_CONTROL		(CMD_3D | (3 << 27) | (2 << 24) | 2)
+#define PIPE_CONTROL_NO_WRITE		(0 << 14)
+#define PIPE_CONTROL_WRITE_IMMEDIATE	(1 << 14)
+#define PIPE_CONTROL_WRITE_DEPTH_COUNT	(2 << 14)
+#define PIPE_CONTROL_WRITE_TIMESTAMP	(3 << 14)
+#define PIPE_CONTROL_DEPTH_STALL	(1 << 13)
+#define PIPE_CONTROL_WRITE_FLUSH	(1 << 12)
+#define PIPE_CONTROL_INSTRUCTION_FLUSH	(1 << 11)
+#define PIPE_CONTROL_INTERRUPT_ENABLE	(1 << 8)
+#define PIPE_CONTROL_PPGTT_WRITE	(0 << 2)
+#define PIPE_CONTROL_GLOBAL_GTT_WRITE	(1 << 2)
+
+/** @} */
+
+/** @{
  * 915 definitions
  */
 #define S0_VB_OFFSET_MASK		0xffffffc
diff --git a/src/mesa/drivers/dri/intel/intel_regions.c b/src/mesa/drivers/dri/intel/intel_regions.c
index cb0f4ba083b..45faf64c713 100644
--- a/src/mesa/drivers/dri/intel/intel_regions.c
+++ b/src/mesa/drivers/dri/intel/intel_regions.c
@@ -478,6 +478,11 @@ intel_recreate_static(struct intel_context *intel,
    region->pitch = intelScreen->pitch;
    region->height = intelScreen->height;     /* needed? */
 
+   if (region->buffer != NULL) {
+      dri_bo_unreference(region->buffer);
+      region->buffer = NULL;
+   }
+
    if (intel->ttm) {
       assert(region_desc->bo_handle != -1);
       region->buffer = intel_bo_gem_create_from_name(intel->bufmgr,
@@ -486,6 +491,11 @@ intel_recreate_static(struct intel_context *intel,
 
       intel_set_region_tiling_gem(intel, region, region_desc->bo_handle);
    } else {
+      if (region->classic_map != NULL) {
+	 drmUnmap(region->classic_map,
+		  region->pitch * region->cpp * region->height);
+	 region->classic_map = NULL;
+      }
       ret = drmMap(intel->driFd, region_desc->handle,
 		   region->pitch * region->cpp * region->height,
 		   &region->classic_map);
diff --git a/src/mesa/drivers/dri/intel/intel_tex_copy.c b/src/mesa/drivers/dri/intel/intel_tex_copy.c
index a7b88b39c04..f4cb4a781c6 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_copy.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_copy.c
@@ -60,7 +60,7 @@ get_teximage_source(struct intel_context *intel, GLenum internalFormat)
 
    switch (internalFormat) {
    case GL_DEPTH_COMPONENT:
-   case GL_DEPTH_COMPONENT16_ARB:
+   case GL_DEPTH_COMPONENT16:
       irb = intel_get_renderbuffer(intel->ctx.ReadBuffer, BUFFER_DEPTH);
       if (irb && irb->region && irb->region->cpp == 2)
          return irb->region;
diff --git a/src/mesa/drivers/dri/intel/intel_tex_validate.c b/src/mesa/drivers/dri/intel/intel_tex_validate.c
index 3dae738ac2e..820683d42eb 100644
--- a/src/mesa/drivers/dri/intel/intel_tex_validate.c
+++ b/src/mesa/drivers/dri/intel/intel_tex_validate.c
@@ -141,10 +141,7 @@ intel_finalize_mipmap_tree(struct intel_context *intel, GLuint unit)
 
    /* Fallback case:
     */
-   if (firstImage->base.Border ||
-       ((firstImage->base._BaseFormat == GL_DEPTH_COMPONENT) &&
-        ((tObj->WrapS == GL_CLAMP_TO_BORDER) ||
-         (tObj->WrapT == GL_CLAMP_TO_BORDER)))) {
+   if (firstImage->base.Border) {
       if (intelObj->mt) {
          intel_miptree_release(intel, &intelObj->mt);
       }
diff --git a/src/mesa/drivers/dri/r300/r300_context.c b/src/mesa/drivers/dri/r300/r300_context.c
index ee4a69dce36..37436275e34 100644
--- a/src/mesa/drivers/dri/r300/r300_context.c
+++ b/src/mesa/drivers/dri/r300/r300_context.c
@@ -138,7 +138,6 @@ const struct dri_extension card_extensions[] = {
   {"GL_NV_blend_square",		NULL},
   {"GL_NV_vertex_program",		GL_NV_vertex_program_functions},
   {"GL_SGIS_generate_mipmap",		NULL},
-  {"GL_SGIX_depth_texture",		NULL},
   {NULL,				NULL}
   /* *INDENT-ON* */
 };
diff --git a/src/mesa/glapi/glapi.c b/src/mesa/glapi/glapi.c
index 4b23b422233..9b5144a88b6 100644
--- a/src/mesa/glapi/glapi.c
+++ b/src/mesa/glapi/glapi.c
@@ -294,7 +294,31 @@ _glapi_get_context(void)
 #endif
 }
 
+#ifdef USE_X86_ASM
 
+#if defined( GLX_USE_TLS )
+extern       GLubyte gl_dispatch_functions_start[];
+extern       GLubyte gl_dispatch_functions_end[];
+#else
+extern const GLubyte gl_dispatch_functions_start[];
+#endif
+
+#endif /* USE_X86_ASM */
+
+
+#if defined(USE_X64_64_ASM) && defined(GLX_USE_TLS)
+# define DISPATCH_FUNCTION_SIZE  16
+#elif defined(USE_X86_ASM)
+# if defined(THREADS) && !defined(GLX_USE_TLS)
+#  define DISPATCH_FUNCTION_SIZE  32
+# else
+#  define DISPATCH_FUNCTION_SIZE  16
+# endif
+#endif
+
+#if !defined(DISPATCH_FUNCTION_SIZE) && !defined(XFree86Server) && !defined(XGLServer)
+# define NEED_FUNCTION_POINTER
+#endif
 
 #if defined(PTHREADS) || defined(GLX_USE_TLS)
 /**
diff --git a/src/mesa/main/api_exec.c b/src/mesa/main/api_exec.c
index 0c3c9c4de49..bae3bf11cb5 100644
--- a/src/mesa/main/api_exec.c
+++ b/src/mesa/main/api_exec.c
@@ -58,7 +58,7 @@
 #include "colortab.h"
 #endif
 #include "context.h"
-#if FEATURE_convolution
+#if FEATURE_convolve
 #include "convolve.h"
 #endif
 #include "depth.h"
@@ -402,7 +402,7 @@ _mesa_init_exec_table(struct _glapi_table *exec)
    SET_GetColorTableParameteriv(exec, _mesa_GetColorTableParameteriv);
 #endif
 
-#if FEATURE_convolution
+#if FEATURE_convolve
    SET_ConvolutionFilter1D(exec, _mesa_ConvolutionFilter1D);
    SET_ConvolutionFilter2D(exec, _mesa_ConvolutionFilter2D);
    SET_ConvolutionParameterf(exec, _mesa_ConvolutionParameterf);
diff --git a/src/mesa/main/api_loopback.c b/src/mesa/main/api_loopback.c
index 924d7134a27..0e3f5ff9570 100644
--- a/src/mesa/main/api_loopback.c
+++ b/src/mesa/main/api_loopback.c
@@ -31,7 +31,6 @@
 
 #include "glheader.h"
 #include "macros.h"
-#include "colormac.h"
 #include "api_loopback.h"
 #include "mtypes.h"
 #include "glapi/glapi.h"
diff --git a/src/mesa/main/blend.c b/src/mesa/main/blend.c
index 4d4a8971419..39cf6153e28 100644
--- a/src/mesa/main/blend.c
+++ b/src/mesa/main/blend.c
@@ -31,7 +31,6 @@
 
 #include "glheader.h"
 #include "blend.h"
-#include "colormac.h"
 #include "context.h"
 #include "enums.h"
 #include "macros.h"
diff --git a/src/mesa/main/colormac.h b/src/mesa/main/colormac.h
index a19521fc85b..a34bd2ed388 100644
--- a/src/mesa/main/colormac.h
+++ b/src/mesa/main/colormac.h
@@ -1,8 +1,8 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.1
+ * Version:  7.3
  *
- * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -180,20 +180,20 @@ do {						\
  */
 /*@{*/
 
-#define PACK_COLOR_8888( R, G, B, A )					\
-   (((R) << 24) | ((G) << 16) | ((B) << 8) | (A))
+#define PACK_COLOR_8888( X, Y, Z, W ) \
+   (((X) << 24) | ((Y) << 16) | ((Z) << 8) | (W))
 
-#define PACK_COLOR_8888_REV( R, G, B, A )				\
-   (((A) << 24) | ((B) << 16) | ((G) << 8) | (R))
+#define PACK_COLOR_8888_REV( X, Y, Z, W ) \
+   (((W) << 24) | ((Z) << 16) | ((Y) << 8) | (X))
 
-#define PACK_COLOR_888( R, G, B )					\
-   (((R) << 16) | ((G) << 8) | (B))
+#define PACK_COLOR_888( X, Y, Z ) \
+   (((X) << 16) | ((Y) << 8) | (Z))
 
-#define PACK_COLOR_565( R, G, B )					\
-   ((((R) & 0xf8) << 8) | (((G) & 0xfc) << 3) | (((B) & 0xf8) >> 3))
+#define PACK_COLOR_565( X, Y, Z )                                  \
+   ((((X) & 0xf8) << 8) | (((Y) & 0xfc) << 3) | (((Z) & 0xf8) >> 3))
 
-#define PACK_COLOR_565_REV( R, G, B )					\
-   (((R) & 0xf8) | ((G) & 0xe0) >> 5 | (((G) & 0x1c) << 11) | (((B) & 0xf8) << 5))
+#define PACK_COLOR_565_REV( X, Y, Z ) \
+   (((X) & 0xf8) | ((Y) & 0xe0) >> 5 | (((Y) & 0x1c) << 11) | (((Z) & 0xf8) << 5))
 
 #define PACK_COLOR_1555( A, B, G, R )					\
    ((((B) & 0xf8) << 7) | (((G) & 0xf8) << 2) | (((R) & 0xf8) >> 3) |	\
diff --git a/src/mesa/main/depthstencil.c b/src/mesa/main/depthstencil.c
index fb54d6184d9..9d208e2997d 100644
--- a/src/mesa/main/depthstencil.c
+++ b/src/mesa/main/depthstencil.c
@@ -282,8 +282,8 @@ _mesa_new_z24_renderbuffer_wrapper(GLcontext *ctx,
    z24rb->RefCount = 1;
    z24rb->Width = dsrb->Width;
    z24rb->Height = dsrb->Height;
-   z24rb->InternalFormat = GL_DEPTH_COMPONENT24_ARB;
-   z24rb->_ActualFormat = GL_DEPTH_COMPONENT24_ARB;
+   z24rb->InternalFormat = GL_DEPTH_COMPONENT24;
+   z24rb->_ActualFormat = GL_DEPTH_COMPONENT24;
    z24rb->_BaseFormat = GL_DEPTH_COMPONENT;
    z24rb->DataType = GL_UNSIGNED_INT;
    z24rb->DepthBits = 24;
diff --git a/src/mesa/main/descrip.mms b/src/mesa/main/descrip.mms
index 3ef215f47fa..e49ec65d42d 100644
--- a/src/mesa/main/descrip.mms
+++ b/src/mesa/main/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  [email protected]
-# Last revision : 2 October 2007
+# Last revision : 29 September 2008
 
 .first
 	define gl [---.include.gl]
@@ -21,6 +21,7 @@ CFLAGS = /include=($(INCDIR),[])/define=(PTHREADS=1)/name=(as_is,short)/float=ie
 
 SOURCES =accum.c \
 	api_arrayelt.c \
+	api_exec.c \
 	api_loopback.c \
 	api_noop.c \
 	api_validate.c \
@@ -29,6 +30,7 @@ SOURCES =accum.c \
 	blend.c \
 	bufferobj.c \
 	buffers.c \
+	clear.c \
 	clip.c \
 	colortab.c \
 	context.c \
@@ -46,6 +48,7 @@ SOURCES =accum.c \
 	extensions.c \
 	fbobject.c \
 	feedback.c \
+	ffvertex_prog.c \
 	fog.c \
 	framebuffer.c \
 	get.c \
@@ -60,22 +63,29 @@ SOURCES =accum.c \
 	matrix.c \
 	mipmap.c \
 	mm.c \
+	multisample.c \
 	pixel.c \
+	pixelstore.c \
 	points.c \
 	polygon.c \
 	rastpos.c \
 	rbadaptors.c \
+	readpix.c \
 	renderbuffer.c \
+	scissor.c \
 	shaders.c \
 	state.c \
 	stencil.c \
 	texcompress.c \
 	texcompress_fxt1.c \
 	texcompress_s3tc.c \
+	texenv.c \
 	texenvprogram.c \
 	texformat.c \
+	texgen.c \
 	teximage.c \
 	texobj.c \
+	texparam.c \
 	texrender.c \
 	texstate.c \
 	texstore.c \
@@ -86,6 +96,7 @@ SOURCES =accum.c \
 
 OBJECTS=accum.obj,\
 api_arrayelt.obj,\
+api_exec.obj,\
 api_loopback.obj,\
 api_noop.obj,\
 api_validate.obj,\
@@ -94,6 +105,7 @@ attrib.obj,\
 blend.obj,\
 bufferobj.obj,\
 buffers.obj,\
+clear.obj,\
 clip.obj,\
 colortab.obj,\
 context.obj,\
@@ -111,6 +123,7 @@ execmem.obj,\
 extensions.obj,\
 fbobject.obj,\
 feedback.obj,\
+ffvertex_prog.obj,\
 fog.obj,\
 framebuffer.obj,\
 get.obj,\
@@ -125,21 +138,28 @@ lines.obj,\
 matrix.obj,\
 mipmap.obj,\
 mm.obj,\
+multisample.obj,\
 pixel.obj,\
+pixelstore.obj,\
 points.obj,\
 polygon.obj,\
 rastpos.obj,\
+readpix.obj,\
 renderbuffer.obj,\
+scissor.obj,\
 shaders.obj,\
 state.obj,\
 stencil.obj,\
 texcompress.obj,\
 texcompress_fxt1.obj,\
 texcompress_s3tc.obj,\
+texenv.obj,\
 texenvprogram.obj,\
 texformat.obj,\
+texgen.obj,\
 teximage.obj,\
 texobj.obj,\
+texparam.obj,\
 texrender.obj,\
 texstate.obj,\
 texstore.obj,\
@@ -226,3 +246,13 @@ vtxfmt.obj : vtxfmt.c
 shaders.obj : shaders.c
 queryobj.obj : queryobj.c
 rbadaptors.obj : rbadaptors.c
+clear.obj : clear.c
+multisample.obj : multisample.c
+scissor.obj : scissor.c
+texenv.obj : texenv.c
+texgen.obj : texgen.c
+texparam.obj : texparam.c
+readpix.obj : readpix.c
+ffvertex_prog.obj : ffvertex_prog.c
+api_exec.obj : api_exec.c
+pixelstore.obj : pixelstore.c
diff --git a/src/mesa/main/dlist.c b/src/mesa/main/dlist.c
index f7660930a9c..c7db435506e 100644
--- a/src/mesa/main/dlist.c
+++ b/src/mesa/main/dlist.c
@@ -41,7 +41,6 @@
 #endif
 #include "arrayobj.h"
 #include "clip.h"
-#include "colormac.h"
 #include "colortab.h"
 #include "context.h"
 #include "convolve.h"
diff --git a/src/mesa/main/extensions.c b/src/mesa/main/extensions.c
index 8dfbfeb62e4..95bf1165f45 100644
--- a/src/mesa/main/extensions.c
+++ b/src/mesa/main/extensions.c
@@ -165,7 +165,7 @@ static const struct {
    { OFF, "GL_SGIS_texture_border_clamp",      F(ARB_texture_border_clamp) },
    { ON,  "GL_SGIS_texture_edge_clamp",        F(SGIS_texture_edge_clamp) },
    { ON,  "GL_SGIS_texture_lod",               F(SGIS_texture_lod) },
-   { OFF, "GL_SGIX_depth_texture",             F(SGIX_depth_texture) },
+   { OFF, "GL_SGIX_depth_texture",             F(ARB_depth_texture) },
    { OFF, "GL_SGIX_shadow",                    F(SGIX_shadow) },
    { OFF, "GL_SGIX_shadow_ambient",            F(SGIX_shadow_ambient) },
    { OFF, "GL_SUN_multi_draw_arrays",          F(EXT_multi_draw_arrays) },
@@ -292,7 +292,6 @@ _mesa_enable_sw_extensions(GLcontext *ctx)
    ctx->Extensions.SGI_texture_color_table = GL_TRUE;
    ctx->Extensions.SGIS_generate_mipmap = GL_TRUE;
    ctx->Extensions.SGIS_texture_edge_clamp = GL_TRUE;
-   ctx->Extensions.SGIX_depth_texture = GL_TRUE;
    ctx->Extensions.SGIX_shadow = GL_TRUE;
    ctx->Extensions.SGIX_shadow_ambient = GL_TRUE;
 #if FEATURE_ARB_vertex_program || FEATURE_ARB_fragment_program
diff --git a/src/mesa/main/ffvertex_prog.c b/src/mesa/main/ffvertex_prog.c
index 787672be9f1..308b4ef7115 100644
--- a/src/mesa/main/ffvertex_prog.c
+++ b/src/mesa/main/ffvertex_prog.c
@@ -178,12 +178,12 @@ static GLboolean check_active_shininess( GLcontext *ctx,
 
 
 
-static struct state_key *make_state_key( GLcontext *ctx )
+static void make_state_key( GLcontext *ctx, struct state_key *key )
 {
    const struct gl_fragment_program *fp;
-   struct state_key *key = CALLOC_STRUCT(state_key);
    GLuint i;
 
+   memset(key, 0, sizeof(struct state_key));
    fp = ctx->FragmentProgram._Current;
 
    /* This now relies on texenvprogram.c being active:
@@ -301,8 +301,6 @@ static struct state_key *make_state_key( GLcontext *ctx )
 			      texUnit->GenModeQ );
       }
    }
-   
-   return key;
 }
 
 
@@ -1714,16 +1712,16 @@ struct gl_vertex_program *
 _mesa_get_fixed_func_vertex_program(GLcontext *ctx)
 {
    struct gl_vertex_program *prog;
-   struct state_key *key;
+   struct state_key key;
 
    /* Grab all the relevent state and put it in a single structure:
     */
-   key = make_state_key(ctx);
+   make_state_key(ctx, &key);
 
    /* Look for an already-prepared program for this state:
     */
    prog = (struct gl_vertex_program *)
-      _mesa_search_program_cache(ctx->VertexProgram.Cache, key, sizeof(*key));
+      _mesa_search_program_cache(ctx->VertexProgram.Cache, &key, sizeof(key));
    
    if (!prog) {
       /* OK, we'll have to build a new one */
@@ -1735,7 +1733,7 @@ _mesa_get_fixed_func_vertex_program(GLcontext *ctx)
       if (!prog)
          return NULL;
 
-      create_new_program( key, prog,
+      create_new_program( &key, prog,
                           ctx->Const.VertexProgram.MaxTemps );
 
 #if 0
@@ -1744,10 +1742,8 @@ _mesa_get_fixed_func_vertex_program(GLcontext *ctx)
                                           &prog->Base );
 #endif
       _mesa_program_cache_insert(ctx, ctx->VertexProgram.Cache,
-                                 key, sizeof(*key), &prog->Base);
+                                 &key, sizeof(key), &prog->Base);
    }
 
-   _mesa_free(key);
-
    return prog;
 }
diff --git a/src/mesa/main/getstring.c b/src/mesa/main/getstring.c
index 1a82ccce59a..94bf5de1e8b 100644
--- a/src/mesa/main/getstring.c
+++ b/src/mesa/main/getstring.c
@@ -25,7 +25,6 @@
 
 
 #include "glheader.h"
-#include "colormac.h"
 #include "context.h"
 #include "get.h"
 #include "version.h"
diff --git a/src/mesa/main/mfeatures.h b/src/mesa/main/mfeatures.h
index ed78f57edf6..3819da3d680 100644
--- a/src/mesa/main/mfeatures.h
+++ b/src/mesa/main/mfeatures.h
@@ -39,7 +39,7 @@
 #define FEATURE_accum  _HAVE_FULL_GL
 #define FEATURE_attrib_stack  _HAVE_FULL_GL
 #define FEATURE_colortable  _HAVE_FULL_GL
-#define FEATURE_convolution  _HAVE_FULL_GL
+#define FEATURE_convolve  _HAVE_FULL_GL
 #define FEATURE_dispatch  _HAVE_FULL_GL
 #define FEATURE_dlist  _HAVE_FULL_GL
 #define FEATURE_draw_read_buffer  _HAVE_FULL_GL
diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 8e4f6a2e663..052da2c18e1 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -2628,7 +2628,6 @@ struct gl_extensions
    GLboolean SGIS_generate_mipmap;
    GLboolean SGIS_texture_edge_clamp;
    GLboolean SGIS_texture_lod;
-   GLboolean SGIX_depth_texture;
    GLboolean SGIX_shadow;
    GLboolean SGIX_shadow_ambient; /* or GL_ARB_shadow_ambient */
    GLboolean TDFX_texture_compression_FXT1;
@@ -2722,6 +2721,7 @@ struct gl_matrix_stack
 #define _NEW_MULTISAMPLE        0x2000000  /**< __GLcontextRec::Multisample */
 #define _NEW_TRACK_MATRIX       0x4000000  /**< __GLcontextRec::VertexProgram */
 #define _NEW_PROGRAM            0x8000000  /**< __GLcontextRec::VertexProgram */
+#define _NEW_CURRENT_ATTRIB     0x10000000  /**< __GLcontextRec::Current */
 #define _NEW_ALL ~0
 /*@}*/
 
@@ -3049,6 +3049,8 @@ struct __GLcontextRec
    GLenum RenderMode;        /**< either GL_RENDER, GL_SELECT, GL_FEEDBACK */
    GLbitfield NewState;      /**< bitwise-or of _NEW_* flags */
 
+   GLbitfield varying_vp_inputs;  /**< mask of VERT_BIT_* flags */
+
    /** \name Derived state */
    /*@{*/
    /** Bitwise-or of DD_* flags.  Note that this bitfield may be used before
diff --git a/src/mesa/main/queryobj.c b/src/mesa/main/queryobj.c
index a1e32e70ba9..554e0b0d181 100644
--- a/src/mesa/main/queryobj.c
+++ b/src/mesa/main/queryobj.c
@@ -90,12 +90,11 @@ _mesa_wait_query(GLcontext *ctx, struct gl_query_object *q)
 
 
 /**
- * Delete an occlusion query object.
+ * Delete a query object.  Called via ctx->Driver.DeleteQuery().
  * Not removed from hash table here.
- * XXX maybe add Delete() method to gl_query_object class and call that instead
  */
 void
-_mesa_delete_query(struct gl_query_object *q)
+_mesa_delete_query(GLcontext *ctx, struct gl_query_object *q)
 {
    _mesa_free(q);
 }
@@ -546,6 +545,6 @@ delete_queryobj_cb(GLuint id, void *data, void *userData)
 void
 _mesa_free_query_data(GLcontext *ctx)
 {
-   _mesa_HashDeleteAll(ctx->Query.QueryObjects, delete_queryobj_cb, NULL);
+   _mesa_HashDeleteAll(ctx->Query.QueryObjects, delete_queryobj_cb, ctx);
    _mesa_DeleteHashTable(ctx->Query.QueryObjects);
 }
diff --git a/src/mesa/main/queryobj.h b/src/mesa/main/queryobj.h
index c05a1f3da82..9a9774641bb 100644
--- a/src/mesa/main/queryobj.h
+++ b/src/mesa/main/queryobj.h
@@ -37,7 +37,7 @@ extern void
 _mesa_free_query_data(GLcontext *ctx);
 
 extern void
-_mesa_delete_query(struct gl_query_object *q);
+_mesa_delete_query(GLcontext *ctx, struct gl_query_object *q);
 
 extern void
 _mesa_begin_query(GLcontext *ctx, struct gl_query_object *q);
diff --git a/src/mesa/main/shaders.c b/src/mesa/main/shaders.c
index f0db0d2a818..e5c54bb10d7 100644
--- a/src/mesa/main/shaders.c
+++ b/src/mesa/main/shaders.c
@@ -233,13 +233,32 @@ _mesa_GetObjectParameterivARB(GLhandleARB object, GLenum pname, GLint *params)
    GET_CURRENT_CONTEXT(ctx);
    /* Implement in terms of GetProgramiv, GetShaderiv */
    if (ctx->Driver.IsProgram(ctx, object)) {
-      ctx->Driver.GetProgramiv(ctx, object, pname, params);
+      if (pname == GL_OBJECT_TYPE_ARB) {
+	 *params = GL_PROGRAM_OBJECT_ARB;
+      } else {
+	 ctx->Driver.GetProgramiv(ctx, object, pname, params);
+      }
    }
    else if (ctx->Driver.IsShader(ctx, object)) {
-      ctx->Driver.GetShaderiv(ctx, object, pname, params);
+      if (pname == GL_OBJECT_TYPE_ARB) {
+	 *params = GL_SHADER_OBJECT_ARB;
+      } else {
+	 ctx->Driver.GetShaderiv(ctx, object, pname, params);
+      }
    }
    else {
-      _mesa_error(ctx, GL_INVALID_OPERATION, "glGetObjectParameterivARB");
+      /* error code depends on pname */
+      GLenum err;
+      switch (pname) {
+      case GL_OBJECT_TYPE_ARB:
+      case GL_OBJECT_DELETE_STATUS_ARB:
+      case GL_OBJECT_INFO_LOG_LENGTH_ARB:
+         err = GL_INVALID_OPERATION;
+         break;
+      default:
+         err = GL_INVALID_VALUE;
+      }
+      _mesa_error(ctx, err, "glGetObjectParameterivARB");
    }
 }
 
diff --git a/src/mesa/main/state.c b/src/mesa/main/state.c
index 5913019bc12..2f0e7cc368e 100644
--- a/src/mesa/main/state.c
+++ b/src/mesa/main/state.c
@@ -258,12 +258,6 @@ update_program(GLcontext *ctx)
       }
    }
 
-   if (ctx->VertexProgram._Current)
-      assert(ctx->VertexProgram._Current->Base.Parameters);
-   if (ctx->FragmentProgram._Current)
-      assert(ctx->FragmentProgram._Current->Base.Parameters);
-
-
    /* XXX: get rid of _Active flag.
     */
 #if 1
@@ -447,6 +441,9 @@ _mesa_update_state_locked( GLcontext *ctx )
    GLbitfield new_state = ctx->NewState;
    GLbitfield prog_flags = _NEW_PROGRAM;
 
+   if (new_state == _NEW_CURRENT_ATTRIB) 
+      goto out;
+
    if (MESA_VERBOSE & VERBOSE_STATE)
       _mesa_print_state("_mesa_update_state", new_state);
 
@@ -510,7 +507,8 @@ _mesa_update_state_locked( GLcontext *ctx )
       _mesa_update_tnl_spaces( ctx, new_state );
 
    if (ctx->FragmentProgram._MaintainTexEnvProgram) {
-      prog_flags |= (_NEW_TEXTURE | _NEW_FOG | _DD_NEW_SEPARATE_SPECULAR);
+      prog_flags |= (_NEW_ARRAY | _NEW_TEXTURE_MATRIX | _NEW_LIGHT |
+                     _NEW_TEXTURE | _NEW_FOG | _DD_NEW_SEPARATE_SPECULAR);
    }
    if (ctx->VertexProgram._MaintainTnlProgram) {
       prog_flags |= (_NEW_ARRAY | _NEW_TEXTURE | _NEW_TEXTURE_MATRIX |
@@ -532,6 +530,7 @@ _mesa_update_state_locked( GLcontext *ctx )
     * Set ctx->NewState to zero to avoid recursion if
     * Driver.UpdateState() has to call FLUSH_VERTICES().  (fixed?)
     */
+ out:
    new_state = ctx->NewState;
    ctx->NewState = 0;
    ctx->Driver.UpdateState(ctx, new_state);
@@ -548,3 +547,38 @@ _mesa_update_state( GLcontext *ctx )
    _mesa_update_state_locked(ctx);
    _mesa_unlock_context_textures(ctx);
 }
+
+
+
+
+/**
+ * Record the varying vertex program inputs; flags _NEW_ARRAY on change.
+ *
+ * We want to figure out which fragment program inputs are actually
+ * constant/current values from ctx->Current.  These should be
+ * referenced as a tracked state variable rather than a fragment
+ * program input, to save the overhead of putting a constant value in
+ * every submitted vertex, transferring it to hardware, interpolating
+ * it across the triangle, etc...
+ *
+ * When there is a VP bound, just use vp->outputs.  But when we're
+ * generating the vp from fixed function state, we want to calculate:
+ *
+ * vp_out_2_fp_in( vp_in_2_vp_out( varying_inputs ) |
+ *                 potential_vp_outputs )
+ *
+ * where potential_vp_outputs is calculated by looking at enabled
+ * texgen, etc.  The generated fragment program should then only declare
+ * inputs that may vary or otherwise differ from the ctx->Current values,
+ * tracking the others as state values instead.
+ */
+void
+_mesa_set_varying_vp_inputs( GLcontext *ctx,
+                             GLbitfield varying_inputs )
+{
+   if (ctx->varying_vp_inputs != varying_inputs) {
+      ctx->varying_vp_inputs = varying_inputs;
+      ctx->NewState |= _NEW_ARRAY;
+      /* _NEW_ARRAY may be added to prog_flags in _mesa_update_state_locked() */
+   }
+}
diff --git a/src/mesa/main/state.h b/src/mesa/main/state.h
index bb7cb8f32a3..79f2f6beb0c 100644
--- a/src/mesa/main/state.h
+++ b/src/mesa/main/state.h
@@ -37,5 +37,8 @@ _mesa_update_state( GLcontext *ctx );
 extern void
 _mesa_update_state_locked( GLcontext *ctx );
 
+void
+_mesa_set_varying_vp_inputs( GLcontext *ctx,
+                             GLbitfield varying_inputs );
 
 #endif
diff --git a/src/mesa/main/texenvprogram.c b/src/mesa/main/texenvprogram.c
index 2bce93eef1d..c23173014de 100644
--- a/src/mesa/main/texenvprogram.c
+++ b/src/mesa/main/texenvprogram.c
@@ -55,15 +55,17 @@ struct texenvprog_cache_item
 #define DISASSEM (MESA_VERBOSE & VERBOSE_DISASSEM)
 
 struct mode_opt {
-   GLuint Source:4;
-   GLuint Operand:3;
+   GLubyte Source:4;
+   GLubyte Operand:3;
 };
 
 struct state_key {
-   GLbitfield enabled_units;
+   GLuint nr_enabled_units:8;
+   GLuint enabled_units:8;
    GLuint separate_specular:1;
    GLuint fog_enabled:1;
    GLuint fog_mode:2;
+   GLuint inputs_available:12;
 
    struct {
       GLuint enabled:1;
@@ -74,10 +76,10 @@ struct state_key {
 
       GLuint NumArgsRGB:2;
       GLuint ModeRGB:4;
-      struct mode_opt OptRGB[3];
-
       GLuint NumArgsA:2;
       GLuint ModeA:4;
+
+      struct mode_opt OptRGB[3];
       struct mode_opt OptA[3];
    } unit[8];
 };
@@ -199,6 +201,66 @@ static GLuint translate_tex_src_bit( GLbitfield bit )
    }
 }
 
+#define VERT_BIT_TEX_ANY    (0xff << VERT_ATTRIB_TEX0)
+#define VERT_RESULT_TEX_ANY (0xff << VERT_RESULT_TEX0)
+
+/**
+ * Identify all possible varying inputs.  The fragment program will
+ * never reference non-varying inputs, but will track them via state
+ * constants instead.
+ *
+ * \return bitmask of FRAG_BIT_x flags for every input the fragment
+ * program has access to.  make_state_key() later reduces it to just
+ * those which are actually referenced.
+ */
+static GLbitfield get_fp_input_mask( GLcontext *ctx )
+{
+   GLbitfield fp_inputs = 0x0;
+
+   if (!ctx->VertexProgram._Enabled ||
+       !ctx->VertexProgram._Current) {
+
+      /* No vertex program enabled or current: fixed function logic */
+      GLbitfield varying_inputs = ctx->varying_vp_inputs;
+
+      /* First look at what values may be computed by the generated
+       * vertex program (lighting, texgen, texture matrices):
+       */
+      if (ctx->Light.Enabled) {
+         fp_inputs |= FRAG_BIT_COL0;
+
+         if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR)
+            fp_inputs |= FRAG_BIT_COL1;
+      }
+
+      fp_inputs |= (ctx->Texture._TexGenEnabled |
+                    ctx->Texture._TexMatEnabled) << FRAG_ATTRIB_TEX0;
+
+      /* Then look at what might be varying as a result of enabled
+       * arrays, etc (color and texcoord bits of varying_vp_inputs):
+       */
+      if (varying_inputs & VERT_BIT_COLOR0) fp_inputs |= FRAG_BIT_COL0;
+      if (varying_inputs & VERT_BIT_COLOR1) fp_inputs |= FRAG_BIT_COL1;
+
+      fp_inputs |= (((varying_inputs & VERT_BIT_TEX_ANY) >> VERT_ATTRIB_TEX0) 
+                    << FRAG_ATTRIB_TEX0);
+
+   }
+   else {
+      /* Vertex program in effect: calculate from its OutputsWritten mask */
+      GLbitfield vp_outputs = ctx->VertexProgram._Current->Base.OutputsWritten;
+
+      if (vp_outputs & (1 << VERT_RESULT_COL0)) fp_inputs |= FRAG_BIT_COL0;
+      if (vp_outputs & (1 << VERT_RESULT_COL1)) fp_inputs |= FRAG_BIT_COL1;
+
+      fp_inputs |= (((vp_outputs & VERT_RESULT_TEX_ANY) >> VERT_RESULT_TEX0) 
+                    << FRAG_ATTRIB_TEX0);
+   }
+   
+   return fp_inputs;
+}
+
+
 /**
  * Examine current texture environment state and generate a unique
  * key to identify it.
@@ -206,7 +268,9 @@ static GLuint translate_tex_src_bit( GLbitfield bit )
 static void make_state_key( GLcontext *ctx,  struct state_key *key )
 {
    GLuint i, j;
-	
+   GLbitfield inputs_referenced = FRAG_BIT_COL0;
+   GLbitfield inputs_available = get_fp_input_mask( ctx );
+
    memset(key, 0, sizeof(*key));
 
    for (i=0;i<MAX_TEXTURE_UNITS;i++) {
@@ -217,6 +281,8 @@ static void make_state_key( GLcontext *ctx,  struct state_key *key )
 
       key->unit[i].enabled = 1;
       key->enabled_units |= (1<<i);
+      key->nr_enabled_units = i+1;
+      inputs_referenced |= FRAG_BIT_TEX(i);
 
       key->unit[i].source_index = 
 	 translate_tex_src_bit(texUnit->_ReallyEnabled);		
@@ -245,16 +311,22 @@ static void make_state_key( GLcontext *ctx,  struct state_key *key )
       }
    }
 	
-   if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR)
+   if (ctx->_TriangleCaps & DD_SEPARATE_SPECULAR) {
       key->separate_specular = 1;
+      inputs_referenced |= FRAG_BIT_COL1;
+   }
 
    if (ctx->Fog.Enabled) {
       key->fog_enabled = 1;
       key->fog_mode = translate_fog_mode(ctx->Fog.Mode);
+      inputs_referenced |= FRAG_BIT_FOGC; /* maybe */
    }
+
+   key->inputs_available = (inputs_available & inputs_referenced);
 }
 
-/* Use uregs to represent registers internally, translate to Mesa's
+/**
+ * Use uregs to represent registers internally, translate to Mesa's
  * expected formats on emit.  
  *
  * NOTE: These are passed by value extensively in this file rather
@@ -287,16 +359,16 @@ static const struct ureg undef = {
 };
 
 
-/* State used to build the fragment program:
+/** State used to build the fragment program:
  */
 struct texenv_fragment_program {
    struct gl_fragment_program *program;
    GLcontext *ctx;
    struct state_key *state;
 
-   GLbitfield alu_temps;	/* Track texture indirections, see spec. */
-   GLbitfield temps_output;	/* Track texture indirections, see spec. */
-   GLbitfield temp_in_use;	/* Tracks temporary regs which are in use. */
+   GLbitfield alu_temps;	/**< Track texture indirections, see spec. */
+   GLbitfield temps_output;	/**< Track texture indirections, see spec. */
+   GLbitfield temp_in_use;	/**< Tracks temporary regs which are in use. */
    GLboolean error;
 
    struct ureg src_texture[MAX_TEXTURE_UNITS];   
@@ -304,11 +376,11 @@ struct texenv_fragment_program {
     * else undef.
     */
 
-   struct ureg src_previous;	/* Reg containing color from previous 
+   struct ureg src_previous;	/**< Reg containing color from previous 
 				 * stage.  May need to be decl'd.
 				 */
 
-   GLuint last_tex_stage;	/* Number of last enabled texture unit */
+   GLuint last_tex_stage;	/**< Number of last enabled texture unit */
 
    struct ureg half;
    struct ureg one;
@@ -411,6 +483,14 @@ static struct ureg get_tex_temp( struct texenv_fragment_program *p )
 }
 
 
+/** Mark a temp reg as being no longer allocatable. */
+static void reserve_temp( struct texenv_fragment_program *p, struct ureg r )
+{
+   if (r.file == PROGRAM_TEMPORARY)
+      p->temps_output |= (1 << r.idx);
+}
+
+
 static void release_temps(GLcontext *ctx, struct texenv_fragment_program *p )
 {
    GLuint max_temp = ctx->Const.FragmentProgram.MaxTemps;
@@ -449,11 +529,29 @@ static struct ureg register_param5( struct texenv_fragment_program *p,
 #define register_param3(p,s0,s1,s2)    register_param5(p,s0,s1,s2,0,0)
 #define register_param4(p,s0,s1,s2,s3) register_param5(p,s0,s1,s2,s3,0)
 
+static GLuint frag_to_vert_attrib( GLuint attrib )
+{
+   switch (attrib) {
+   case FRAG_ATTRIB_COL0: return VERT_ATTRIB_COLOR0;
+   case FRAG_ATTRIB_COL1: return VERT_ATTRIB_COLOR1;
+   default:
+      assert(attrib >= FRAG_ATTRIB_TEX0);
+      assert(attrib <= FRAG_ATTRIB_TEX7);
+      return attrib - FRAG_ATTRIB_TEX0 + VERT_ATTRIB_TEX0;
+   }
+}
+
 
 static struct ureg register_input( struct texenv_fragment_program *p, GLuint input )
 {
-   p->program->Base.InputsRead |= (1 << input);
-   return make_ureg(PROGRAM_INPUT, input);
+   if (p->state->inputs_available & (1<<input)) {
+      p->program->Base.InputsRead |= (1 << input);
+      return make_ureg(PROGRAM_INPUT, input);
+   }
+   else {
+      GLuint idx = frag_to_vert_attrib( input );
+      return register_param3( p, STATE_INTERNAL, STATE_CURRENT_ATTRIB, idx );
+   }
 }
 
 
@@ -504,10 +602,12 @@ emit_op(struct texenv_fragment_program *p,
 
    emit_dst( &inst->DstReg, dest, mask );
 
+#if 0
    /* Accounting for indirection tracking:
     */
    if (dest.file == PROGRAM_TEMPORARY)
       p->temps_output |= 1 << dest.idx;
+#endif
 
    return inst;
 }
@@ -562,6 +662,10 @@ static struct ureg emit_texld( struct texenv_fragment_program *p,
 
    p->program->Base.NumTexInstructions++;
 
+   /* Accounting for indirection tracking:
+    */
+   reserve_temp(p, dest);
+
    /* Is this a texture indirection?
     */
    if ((coord.file == PROGRAM_TEMPORARY &&
@@ -1079,6 +1183,7 @@ create_new_program(GLcontext *ctx, struct state_key *key,
       for (unit = 0 ; unit < ctx->Const.MaxTextureUnits; unit++)
 	 if (key->enabled_units & (1<<unit)) {
 	    p.src_previous = emit_texenv( &p, unit );
+            reserve_temp(&p, p.src_previous); /* don't re-use this temp reg */
 	    release_temps(ctx, &p);	/* release all temps */
 	 }
    }
diff --git a/src/mesa/main/texformat.c b/src/mesa/main/texformat.c
index 60f36c4a87f..ce2772c2992 100644
--- a/src/mesa/main/texformat.c
+++ b/src/mesa/main/texformat.c
@@ -1420,14 +1420,13 @@ _mesa_choose_tex_format( GLcontext *ctx, GLint internalFormat,
          ; /* fallthrough */
    }
 
-   if (ctx->Extensions.SGIX_depth_texture ||
-       ctx->Extensions.ARB_depth_texture) {
+   if (ctx->Extensions.ARB_depth_texture) {
       switch (internalFormat) {
          case GL_DEPTH_COMPONENT:
-         case GL_DEPTH_COMPONENT24_SGIX:
-         case GL_DEPTH_COMPONENT32_SGIX:
+         case GL_DEPTH_COMPONENT24:
+         case GL_DEPTH_COMPONENT32:
             return &_mesa_texformat_z32;
-         case GL_DEPTH_COMPONENT16_SGIX:
+         case GL_DEPTH_COMPONENT16:
             return &_mesa_texformat_z16;
          default:
             ; /* fallthrough */
diff --git a/src/mesa/main/teximage.c b/src/mesa/main/teximage.c
index 91c41f38c30..bf23a1f290d 100644
--- a/src/mesa/main/teximage.c
+++ b/src/mesa/main/teximage.c
@@ -241,13 +241,12 @@ _mesa_base_tex_format( GLcontext *ctx, GLint internalFormat )
       }
    }
 
-   if (ctx->Extensions.SGIX_depth_texture ||
-       ctx->Extensions.ARB_depth_texture) {
+   if (ctx->Extensions.ARB_depth_texture) {
       switch (internalFormat) {
          case GL_DEPTH_COMPONENT:
-         case GL_DEPTH_COMPONENT16_SGIX:
-         case GL_DEPTH_COMPONENT24_SGIX:
-         case GL_DEPTH_COMPONENT32_SGIX:
+         case GL_DEPTH_COMPONENT16:
+         case GL_DEPTH_COMPONENT24:
+         case GL_DEPTH_COMPONENT32:
             return GL_DEPTH_COMPONENT;
          default:
             ; /* fallthrough */
@@ -526,9 +525,9 @@ static GLboolean
 is_depth_format(GLenum format)
 {
    switch (format) {
-      case GL_DEPTH_COMPONENT16_ARB:
-      case GL_DEPTH_COMPONENT24_ARB:
-      case GL_DEPTH_COMPONENT32_ARB:
+      case GL_DEPTH_COMPONENT16:
+      case GL_DEPTH_COMPONENT24:
+      case GL_DEPTH_COMPONENT32:
       case GL_DEPTH_COMPONENT:
          return GL_TRUE;
       default:
@@ -2308,8 +2307,7 @@ _mesa_GetTexImage( GLenum target, GLint level, GLenum format,
       return;
    }
 
-   if (!ctx->Extensions.SGIX_depth_texture &&
-       !ctx->Extensions.ARB_depth_texture && is_depth_format(format)) {
+   if (!ctx->Extensions.ARB_depth_texture && is_depth_format(format)) {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetTexImage(format)");
       return;
    }
diff --git a/src/mesa/main/texparam.c b/src/mesa/main/texparam.c
index 664adadfb9c..a9e752a6371 100644
--- a/src/mesa/main/texparam.c
+++ b/src/mesa/main/texparam.c
@@ -642,8 +642,7 @@ _mesa_GetTexLevelParameteriv( GLenum target, GLint level,
             *params = 0;
          break;
       case GL_TEXTURE_DEPTH_SIZE_ARB:
-         if (ctx->Extensions.SGIX_depth_texture ||
-             ctx->Extensions.ARB_depth_texture)
+         if (ctx->Extensions.ARB_depth_texture)
             *params = img->TexFormat->DepthBits;
          else
             _mesa_error(ctx, GL_INVALID_ENUM,
@@ -903,9 +902,9 @@ _mesa_GetTexParameterfv( GLenum target, GLenum pname, GLfloat *params )
 #ifdef FEATURE_OES_draw_texture
       case GL_TEXTURE_CROP_RECT_OES:
          params[0] = obj->CropRect[0];
-         params[0] = obj->CropRect[1];
-         params[0] = obj->CropRect[2];
-         params[0] = obj->CropRect[3];
+         params[1] = obj->CropRect[1];
+         params[2] = obj->CropRect[2];
+         params[3] = obj->CropRect[3];
          break;
 #endif
       default:
@@ -1053,9 +1052,9 @@ _mesa_GetTexParameteriv( GLenum target, GLenum pname, GLint *params )
 #ifdef FEATURE_OES_draw_texture
       case GL_TEXTURE_CROP_RECT_OES:
          params[0] = obj->CropRect[0];
-         params[0] = obj->CropRect[1];
-         params[0] = obj->CropRect[2];
-         params[0] = obj->CropRect[3];
+         params[1] = obj->CropRect[1];
+         params[2] = obj->CropRect[2];
+         params[3] = obj->CropRect[3];
          break;
 #endif
       default:
diff --git a/src/mesa/main/texstore.c b/src/mesa/main/texstore.c
index 76b785ab0e0..3639a914c4c 100644
--- a/src/mesa/main/texstore.c
+++ b/src/mesa/main/texstore.c
@@ -1,6 +1,6 @@
 /*
  * Mesa 3-D graphics library
- * Version:  7.1
+ * Version:  7.3
  *
  * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
@@ -1536,10 +1536,10 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
          for (row = 0; row < srcHeight; row++) {
             GLuint *d4 = (GLuint *) dstRow;
             for (col = 0; col < srcWidth; col++) {
-               d4[col] = ((0xff                    << 24) |
-                          (srcRow[col * 3 + RCOMP] << 16) |
-                          (srcRow[col * 3 + GCOMP] <<  8) |
-                          (srcRow[col * 3 + BCOMP] <<  0));
+               d4[col] = PACK_COLOR_8888(0xff,
+                                         srcRow[col * 3 + RCOMP],
+                                         srcRow[col * 3 + GCOMP],
+                                         srcRow[col * 3 + BCOMP]);
             }
             dstRow += dstRowStride;
             srcRow += srcRowStride;
@@ -1551,8 +1551,7 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
 	    dstFormat == &_mesa_texformat_argb8888 &&
             srcFormat == GL_RGBA &&
 	    baseInternalFormat == GL_RGBA &&
-            srcType == GL_UNSIGNED_BYTE &&
-            littleEndian) {
+            srcType == GL_UNSIGNED_BYTE) {
       /* same as above case, but src data has alpha too */
       GLint img, row, col;
       /* For some reason, streaming copies to write-combined regions
@@ -1573,39 +1572,10 @@ _mesa_texstore_argb8888(TEXSTORE_PARAMS)
          for (row = 0; row < srcHeight; row++) {
             GLuint *d4 = (GLuint *) dstRow;
             for (col = 0; col < srcWidth; col++) {
-               d4[col] = ((srcRow[col * 4 + ACOMP] << 24) |
-                          (srcRow[col * 4 + RCOMP] << 16) |
-                          (srcRow[col * 4 + GCOMP] <<  8) |
-                          (srcRow[col * 4 + BCOMP] <<  0));
-            }
-            dstRow += dstRowStride;
-            srcRow += srcRowStride;
-         }
-      }
-   }
-   else if (!ctx->_ImageTransferState &&
-            !srcPacking->SwapBytes &&
-	    dstFormat == &_mesa_texformat_argb8888 &&
-            srcFormat == GL_RGBA &&
-	    baseInternalFormat == GL_RGBA &&
-            srcType == GL_UNSIGNED_BYTE) {
-
-      GLint img, row, col;
-      for (img = 0; img < srcDepth; img++) {
-         const GLint srcRowStride = _mesa_image_row_stride(srcPacking,
-                                                 srcWidth, srcFormat, srcType);
-         GLubyte *srcRow = (GLubyte *) _mesa_image_address(dims, srcPacking,
-                  srcAddr, srcWidth, srcHeight, srcFormat, srcType, img, 0, 0);
-         GLubyte *dstRow = (GLubyte *) dstAddr
-            + dstImageOffsets[dstZoffset + img] * dstFormat->TexelBytes
-            + dstYoffset * dstRowStride
-            + dstXoffset * dstFormat->TexelBytes;
-         for (row = 0; row < srcHeight; row++) {
-            for (col = 0; col < srcWidth; col++) {
-               dstRow[col * 4 + 0] = srcRow[col * 4 + BCOMP];
-               dstRow[col * 4 + 1] = srcRow[col * 4 + GCOMP];
-               dstRow[col * 4 + 2] = srcRow[col * 4 + RCOMP];
-               dstRow[col * 4 + 3] = srcRow[col * 4 + ACOMP];
+               d4[col] = PACK_COLOR_8888(srcRow[col * 4 + ACOMP],
+                                         srcRow[col * 4 + RCOMP],
+                                         srcRow[col * 4 + GCOMP],
+                                         srcRow[col * 4 + BCOMP]);
             }
             dstRow += dstRowStride;
             srcRow += srcRowStride;
diff --git a/src/mesa/shader/arbprogparse.c b/src/mesa/shader/arbprogparse.c
index f499499eb3a..536404bf978 100644
--- a/src/mesa/shader/arbprogparse.c
+++ b/src/mesa/shader/arbprogparse.c
@@ -64,12 +64,6 @@ having three separate program parameter arrays.
 #include "prog_statevars.h"
 #include "prog_instruction.h"
 
-
-/* For ARB programs, use the NV instruction limits */
-#define MAX_INSTRUCTIONS MAX2(MAX_NV_FRAGMENT_PROGRAM_INSTRUCTIONS, \
-                              MAX_NV_VERTEX_PROGRAM_INSTRUCTIONS)
-
-
 /**
  * This is basically a union of the vertex_program and fragment_program
  * structs that we can use to parse the program into
@@ -2609,7 +2603,7 @@ parse_src_reg (GLcontext * ctx, const GLubyte ** inst,
       /* If we're referencing the Program->Parameters[] array, check if the
        * parameter is really a constant/literal.  If so, set File to CONSTANT.
        */
-      assert(*Index < Program->Base.Parameters->NumParameters);
+      assert(*Index < (GLint) Program->Base.Parameters->NumParameters);
       file = Program->Base.Parameters->Parameters[*Index].Type;
       if (file == PROGRAM_CONSTANT)
          *File = PROGRAM_CONSTANT;
@@ -3443,7 +3437,7 @@ parse_instructions(GLcontext * ctx, const GLubyte * inst,
       : ctx->Const.VertexProgram.MaxInstructions;
    GLint err = 0;
 
-   ASSERT(MAX_INSTRUCTIONS >= maxInst);
+   ASSERT(MAX_PROGRAM_INSTRUCTIONS >= maxInst);
 
    Program->MajorVersion = (GLuint) * inst++;
    Program->MinorVersion = (GLuint) * inst++;
@@ -3798,7 +3792,7 @@ _mesa_parse_arb_program(GLcontext *ctx, GLenum target,
 
    /* Initialize the arb_program struct */
    program->Base.String = strz;
-   program->Base.Instructions = _mesa_alloc_instructions(MAX_INSTRUCTIONS);
+   program->Base.Instructions = _mesa_alloc_instructions(MAX_PROGRAM_INSTRUCTIONS);
    program->Base.NumInstructions =
    program->Base.NumTemporaries =
    program->Base.NumParameters =
@@ -3843,12 +3837,12 @@ _mesa_parse_arb_program(GLcontext *ctx, GLenum target,
 
    _mesa_free (parsed);
 
-   /* Reallocate the instruction array from size [MAX_INSTRUCTIONS]
+   /* Reallocate the instruction array from size [MAX_PROGRAM_INSTRUCTIONS]
     * to size [ap.Base.NumInstructions].
     */
    program->Base.Instructions
       = _mesa_realloc_instructions(program->Base.Instructions,
-                                   MAX_INSTRUCTIONS,
+                                   MAX_PROGRAM_INSTRUCTIONS,
                                    program->Base.NumInstructions);
 
    return !err;
@@ -3901,6 +3895,9 @@ _mesa_parse_arb_fragment_program(GLcontext* ctx, GLenum target,
    program->FogOption          = ap.FogOption;
    program->UsesKill          = ap.UsesKill;
 
+   if (program->FogOption)
+      program->Base.InputsRead |= FRAG_BIT_FOGC;
+      
    if (program->Base.Instructions)
       _mesa_free(program->Base.Instructions);
    program->Base.Instructions = ap.Base.Instructions;
diff --git a/src/mesa/shader/descrip.mms b/src/mesa/shader/descrip.mms
index bdac946efe2..19bafd48302 100644
--- a/src/mesa/shader/descrip.mms
+++ b/src/mesa/shader/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  [email protected]
-# Last revision : 27 May 2008
+# Last revision : 29 September 2008
 .first
 	define gl [---.include.gl]
 	define math [-.math]
@@ -34,6 +34,7 @@ SOURCES = \
 	prog_instruction.c \
 	prog_parameter.c \
 	prog_print.c \
+	prog_cache.c \
 	prog_statevars.c \
 	shader_api.c prog_uniform.c
 
@@ -52,7 +53,7 @@ OBJECTS = \
 	prog_parameter.obj,\
 	prog_print.obj,\
 	prog_statevars.obj,\
-	shader_api.obj,prog_uniform.obj
+	shader_api.obj,prog_uniform.obj,prog_cache.obj
 
 ##### RULES #####
 
@@ -91,3 +92,4 @@ prog_print.obj : prog_print.c
 prog_statevars.obj : prog_statevars.c
 shader_api.obj : shader_api.c
 prog_uniform.obj : prog_uniform.c
+prog_cache.obj : prog_cache.c
diff --git a/src/mesa/shader/prog_cache.c b/src/mesa/shader/prog_cache.c
index 36a25377c55..9437e596138 100644
--- a/src/mesa/shader/prog_cache.c
+++ b/src/mesa/shader/prog_cache.c
@@ -44,6 +44,7 @@ struct cache_item
 struct gl_program_cache
 {
    struct cache_item **items;
+   struct cache_item *last;
    GLuint size, n_items;
 };
 
@@ -83,6 +84,8 @@ rehash(struct gl_program_cache *cache)
    struct cache_item *c, *next;
    GLuint size, i;
 
+   cache->last = NULL;
+
    size = cache->size * 3;
    items = (struct cache_item**) _mesa_malloc(size * sizeof(*items));
    _mesa_memset(items, 0, size * sizeof(*items));
@@ -105,6 +108,8 @@ clear_cache(GLcontext *ctx, struct gl_program_cache *cache)
 {
    struct cache_item *c, *next;
    GLuint i;
+   
+   cache->last = NULL;
 
    for (i = 0; i < cache->size; i++) {
       for (c = cache->items[i]; c; c = next) {
@@ -149,18 +154,26 @@ _mesa_delete_program_cache(GLcontext *ctx, struct gl_program_cache *cache)
 
 
 struct gl_program *
-_mesa_search_program_cache(const struct gl_program_cache *cache,
+_mesa_search_program_cache(struct gl_program_cache *cache,
                            const void *key, GLuint keysize)
 {
-   const GLuint hash = hash_key(key, keysize);
-   struct cache_item *c;
-
-   for (c = cache->items[hash % cache->size]; c; c = c->next) {
-      if (c->hash == hash && memcmp(c->key, key, keysize) == 0)
-	 return c->program;
+   if (cache->last && 
+       memcmp(cache->last->key, key, keysize) == 0) {
+      return cache->last->program;
    }
+   else {
+      const GLuint hash = hash_key(key, keysize);
+      struct cache_item *c;
+
+      for (c = cache->items[hash % cache->size]; c; c = c->next) {
+         if (c->hash == hash && memcmp(c->key, key, keysize) == 0) {
+            cache->last = c;
+            return c->program;
+         }
+      }
 
-   return NULL;
+      return NULL;
+   }
 }
 
 
diff --git a/src/mesa/shader/prog_cache.h b/src/mesa/shader/prog_cache.h
index a8c91fba011..4e1ccac03ff 100644
--- a/src/mesa/shader/prog_cache.h
+++ b/src/mesa/shader/prog_cache.h
@@ -42,7 +42,7 @@ _mesa_delete_program_cache(GLcontext *ctx, struct gl_program_cache *pc);
 
 
 extern struct gl_program *
-_mesa_search_program_cache(const struct gl_program_cache *cache,
+_mesa_search_program_cache(struct gl_program_cache *cache,
                            const void *key, GLuint keysize);
 
 extern void
diff --git a/src/mesa/shader/prog_statevars.c b/src/mesa/shader/prog_statevars.c
index d4e31207e84..34c47413506 100644
--- a/src/mesa/shader/prog_statevars.c
+++ b/src/mesa/shader/prog_statevars.c
@@ -395,6 +395,12 @@ _mesa_fetch_state(GLcontext *ctx, const gl_state_index state[],
 
    case STATE_INTERNAL:
       switch (state[1]) {
+      case STATE_CURRENT_ATTRIB: {
+         const GLuint idx = (GLuint) state[2];
+         COPY_4V(value, ctx->Current.Attrib[idx]);
+         return;
+      }						  
+
       case STATE_NORMAL_SCALE:
          ASSIGN_4V(value, 
                    ctx->_ModelViewInvScale, 
@@ -501,6 +507,9 @@ _mesa_fetch_state(GLcontext *ctx, const gl_state_index state[],
          }
          return;
 
+         /* XXX: make sure new tokens added here are also handled in the 
+          * _mesa_program_state_flags() switch, below.
+          */
       default:
          /* unknown state indexes are silently ignored
           *  should be handled by the driver.
@@ -574,11 +583,29 @@ _mesa_program_state_flags(const gl_state_index state[STATE_LENGTH])
 
    case STATE_INTERNAL:
       switch (state[1]) {
+      case STATE_CURRENT_ATTRIB:
+         return _NEW_CURRENT_ATTRIB;
+
+      case STATE_NORMAL_SCALE:
+         return _NEW_MODELVIEW;
+
       case STATE_TEXRECT_SCALE:
       case STATE_SHADOW_AMBIENT:
 	 return _NEW_TEXTURE;
       case STATE_FOG_PARAMS_OPTIMIZED:
 	 return _NEW_FOG;
+      case STATE_LIGHT_SPOT_DIR_NORMALIZED:
+      case STATE_LIGHT_POSITION:
+      case STATE_LIGHT_POSITION_NORMALIZED:
+      case STATE_LIGHT_HALF_VECTOR:
+         return _NEW_LIGHT;
+
+      case STATE_PT_SCALE:
+      case STATE_PT_BIAS:
+      case STATE_PCM_SCALE:
+      case STATE_PCM_BIAS:
+         return _NEW_PIXEL;
+
       default:
          /* unknown state indexes are silently ignored and
          *  no flag set, since it is handled by the driver.
diff --git a/src/mesa/shader/prog_statevars.h b/src/mesa/shader/prog_statevars.h
index 20643ca7947..72e51f40314 100644
--- a/src/mesa/shader/prog_statevars.h
+++ b/src/mesa/shader/prog_statevars.h
@@ -104,6 +104,7 @@ typedef enum gl_state_index_ {
    STATE_LOCAL,
 
    STATE_INTERNAL,		/* Mesa additions */
+   STATE_CURRENT_ATTRIB,        /* ctx->Current vertex attrib value */
    STATE_NORMAL_SCALE,
    STATE_TEXRECT_SCALE,
    STATE_FOG_PARAMS_OPTIMIZED,  /* for faster fog calc */
diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c
index f120c20bdf2..37962f0e9bb 100644
--- a/src/mesa/shader/program.c
+++ b/src/mesa/shader/program.c
@@ -372,7 +372,11 @@ _mesa_reference_program(GLcontext *ctx,
    assert(ptr);
    if (*ptr && prog) {
       /* sanity check */
-      ASSERT((*ptr)->Target == prog->Target);
+      if ((*ptr)->Target == GL_VERTEX_PROGRAM_ARB)
+         ASSERT(prog->Target == GL_VERTEX_PROGRAM_ARB);
+      else if ((*ptr)->Target == GL_FRAGMENT_PROGRAM_ARB)
+         ASSERT(prog->Target == GL_FRAGMENT_PROGRAM_ARB ||
+                prog->Target == GL_FRAGMENT_PROGRAM_NV);
    }
    if (*ptr == prog) {
       return;  /* no change */
@@ -686,17 +690,47 @@ _mesa_combine_programs(GLcontext *ctx,
 
    if (newProg->Target == GL_FRAGMENT_PROGRAM_ARB) {
       struct gl_fragment_program *fprogA, *fprogB, *newFprog;
+      GLbitfield progB_inputsRead = progB->InputsRead;
+      GLint progB_colorFile, progB_colorIndex;
+
       fprogA = (struct gl_fragment_program *) progA;
       fprogB = (struct gl_fragment_program *) progB;
       newFprog = (struct gl_fragment_program *) newProg;
 
       newFprog->UsesKill = fprogA->UsesKill || fprogB->UsesKill;
 
+      /* We'll do a search and replace for instances
+       * of progB_colorFile/progB_colorIndex below...
+       */
+      progB_colorFile = PROGRAM_INPUT;
+      progB_colorIndex = FRAG_ATTRIB_COL0;
+
+      /*
+       * The fragment program may get color from a state var rather than
+       * a fragment input (vertex output) if it's constant.
+       * See the texenvprogram.c code.
+       * So, search the program's parameter list now to see if the program
+       * gets color from a state var instead of a conventional fragment
+       * input register.
+       */
+      for (i = 0; i < progB->Parameters->NumParameters; i++) {
+         struct gl_program_parameter *p = &progB->Parameters->Parameters[i];
+         if (p->Type == PROGRAM_STATE_VAR &&
+             p->StateIndexes[0] == STATE_INTERNAL &&
+             p->StateIndexes[1] == STATE_CURRENT_ATTRIB &&
+             p->StateIndexes[2] == VERT_ATTRIB_COLOR0) {
+            progB_inputsRead |= FRAG_BIT_COL0;
+            progB_colorFile = PROGRAM_STATE_VAR;
+            progB_colorIndex = i;
+            break;
+         }
+      }
+
       /* Connect color outputs of fprogA to color inputs of fprogB, via a
        * new temporary register.
        */
       if ((progA->OutputsWritten & (1 << FRAG_RESULT_COLR)) &&
-          (progB->InputsRead & (1 << FRAG_ATTRIB_COL0))) {
+          (progB_inputsRead & FRAG_BIT_COL0)) {
          GLint tempReg = _mesa_find_free_register(newProg, PROGRAM_TEMPORARY);
          if (tempReg < 0) {
             _mesa_problem(ctx, "No free temp regs found in "
@@ -707,13 +741,14 @@ _mesa_combine_programs(GLcontext *ctx,
          replace_registers(newInst, lenA,
                            PROGRAM_OUTPUT, FRAG_RESULT_COLR,
                            PROGRAM_TEMPORARY, tempReg);
-         /* replace reads from input.color[0] with tempReg */
+         /* replace reads from the input color with tempReg */
          replace_registers(newInst + lenA, lenB,
-                           PROGRAM_INPUT, FRAG_ATTRIB_COL0,
-                           PROGRAM_TEMPORARY, tempReg);
+                           progB_colorFile, progB_colorIndex, /* search for */
+                           PROGRAM_TEMPORARY, tempReg  /* replace with */ );
       }
 
-      inputsB = progB->InputsRead;
+      /* compute combined program's InputsRead */
+      inputsB = progB_inputsRead;
       if (progA->OutputsWritten & (1 << FRAG_RESULT_COLR)) {
          inputsB &= ~(1 << FRAG_ATTRIB_COL0);
       }
diff --git a/src/mesa/shader/shader_api.c b/src/mesa/shader/shader_api.c
index 854f8bfdaa6..266ecc4ef21 100644
--- a/src/mesa/shader/shader_api.c
+++ b/src/mesa/shader/shader_api.c
@@ -47,7 +47,7 @@
 #include "shader/shader_api.h"
 #include "shader/slang/slang_compile.h"
 #include "shader/slang/slang_link.h"
-
+#include "glapi/dispatch.h"
 
 
 #ifndef GL_PROGRAM_BINARY_LENGTH_OES
@@ -455,7 +455,13 @@ _mesa_attach_shader(GLcontext *ctx, GLuint program, GLuint shader)
    n = shProg->NumShaders;
    for (i = 0; i < n; i++) {
       if (shProg->Shaders[i] == sh) {
-         /* already attached */
+         /* The shader is already attached to this program.  The
+          * GL_ARB_shader_objects spec says:
+          *
+          *     "The error INVALID_OPERATION is generated by AttachObjectARB
+          *     if <obj> is already attached to <containerObj>."
+          */
+         _mesa_error(ctx, GL_INVALID_OPERATION, "glAttachShader");
          return;
       }
    }
@@ -919,24 +925,15 @@ _mesa_get_attached_shaders(GLcontext *ctx, GLuint program, GLsizei maxCount,
 static GLuint
 _mesa_get_handle(GLcontext *ctx, GLenum pname)
 {
-#if 0
-   GET_CURRENT_CONTEXT(ctx);
-
-   switch (pname) {
-   case GL_PROGRAM_OBJECT_ARB:
-      {
-         struct gl2_program_intf **pro = ctx->Shader.CurrentProgram;
-
-         if (pro != NULL)
-            return (**pro)._container._generic.
-               GetName((struct gl2_generic_intf **) (pro));
-      }
-      break;
-   default:
+   GLint handle = 0;
+   
+   if (pname == GL_PROGRAM_OBJECT_ARB) {
+      CALL_GetIntegerv(ctx->Exec, (GL_CURRENT_PROGRAM, &handle));
+   } else {
       _mesa_error(ctx, GL_INVALID_ENUM, "glGetHandleARB");
    }
-#endif
-   return 0;
+
+   return handle;
 }
 
 
diff --git a/src/mesa/shader/slang/slang_emit.c b/src/mesa/shader/slang/slang_emit.c
index 9e8daa10517..f3c3fa6c5b3 100644
--- a/src/mesa/shader/slang/slang_emit.c
+++ b/src/mesa/shader/slang/slang_emit.c
@@ -1579,13 +1579,17 @@ emit_array_element(slang_emit_info *emitInfo, slang_ir_node *n)
    else {
       /* Variable array index */
       struct prog_instruction *inst;
+      slang_ir_storage dstStore = *n->Store;
 
       /* do codegen for array index expression */
       emit(emitInfo, n->Children[1]);
 
       inst = new_instruction(emitInfo, OPCODE_ARL);
 
-      storage_to_dst_reg(&inst->DstReg, n->Store, n->Writemask);
+      if (dstStore.Size > 4)
+         dstStore.Size = 4; /* only emit one instruction */
+
+      storage_to_dst_reg(&inst->DstReg, &dstStore, n->Writemask);
       storage_to_src_reg(&inst->SrcReg[0], n->Children[1]->Store);
 
       inst->DstReg.File = PROGRAM_ADDRESS;
diff --git a/src/mesa/shader/slang/slang_link.c b/src/mesa/shader/slang/slang_link.c
index d884be2a75d..00e89537685 100644
--- a/src/mesa/shader/slang/slang_link.c
+++ b/src/mesa/shader/slang/slang_link.c
@@ -408,7 +408,7 @@ _slang_update_inputs_outputs(struct gl_program *prog)
             }
          }
          else if (inst->SrcReg[j].File == PROGRAM_ADDRESS) {
-            maxAddrReg = MAX2(maxAddrReg, inst->SrcReg[j].Index + 1);
+            maxAddrReg = MAX2(maxAddrReg, (GLuint) (inst->SrcReg[j].Index + 1));
          }
       }
       if (inst->DstReg.File == PROGRAM_OUTPUT) {
diff --git a/src/mesa/state_tracker/st_atom_rasterizer.c b/src/mesa/state_tracker/st_atom_rasterizer.c
index fc47896c242..5eef4ebe92e 100644
--- a/src/mesa/state_tracker/st_atom_rasterizer.c
+++ b/src/mesa/state_tracker/st_atom_rasterizer.c
@@ -215,6 +215,9 @@ static void update_raster_state( struct st_context *st )
          raster->sprite_coord_mode[i] = PIPE_SPRITE_COORD_NONE;
       }
    }
+
+   /* ST_NEW_VERTEX_PROGRAM
+    */
    if (vertProg) {
       if (vertProg->Base.Id == 0) {
          if (vertProg->Base.OutputsWritten & (1 << VERT_RESULT_PSIZ)) {
@@ -277,7 +280,7 @@ const struct st_tracked_state st_update_rasterizer = {
        _NEW_POLYGON |
        _NEW_PROGRAM |
        _NEW_SCISSOR),      /* mesa state dependencies*/
-      0,                   /* state tracker dependencies */
+      ST_NEW_VERTEX_PROGRAM,  /* state tracker dependencies */
    },
    update_raster_state     /* update function */
 };
diff --git a/src/mesa/state_tracker/st_cb_texture.c b/src/mesa/state_tracker/st_cb_texture.c
index e545e00eb2a..acaf1de8823 100644
--- a/src/mesa/state_tracker/st_cb_texture.c
+++ b/src/mesa/state_tracker/st_cb_texture.c
@@ -26,7 +26,7 @@
  **************************************************************************/
 
 #include "main/imports.h"
-#if FEATURE_convolution
+#if FEATURE_convolve
 #include "main/convolve.h"
 #endif
 #include "main/enums.h"
@@ -409,7 +409,7 @@ st_TexImage(GLcontext * ctx,
    stImage->face = _mesa_tex_target_to_face(target);
    stImage->level = level;
 
-#if FEATURE_convolution
+#if FEATURE_convolve
    if (ctx->_ImageTransferState & IMAGE_CONVOLUTION_BIT) {
       _mesa_adjust_image_for_convolution(ctx, dims, &postConvWidth,
                                          &postConvHeight);
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index bdf8648ef7c..61949a93884 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -525,6 +525,8 @@ st_draw_vbo(GLcontext *ctx,
                                 vbuffer, velements);
       num_vbuffers = 1;
       num_velements = vp->num_inputs;
+      if (num_velements == 0)
+         num_vbuffers = 0;
    }
    else {
       /*printf("Draw non-interleaved\n");*/
diff --git a/src/mesa/swrast/s_depth.c b/src/mesa/swrast/s_depth.c
index 293eb8628e6..26e23f02d59 100644
--- a/src/mesa/swrast/s_depth.c
+++ b/src/mesa/swrast/s_depth.c
@@ -1,8 +1,8 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.5.1
+ * Version:  7.2.1
  *
- * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
+ * Copyright (C) 1999-2008  Brian Paul   All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
  * copy of this software and associated documentation files (the "Software"),
@@ -534,15 +534,15 @@ depth_test_span( GLcontext *ctx, SWspan *span)
       if (rb->DataType == GL_UNSIGNED_SHORT) {
          GLushort zbuffer[MAX_WIDTH];
          rb->GetRow(ctx, rb, count, x, y, zbuffer);
-         passed = depth_test_span16(ctx, count, zbuffer, zValues, mask );
-         rb->PutRow(ctx, rb, count, x, y, zbuffer, NULL);
+         passed = depth_test_span16(ctx, count, zbuffer, zValues, mask);
+         rb->PutRow(ctx, rb, count, x, y, zbuffer, mask);
       }
       else {
          GLuint zbuffer[MAX_WIDTH];
          ASSERT(rb->DataType == GL_UNSIGNED_INT);
          rb->GetRow(ctx, rb, count, x, y, zbuffer);
-         passed = depth_test_span32(ctx, count, zbuffer, zValues, mask );
-         rb->PutRow(ctx, rb, count, x, y, zbuffer, NULL);
+         passed = depth_test_span32(ctx, count, zbuffer, zValues, mask);
+         rb->PutRow(ctx, rb, count, x, y, zbuffer, mask);
       }
    }
 
@@ -1080,15 +1080,15 @@ depth_test_pixels( GLcontext *ctx, SWspan *span )
       if (rb->DataType == GL_UNSIGNED_SHORT) {
          GLushort zbuffer[MAX_WIDTH];
          _swrast_get_values(ctx, rb, count, x, y, zbuffer, sizeof(GLushort));
-         depth_test_span16(ctx, count, zbuffer, z, mask );
-         rb->PutValues(ctx, rb, count, x, y, zbuffer, NULL);
+         depth_test_span16(ctx, count, zbuffer, z, mask);
+         rb->PutValues(ctx, rb, count, x, y, zbuffer, mask);
       }
       else {
          GLuint zbuffer[MAX_WIDTH];
          ASSERT(rb->DataType == GL_UNSIGNED_INT);
          _swrast_get_values(ctx, rb, count, x, y, zbuffer, sizeof(GLuint));
-         depth_test_span32(ctx, count, zbuffer, z, mask );
-         rb->PutValues(ctx, rb, count, x, y, zbuffer, NULL);
+         depth_test_span32(ctx, count, zbuffer, z, mask);
+         rb->PutValues(ctx, rb, count, x, y, zbuffer, mask);
       }
    }
 
diff --git a/src/mesa/swrast/s_points.c b/src/mesa/swrast/s_points.c
index 846c485f15a..61ff4d0b84c 100644
--- a/src/mesa/swrast/s_points.c
+++ b/src/mesa/swrast/s_points.c
@@ -129,13 +129,13 @@ sprite_point(GLcontext *ctx, const SWvertex *vert)
       s = 0.0;
       dsdx = 1.0 / size;
       if (ctx->Point.SpriteOrigin == GL_LOWER_LEFT) {
-         t0 = 0.0;
          dtdy = 1.0 / size;
+         t0 = 0.5 * dtdy;
       }
       else {
          /* GL_UPPER_LEFT */
-         t0 = 1.0;
          dtdy = -1.0 / size;
+         t0 = 1.0 + 0.5 * dtdy;
       }
 
       ATTRIB_LOOP_BEGIN
diff --git a/src/mesa/swrast/s_texcombine.c b/src/mesa/swrast/s_texcombine.c
index 4e3d329075c..632d650007e 100644
--- a/src/mesa/swrast/s_texcombine.c
+++ b/src/mesa/swrast/s_texcombine.c
@@ -26,6 +26,7 @@
 #include "main/glheader.h"
 #include "main/context.h"
 #include "main/colormac.h"
+#include "main/image.h"
 #include "main/imports.h"
 #include "main/macros.h"
 #include "main/pixel.h"
diff --git a/src/mesa/swrast/s_texstore.c b/src/mesa/swrast/s_texstore.c
index 15d52aa587c..16b00b9fa1c 100644
--- a/src/mesa/swrast/s_texstore.c
+++ b/src/mesa/swrast/s_texstore.c
@@ -216,9 +216,9 @@ is_depth_format(GLenum format)
 {
    switch (format) {
       case GL_DEPTH_COMPONENT:
-      case GL_DEPTH_COMPONENT16_SGIX:
-      case GL_DEPTH_COMPONENT24_SGIX:
-      case GL_DEPTH_COMPONENT32_SGIX:
+      case GL_DEPTH_COMPONENT16:
+      case GL_DEPTH_COMPONENT24:
+      case GL_DEPTH_COMPONENT32:
          return GL_TRUE;
       default:
          return GL_FALSE;
diff --git a/src/mesa/swrast_setup/ss_context.c b/src/mesa/swrast_setup/ss_context.c
index f4d90c514bb..61172f9979b 100644
--- a/src/mesa/swrast_setup/ss_context.c
+++ b/src/mesa/swrast_setup/ss_context.c
@@ -112,22 +112,25 @@ setup_vertex_format(GLcontext *ctx)
 {
    TNLcontext *tnl = TNL_CONTEXT(ctx);
    SScontext *swsetup = SWSETUP_CONTEXT(ctx);
+   GLboolean intColors = !ctx->FragmentProgram._Current
+                      && !ctx->ATIFragmentShader._Enabled
+                      && ctx->RenderMode == GL_RENDER
+                      && CHAN_TYPE == GL_UNSIGNED_BYTE;
 
-   if (!RENDERINPUTS_EQUAL(tnl->render_inputs_bitset,
+   if (intColors != swsetup->intColors ||
+       !RENDERINPUTS_EQUAL(tnl->render_inputs_bitset,
                            swsetup->last_index_bitset)) {
       DECLARE_RENDERINPUTS(index_bitset);
       struct tnl_attr_map map[_TNL_ATTRIB_MAX];
       int i, e = 0;
 
+      swsetup->intColors = intColors;
+
       RENDERINPUTS_COPY( index_bitset, tnl->render_inputs_bitset );
 
       EMIT_ATTR( _TNL_ATTRIB_POS, EMIT_4F_VIEWPORT, attrib[FRAG_ATTRIB_WPOS] );
 
       if (RENDERINPUTS_TEST( index_bitset, _TNL_ATTRIB_COLOR0 )) {
-         swsetup->intColors = !ctx->FragmentProgram._Current
-                           && !ctx->ATIFragmentShader._Enabled
-                           && ctx->RenderMode == GL_RENDER
-                           && CHAN_TYPE == GL_UNSIGNED_BYTE;
          if (swsetup->intColors)
             EMIT_ATTR( _TNL_ATTRIB_COLOR0, EMIT_4CHAN_4F_RGBA, color );
          else
diff --git a/src/mesa/tnl/descrip.mms b/src/mesa/tnl/descrip.mms
index f77f672dc8f..25dd1aecb13 100644
--- a/src/mesa/tnl/descrip.mms
+++ b/src/mesa/tnl/descrip.mms
@@ -1,6 +1,6 @@
 # Makefile for core library for VMS
 # contributed by Jouk Jansen  [email protected]
-# Last revision : 30 November 2007
+# Last revision : 39 September 2008
 
 .first
 	define gl [---.include.gl]
@@ -27,13 +27,13 @@ SOURCES = t_context.c t_draw.c \
 	t_pipeline.c t_vb_fog.c \
 	t_vb_light.c t_vb_normals.c t_vb_points.c t_vb_program.c \
 	t_vb_render.c t_vb_texgen.c t_vb_texmat.c t_vb_vertex.c \
-	t_vertex.c \
+	t_vertex.c t_rasterpos.c\
 	t_vertex_generic.c t_vp_build.c
 
 OBJECTS = t_context.obj,t_draw.obj,\
 	t_pipeline.obj,t_vb_fog.obj,t_vb_light.obj,t_vb_normals.obj,\
 	t_vb_points.obj,t_vb_program.obj,t_vb_render.obj,t_vb_texgen.obj,\
-	t_vb_texmat.obj,t_vb_vertex.obj,\
+	t_vb_texmat.obj,t_vb_vertex.obj,t_rasterpos.obj,\
 	t_vertex.obj,t_vertex_generic.obj,\
 	t_vp_build.obj
 
@@ -65,3 +65,4 @@ t_vb_vertex.obj : t_vb_vertex.c
 t_vertex.obj : t_vertex.c
 t_vertex_generic.obj : t_vertex_generic.c
 t_vp_build.obj : t_vp_build.c
+t_rasterpos.obj : t_rasterpos.c
diff --git a/src/mesa/tnl/t_vertex_generic.c b/src/mesa/tnl/t_vertex_generic.c
index db70ad4dad5..f763522f91f 100644
--- a/src/mesa/tnl/t_vertex_generic.c
+++ b/src/mesa/tnl/t_vertex_generic.c
@@ -34,6 +34,12 @@
 #include "t_vertex.h"
 
 
+#if 0 /* set to 1 to print the name of each insert/extract helper as it runs; NOTE(review): confirm printf is declared in this file */
+#define DEBUG_INSERT printf("%s\n", __FUNCTION__)
+#else
+#define DEBUG_INSERT /* tracing disabled: expands to nothing */
+#endif
+
 
 /*
  * These functions take the NDC coordinates pointed to by 'in', apply the
@@ -45,7 +51,7 @@ static INLINE void insert_4f_viewport_4( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -57,7 +63,7 @@ static INLINE void insert_4f_viewport_3( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -69,7 +75,7 @@ static INLINE void insert_4f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[14];
@@ -81,7 +87,7 @@ static INLINE void insert_4f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
    out[2] = vp[14];
@@ -93,7 +99,7 @@ static INLINE void insert_3f_viewport_3( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -104,7 +110,7 @@ static INLINE void insert_3f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
    out[2] = vp[10] * in[2] + vp[14];
@@ -115,7 +121,7 @@ static INLINE void insert_3f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
    out[2] = vp[14];
@@ -126,7 +132,7 @@ static INLINE void insert_2f_viewport_2( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[5] * in[1] + vp[13];
 }
@@ -136,7 +142,7 @@ static INLINE void insert_2f_viewport_1( const struct tnl_clipspace_attr *a, GLu
 {
    GLfloat *out = (GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = vp[0] * in[0] + vp[12];
    out[1] = vp[13];
 }
@@ -150,7 +156,7 @@ static INLINE void insert_4f_4( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -161,7 +167,7 @@ static INLINE void insert_4f_3( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -172,7 +178,7 @@ static INLINE void insert_4f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = 0;
@@ -183,7 +189,7 @@ static INLINE void insert_4f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
    out[2] = 0;
@@ -194,7 +200,7 @@ static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[3];
@@ -203,6 +209,7 @@ static INLINE void insert_3f_xyw_4( const struct tnl_clipspace_attr *a, GLubyte
 static INLINE void insert_3f_xyw_err( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
 {
    (void) a; (void) v; (void) in;
+   DEBUG_INSERT;
    _mesa_exit(1);
 }
 
@@ -210,7 +217,7 @@ static INLINE void insert_3f_3( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = in[2];
@@ -220,7 +227,7 @@ static INLINE void insert_3f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
    out[2] = 0;
@@ -230,7 +237,7 @@ static INLINE void insert_3f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
    out[2] = 0;
@@ -241,7 +248,7 @@ static INLINE void insert_2f_2( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = in[1];
 }
@@ -250,7 +257,7 @@ static INLINE void insert_2f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-   
+   DEBUG_INSERT;
    out[0] = in[0];
    out[1] = 0;
 }
@@ -259,12 +266,13 @@ static INLINE void insert_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v,
 {
    GLfloat *out = (GLfloat *)(v);
    (void) a;
-
+   DEBUG_INSERT;
    out[0] = in[0];
 }
 
 static INLINE void insert_null( const struct tnl_clipspace_attr *a, GLubyte *v, const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a; (void) v; (void) in;
 }
 
@@ -272,6 +280,7 @@ static INLINE void insert_4chan_4f_rgba_4( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -283,6 +292,7 @@ static INLINE void insert_4chan_4f_rgba_3( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -294,6 +304,7 @@ static INLINE void insert_4chan_4f_rgba_2( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    UNCLAMPED_FLOAT_TO_CHAN(c[1], in[1]); 
@@ -305,6 +316,7 @@ static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, G
 				  const GLfloat *in )
 {
    GLchan *c = (GLchan *)v;
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_CHAN(c[0], in[0]); 
    c[1] = 0;
@@ -315,6 +327,7 @@ static INLINE void insert_4chan_4f_rgba_1( const struct tnl_clipspace_attr *a, G
 static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -325,6 +338,7 @@ static INLINE void insert_4ub_4f_rgba_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -335,6 +349,7 @@ static INLINE void insert_4ub_4f_rgba_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -345,6 +360,7 @@ static INLINE void insert_4ub_4f_rgba_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    v[1] = 0;
@@ -355,6 +371,7 @@ static INLINE void insert_4ub_4f_rgba_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -365,6 +382,7 @@ static INLINE void insert_4ub_4f_bgra_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -375,6 +393,7 @@ static INLINE void insert_4ub_4f_bgra_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -385,6 +404,7 @@ static INLINE void insert_4ub_4f_bgra_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    v[1] = 0;
@@ -395,6 +415,7 @@ static INLINE void insert_4ub_4f_bgra_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -405,6 +426,7 @@ static INLINE void insert_4ub_4f_argb_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -415,6 +437,7 @@ static INLINE void insert_4ub_4f_argb_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -425,6 +448,7 @@ static INLINE void insert_4ub_4f_argb_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
    v[2] = 0x00;
@@ -435,6 +459,7 @@ static INLINE void insert_4ub_4f_argb_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -445,6 +470,7 @@ static INLINE void insert_4ub_4f_abgr_4( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -455,6 +481,7 @@ static INLINE void insert_4ub_4f_abgr_3( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
@@ -465,6 +492,7 @@ static INLINE void insert_4ub_4f_abgr_2( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
    v[2] = 0x00;
@@ -475,6 +503,7 @@ static INLINE void insert_4ub_4f_abgr_1( const struct tnl_clipspace_attr *a, GLu
 static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -484,6 +513,7 @@ static INLINE void insert_3ub_3f_rgb_3( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -493,6 +523,7 @@ static INLINE void insert_3ub_3f_rgb_2( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			       const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
    v[1] = 0;
@@ -502,6 +533,7 @@ static INLINE void insert_3ub_3f_rgb_1( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -511,6 +543,7 @@ static INLINE void insert_3ub_3f_bgr_3( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
@@ -520,6 +553,7 @@ static INLINE void insert_3ub_3f_bgr_2( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 				 const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
    v[1] = 0;
@@ -530,6 +564,7 @@ static INLINE void insert_3ub_3f_bgr_1( const struct tnl_clipspace_attr *a, GLub
 static INLINE void insert_1ub_1f_1( const struct tnl_clipspace_attr *a, GLubyte *v, 
 			   const GLfloat *in )
 {
+   DEBUG_INSERT;
    (void) a;
    UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
 }
@@ -551,6 +586,7 @@ static void extract_4f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
    /* Although included for completeness, the position coordinate is
     * usually handled differently during clipping.
     */
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = (in[2] - vp[14]) / vp[10];
@@ -562,7 +598,7 @@ static void extract_3f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
 {
    const GLfloat *in = (const GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = (in[2] - vp[14]) / vp[10];
@@ -575,7 +611,7 @@ static void extract_2f_viewport( const struct tnl_clipspace_attr *a, GLfloat *ou
 {
    const GLfloat *in = (const GLfloat *)v;
    const GLfloat * const vp = a->vp;
-   
+   DEBUG_INSERT;
    out[0] = (in[0] - vp[12]) / vp[0];
    out[1] = (in[1] - vp[13]) / vp[5];
    out[2] = 0;
diff --git a/src/mesa/vbo/vbo_exec_api.c b/src/mesa/vbo/vbo_exec_api.c
index fdb0c5a9a4d..a6ce26ffed8 100644
--- a/src/mesa/vbo/vbo_exec_api.c
+++ b/src/mesa/vbo/vbo_exec_api.c
@@ -143,29 +143,37 @@ static void vbo_exec_copy_to_current( struct vbo_exec_context *exec )
 
    for (i = VBO_ATTRIB_POS+1 ; i < VBO_ATTRIB_MAX ; i++) {
       if (exec->vtx.attrsz[i]) {
-	 GLfloat *current = (GLfloat *)vbo->currval[i].Ptr;
-
          /* Note: the exec->vtx.current[i] pointers point into the
           * ctx->Current.Attrib and ctx->Light.Material.Attrib arrays.
           */
-	 COPY_CLEAN_4V(current, 
-		       exec->vtx.attrsz[i], 
-		       exec->vtx.attrptr[i]);
+	 GLfloat *current = (GLfloat *)vbo->currval[i].Ptr;
+         GLfloat tmp[4];
+
+         COPY_CLEAN_4V(tmp, 
+                       exec->vtx.attrsz[i], 
+                       exec->vtx.attrptr[i]);
+         
+         if (memcmp(current, tmp, sizeof(tmp)) != 0)
+         { 
+            memcpy(current, tmp, sizeof(tmp));
 
 	 
-	 /* Given that we explicitly state size here, there is no need
-	  * for the COPY_CLEAN above, could just copy 16 bytes and be
-	  * done.  The only problem is when Mesa accesses ctx->Current
-	  * directly.
-	  */
-	 vbo->currval[i].Size = exec->vtx.attrsz[i];
-
-	 /* This triggers rather too much recalculation of Mesa state
-	  * that doesn't get used (eg light positions).
-	  */
-	 if (i >= VBO_ATTRIB_MAT_FRONT_AMBIENT &&
-	     i <= VBO_ATTRIB_MAT_BACK_INDEXES)
-	    ctx->NewState |= _NEW_LIGHT;
+            /* Given that we explicitly state size here, there is no need
+             * for the COPY_CLEAN above, could just copy 16 bytes and be
+             * done.  The only problem is when Mesa accesses ctx->Current
+             * directly.
+             */
+            vbo->currval[i].Size = exec->vtx.attrsz[i];
+
+            /* This triggers rather too much recalculation of Mesa state
+             * that doesn't get used (eg light positions).
+             */
+            if (i >= VBO_ATTRIB_MAT_FRONT_AMBIENT &&
+                i <= VBO_ATTRIB_MAT_BACK_INDEXES)
+               ctx->NewState |= _NEW_LIGHT;
+            
+            ctx->NewState |= _NEW_CURRENT_ATTRIB;
+         }
       }
    }
 
diff --git a/src/mesa/vbo/vbo_exec_array.c b/src/mesa/vbo/vbo_exec_array.c
index 0f9d8da3568..8871e10cf60 100644
--- a/src/mesa/vbo/vbo_exec_array.c
+++ b/src/mesa/vbo/vbo_exec_array.c
@@ -127,6 +127,7 @@ static void recalculate_input_bindings( GLcontext *ctx )
    struct vbo_context *vbo = vbo_context(ctx);
    struct vbo_exec_context *exec = &vbo->exec;
    const struct gl_client_array **inputs = &exec->array.inputs[0];
+   GLbitfield const_inputs = 0x0;
    GLuint i;
 
    exec->array.program_mode = get_program_mode(ctx);
@@ -141,19 +142,24 @@ static void recalculate_input_bindings( GLcontext *ctx )
       for (i = 0; i <= VERT_ATTRIB_TEX7; i++) {
 	 if (exec->array.legacy_array[i]->Enabled)
 	    inputs[i] = exec->array.legacy_array[i];
-	 else
+	 else {
 	    inputs[i] = &vbo->legacy_currval[i];
+            const_inputs |= 1 << i;
+         }
       }
 
       for (i = 0; i < MAT_ATTRIB_MAX; i++) {
 	 inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->mat_currval[i];
+         const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i);
       }
 
       /* Could use just about anything, just to fill in the empty
        * slots:
        */
-      for (i = MAT_ATTRIB_MAX; i < VERT_ATTRIB_MAX - VERT_ATTRIB_GENERIC0; i++)
+      for (i = MAT_ATTRIB_MAX; i < VERT_ATTRIB_MAX - VERT_ATTRIB_GENERIC0; i++) {
 	 inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i];
+         const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i);
+      }
 
       break;
    case VP_NV:
@@ -166,15 +172,19 @@ static void recalculate_input_bindings( GLcontext *ctx )
 	    inputs[i] = exec->array.generic_array[i];
 	 else if (exec->array.legacy_array[i]->Enabled)
 	    inputs[i] = exec->array.legacy_array[i];
-	 else
+	 else {
 	    inputs[i] = &vbo->legacy_currval[i];
+            const_inputs |= 1 << i;
+         }
       }
 
       /* Could use just about anything, just to fill in the empty
        * slots:
        */
-      for (i = VERT_ATTRIB_GENERIC0; i < VERT_ATTRIB_MAX; i++)
+      for (i = VERT_ATTRIB_GENERIC0; i < VERT_ATTRIB_MAX; i++) {
 	 inputs[i] = &vbo->generic_currval[i - VERT_ATTRIB_GENERIC0];
+         const_inputs |= 1 << i;
+      }
 
       break;
    case VP_ARB:
@@ -189,25 +199,34 @@ static void recalculate_input_bindings( GLcontext *ctx )
 	 inputs[0] = exec->array.generic_array[0];
       else if (exec->array.legacy_array[0]->Enabled)
 	 inputs[0] = exec->array.legacy_array[0];
-      else
+      else {
 	 inputs[0] = &vbo->legacy_currval[0];
+         const_inputs |= 1 << 0;
+      }
 
 
       for (i = 1; i <= VERT_ATTRIB_TEX7; i++) {
 	 if (exec->array.legacy_array[i]->Enabled)
 	    inputs[i] = exec->array.legacy_array[i];
-	 else
+	 else {
 	    inputs[i] = &vbo->legacy_currval[i];
+            const_inputs |= 1 << i;
+         }
       }
 
       for (i = 0; i < 16; i++) {
 	 if (exec->array.generic_array[i]->Enabled)
 	    inputs[VERT_ATTRIB_GENERIC0 + i] = exec->array.generic_array[i];
-	 else
+	 else {
 	    inputs[VERT_ATTRIB_GENERIC0 + i] = &vbo->generic_currval[i];
+            const_inputs |= 1 << (VERT_ATTRIB_GENERIC0 + i);
+         }
+
       }
       break;
    }
+
+   _mesa_set_varying_vp_inputs( ctx, ~const_inputs );
 }
 
 static void bind_arrays( GLcontext *ctx )
@@ -257,6 +276,11 @@ vbo_exec_DrawArrays(GLenum mode, GLint start, GLsizei count)
 
    bind_arrays( ctx );
 
+   /* bind_arrays() may have raised new state flags; validate once more
+    * before drawing. */
+   if (ctx->NewState)
+      _mesa_update_state( ctx );
+
    prim[0].begin = 1;
    prim[0].end = 1;
    prim[0].weak = 0;
@@ -297,6 +321,9 @@ vbo_exec_DrawRangeElements(GLenum mode,
 
    bind_arrays( ctx );
 
+   if (ctx->NewState)
+      _mesa_update_state( ctx );
+
    ib.count = count;
    ib.type = type; 
    ib.obj = ctx->Array.ElementArrayBufferObj;
diff --git a/src/mesa/vbo/vbo_exec_draw.c b/src/mesa/vbo/vbo_exec_draw.c
index 92356ba9773..5bf3d836db5 100644
--- a/src/mesa/vbo/vbo_exec_draw.c
+++ b/src/mesa/vbo/vbo_exec_draw.c
@@ -150,6 +150,7 @@ static void vbo_exec_bind_arrays( GLcontext *ctx )
    GLubyte *data = exec->vtx.buffer_map;
    const GLuint *map;
    GLuint attr;
+   GLbitfield varying_inputs = 0x0;
 
    /* Install the default (ie Current) attributes first, then overlay
     * all active ones.
@@ -211,8 +212,11 @@ static void vbo_exec_bind_arrays( GLcontext *ctx )
 	 arrays[attr]._MaxElement = count; /* ??? */
 
 	 data += exec->vtx.attrsz[src] * sizeof(GLfloat);
+         varying_inputs |= 1<<attr;
       }
    }
+
+   _mesa_set_varying_vp_inputs( ctx, varying_inputs );
 }
 
 
@@ -242,6 +246,9 @@ void vbo_exec_vtx_flush( struct vbo_exec_context *exec )
 	  */
 	 vbo_exec_bind_arrays( ctx );
 
+         if (ctx->NewState)
+            _mesa_update_state( ctx );
+
          /* if using a real VBO, unmap it before drawing */
          if (exec->vtx.bufferobj->Name) {
             ctx->Driver.UnmapBuffer(ctx, target, exec->vtx.bufferobj);
diff --git a/src/mesa/vbo/vbo_save_draw.c b/src/mesa/vbo/vbo_save_draw.c
index ed82f09958d..0488c5d7182 100644
--- a/src/mesa/vbo/vbo_save_draw.c
+++ b/src/mesa/vbo/vbo_save_draw.c
@@ -64,18 +64,26 @@ static void _playback_copy_to_current( GLcontext *ctx,
    for (i = VBO_ATTRIB_POS+1 ; i < VBO_ATTRIB_MAX ; i++) {
       if (node->attrsz[i]) {
 	 GLfloat *current = (GLfloat *)vbo->currval[i].Ptr;
+         GLfloat tmp[4];
 
-	 COPY_CLEAN_4V(current, 
-		       node->attrsz[i], 
-		       data);
+         COPY_CLEAN_4V(tmp, 
+                       node->attrsz[i], 
+                       data);
+         
+         if (memcmp(current, tmp, 4 * sizeof(GLfloat)) != 0)
+         {
+            memcpy(current, tmp, 4 * sizeof(GLfloat));
 
-	 vbo->currval[i].Size = node->attrsz[i];
+            vbo->currval[i].Size = node->attrsz[i];
 
-	 data += node->attrsz[i];
+            if (i >= VBO_ATTRIB_FIRST_MATERIAL &&
+                i <= VBO_ATTRIB_LAST_MATERIAL)
+               ctx->NewState |= _NEW_LIGHT;
+
+            ctx->NewState |= _NEW_CURRENT_ATTRIB;
+         }
 
-	 if (i >= VBO_ATTRIB_FIRST_MATERIAL &&
-	     i <= VBO_ATTRIB_LAST_MATERIAL)
-	    ctx->NewState |= _NEW_LIGHT;
+	 data += node->attrsz[i];
       }
    }
 
@@ -110,6 +118,7 @@ static void vbo_bind_vertex_list( GLcontext *ctx,
    GLuint data = node->buffer_offset;
    const GLuint *map;
    GLuint attr;
+   GLbitfield varying_inputs = 0x0;
 
    /* Install the default (ie Current) attributes first, then overlay
     * all active ones.
@@ -159,8 +168,11 @@ static void vbo_bind_vertex_list( GLcontext *ctx,
 	 assert(arrays[attr].BufferObj->Name);
 
 	 data += node->attrsz[src] * sizeof(GLfloat);
+         varying_inputs |= 1<<attr;
       }
    }
+
+   _mesa_set_varying_vp_inputs( ctx, varying_inputs );
 }
 
 static void vbo_save_loopback_vertex_list( GLcontext *ctx,
@@ -229,6 +241,11 @@ void vbo_save_playback_vertex_list( GLcontext *ctx, void *data )
 
       vbo_bind_vertex_list( ctx, node );
 
+      /* vbo_bind_vertex_list() may have raised new state flags; validate
+       * once more before drawing. */
+      if (ctx->NewState)
+	 _mesa_update_state( ctx );
+
       vbo_context(ctx)->draw_prims( ctx, 
 				    save->inputs, 
 				    node->prim, 
diff --git a/src/mesa/x86-64/x86-64.c b/src/mesa/x86-64/x86-64.c
index 9ec43c841d3..96f8da87f05 100644
--- a/src/mesa/x86-64/x86-64.c
+++ b/src/mesa/x86-64/x86-64.c
@@ -41,7 +41,10 @@
 #include "math/m_debug.h"
 #endif
 
+extern void _mesa_x86_64_cpuid(unsigned int *regs);
+
 DECLARE_XFORM_GROUP( x86_64, 4 )
+DECLARE_XFORM_GROUP( 3dnow, 4 )
 
 #else
 /* just to silence warning below */
@@ -81,6 +84,7 @@ static void message( const char *msg )
 void _mesa_init_all_x86_64_transform_asm(void)
 {
 #ifdef USE_X86_64_ASM
+   unsigned int regs[4];
 
    if ( _mesa_getenv( "MESA_NO_ASM" ) ) {
      return;
@@ -88,24 +92,32 @@ void _mesa_init_all_x86_64_transform_asm(void)
 
    message("Initializing x86-64 optimizations\n");
 
-   ASSIGN_XFORM_GROUP( x86_64, 4 );
 
-   /*
    _mesa_transform_tab[4][MATRIX_GENERAL] =
       _mesa_x86_64_transform_points4_general;
    _mesa_transform_tab[4][MATRIX_IDENTITY] =
       _mesa_x86_64_transform_points4_identity;
    _mesa_transform_tab[4][MATRIX_3D] =
       _mesa_x86_64_transform_points4_3d;
-   _mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
-      _mesa_x86_64_transform_points4_3d_no_rot;
-   _mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
-      _mesa_x86_64_transform_points4_perspective;
-   _mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
-      _mesa_x86_64_transform_points4_2d_no_rot;
-   _mesa_transform_tab[4][MATRIX_2D] =
-      _mesa_x86_64_transform_points4_2d;
-   */
+
+   regs[0] = 0x80000001;
+   regs[1] = 0x00000000;
+   regs[2] = 0x00000000;
+   regs[3] = 0x00000000;
+   _mesa_x86_64_cpuid(regs);
+   if (regs[3] & (1U << 31)) {
+      message("3Dnow! detected\n");
+      _mesa_transform_tab[4][MATRIX_3D_NO_ROT] =
+	  _mesa_3dnow_transform_points4_3d_no_rot;
+      _mesa_transform_tab[4][MATRIX_PERSPECTIVE] =
+	  _mesa_3dnow_transform_points4_perspective;
+      _mesa_transform_tab[4][MATRIX_2D_NO_ROT] =
+	  _mesa_3dnow_transform_points4_2d_no_rot;
+      _mesa_transform_tab[4][MATRIX_2D] =
+	  _mesa_3dnow_transform_points4_2d;
+
+   }
+
    
 #ifdef DEBUG_MATH
    _math_test_all_transform_functions("x86_64");
diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S
index 3f9c9d56ab7..805969127db 100644
--- a/src/mesa/x86-64/xform4.S
+++ b/src/mesa/x86-64/xform4.S
@@ -29,7 +29,22 @@
 .text
 
 .align 16
+.globl _mesa_x86_64_cpuid	/* C prototype: void _mesa_x86_64_cpuid(unsigned int *regs) */
+_mesa_x86_64_cpuid:
+	pushq	%rbx	/* preserve %rbx: cpuid overwrites %ebx */
+	movl	(%rdi), %eax	/* leaf selector comes from regs[0] */
+	movl	8(%rdi), %ecx	/* sub-leaf selector comes from regs[2] */
+	/* regs[1] and regs[3] are not read as inputs */
+	cpuid
+	/* store the results back in place: regs[0..3] = eax, ebx, ecx, edx */
+	movl	%ebx, 4(%rdi)
+	movl	%eax, (%rdi)
+	movl	%ecx, 8(%rdi)
+	movl	%edx, 12(%rdi)
+	popq	%rbx	/* restore the caller's %rbx */
+	ret
 
+.align 16
 .globl _mesa_x86_64_transform_points4_general
 _mesa_x86_64_transform_points4_general:
 /*
@@ -204,8 +219,8 @@ p4_identity_done:
 
 	
 .align 16
-.globl _mesa_x86_64_transform_points4_3d_no_rot
-_mesa_x86_64_transform_points4_3d_no_rot:
+.globl _mesa_3dnow_transform_points4_3d_no_rot
+_mesa_3dnow_transform_points4_3d_no_rot:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */
@@ -268,8 +283,8 @@ p4_3d_no_rot_done:
 
 	
 .align 16
-.globl _mesa_x86_64_transform_points4_perspective
-_mesa_x86_64_transform_points4_perspective:
+.globl _mesa_3dnow_transform_points4_perspective
+_mesa_3dnow_transform_points4_perspective:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */
@@ -334,8 +349,8 @@ p4_perspective_done:
 	ret
 
 .align 16
-.globl _mesa_x86_64_transform_points4_2d_no_rot
-_mesa_x86_64_transform_points4_2d_no_rot:
+.globl _mesa_3dnow_transform_points4_2d_no_rot
+_mesa_3dnow_transform_points4_2d_no_rot:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */
@@ -389,8 +404,8 @@ p4_2d_no_rot_done:
 
 	
 .align 16
-.globl _mesa_x86_64_transform_points4_2d
-_mesa_x86_64_transform_points4_2d:
+.globl _mesa_3dnow_transform_points4_2d
+_mesa_3dnow_transform_points4_2d:
 
 	movl V4F_COUNT(%rdx), %ecx	/* count */
 	movzx V4F_STRIDE(%rdx), %eax	/* stride */