Merge remote branch 'origin/gallium-0.2' into gallium-0.2

author: Ben Skeggs <[email protected]> 2008-09-30 11:57:45 +1000
committer: Ben Skeggs <[email protected]> 2008-09-30 11:57:45 +1000
commit: 8cd046c9b1da31e4ff178816bdfe2ee3451a1553 (patch)
tree: 6b9de732a9d75c3005a80b078af0c97b9cb6c0ab /src/gallium/auxiliary
parent: 8c14bd6992e338da8e467775b0237c8db7cbe44d (diff)
parent: 5dc8e67078be8b8c42a809311debd275ac7d64a7 (diff)
34 files changed, 1433 insertions, 198 deletions
diff --git a/src/gallium/auxiliary/cso_cache/cso_context.c b/src/gallium/auxiliary/cso_cache/cso_context.c
index f22ba40824d..68508f24de4 100644
--- a/src/gallium/auxiliary/cso_cache/cso_context.c
+++ b/src/gallium/auxiliary/cso_cache/cso_context.c
@@ -80,6 +80,10 @@ struct cso_context {
 };
 
 
+static void
+free_framebuffer_state(struct pipe_framebuffer_state *fb);
+
+
 static boolean delete_blend_state(struct cso_context *ctx, void *state)
 {
    struct cso_blend *cso = (struct cso_blend *)state;
@@ -252,6 +256,9 @@ void cso_release_all( struct cso_context *ctx )
       pipe_texture_reference(&ctx->textures_saved[i], NULL);
    }
 
+   free_framebuffer_state(&ctx->fb);
+   free_framebuffer_state(&ctx->fb_saved);
+
    if (ctx->cache) {
       cso_cache_delete( ctx->cache );
       ctx->cache = NULL;
@@ -765,12 +772,42 @@ void cso_restore_vertex_shader(struct cso_context *ctx)
 }
 
 
+/**
+ * Copy framebuffer state from src to dst with refcounting of surfaces.
+ */
+static void
+copy_framebuffer_state(struct pipe_framebuffer_state *dst,
+                       const struct pipe_framebuffer_state *src)
+{
+   uint i;
+
+   dst->width = src->width;
+   dst->height = src->height;
+   dst->num_cbufs = src->num_cbufs;
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&dst->cbufs[i], src->cbufs[i]);
+   }
+   pipe_surface_reference(&dst->zsbuf, src->zsbuf);
+}
+
+
+static void
+free_framebuffer_state(struct pipe_framebuffer_state *fb)
+{
+   uint i;
+
+   for (i = 0; i < PIPE_MAX_COLOR_BUFS; i++) {
+      pipe_surface_reference(&fb->cbufs[i], NULL);
+   }
+   pipe_surface_reference(&fb->zsbuf, NULL);
+}
+
 
 enum pipe_error cso_set_framebuffer(struct cso_context *ctx,
                                     const struct pipe_framebuffer_state *fb)
 {
    if (memcmp(&ctx->fb, fb, sizeof(*fb)) != 0) {
-      ctx->fb = *fb;
+      copy_framebuffer_state(&ctx->fb, fb);
       ctx->pipe->set_framebuffer_state(ctx->pipe, fb);
    }
    return PIPE_OK;
@@ -778,14 +815,15 @@ enum pipe_error cso_set_framebuffer(struct cso_context *ctx,
 
 void cso_save_framebuffer(struct cso_context *ctx)
 {
-   ctx->fb_saved = ctx->fb;
+   copy_framebuffer_state(&ctx->fb_saved, &ctx->fb);
 }
 
 void cso_restore_framebuffer(struct cso_context *ctx)
 {
    if (memcmp(&ctx->fb, &ctx->fb_saved, sizeof(ctx->fb))) {
-      ctx->fb = ctx->fb_saved;
+      copy_framebuffer_state(&ctx->fb, &ctx->fb_saved);
       ctx->pipe->set_framebuffer_state(ctx->pipe, &ctx->fb);
+      free_framebuffer_state(&ctx->fb_saved);
    }
 }
 
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
index 7f0044c5a7f..4e7664f9bf0 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.c
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -431,3 +431,9 @@ struct cso_hash_iter cso_hash_erase(struct cso_hash *hash, struct cso_hash_iter
    --hash->data.d->size;
    return ret;
 }
+
+boolean cso_hash_contains(struct cso_hash *hash, unsigned key)
+{
+   struct cso_node **node = cso_hash_find_node(hash, key);
+   return (*node != hash->data.e);
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
index 85f3e276c6a..5891c325fa5 100644
--- a/src/gallium/auxiliary/cso_cache/cso_hash.h
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -44,6 +44,7 @@
 #ifndef CSO_HASH_H
 #define CSO_HASH_H
 
+#include "pipe/p_compiler.h"
 
 #ifdef	__cplusplus
 extern "C" {
@@ -95,6 +96,11 @@ struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash);
  */
 struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key);
 
+/**
+ * Returns true if a value with the given key exists in the hash
+ */
+boolean   cso_hash_contains(struct cso_hash *hash, unsigned key);
+
 
 int       cso_hash_iter_is_null(struct cso_hash_iter iter);
 unsigned  cso_hash_iter_key(struct cso_hash_iter iter);
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index 4f1326053df..e1af9e56a24 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -222,8 +222,8 @@ static void widepoint_first_point( struct draw_stage *stage,
       /* find fragment shader PointCoord/Fog input */
       wide->point_coord_fs_input = 0; /* XXX fix this! */
 
-      /* setup extra vp output */
-      draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_FOG;
+      /* setup extra vp output (point coord implemented as a texcoord) */
+      draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
       draw->extra_vp_outputs.semantic_index = 0;
       draw->extra_vp_outputs.slot = draw->vs.num_vs_outputs;
    }
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 44563803f90..79a19d6be2b 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -46,6 +46,7 @@
 struct exec_vertex_shader {
    struct draw_vertex_shader base;
    struct tgsi_exec_machine *machine;
+   const struct tgsi_token *machine_tokens;
 };
 
 static struct exec_vertex_shader *exec_vertex_shader( struct draw_vertex_shader *vs )
@@ -62,12 +63,16 @@ vs_exec_prepare( struct draw_vertex_shader *shader,
 {
    struct exec_vertex_shader *evs = exec_vertex_shader(shader);
 
-   /* specify the vertex program to interpret/execute */
-   tgsi_exec_machine_bind_shader(evs->machine,
-				 shader->state.tokens,
-				 PIPE_MAX_SAMPLERS,
-				 NULL /*samplers*/ );
-
+   /* Specify the vertex program to interpret/execute.
+    * Avoid rebinding when possible.
+    */
+   if (evs->machine_tokens != shader->state.tokens) {
+      tgsi_exec_machine_bind_shader(evs->machine,
+                                    shader->state.tokens,
+                                    PIPE_MAX_SAMPLERS,
+                                    NULL /*samplers*/ );
+      evs->machine_tokens = shader->state.tokens;
+   }
 }
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
index 20fc87b39d2..1bf22a2ec09 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -129,7 +129,7 @@ pb_malloc_buffer_create(size_t size,
 
 
 static struct pb_buffer *
-pb_malloc_buffer_create_buffer(struct pb_manager *mgr, 
+pb_malloc_bufmgr_create_buffer(struct pb_manager *mgr, 
                                size_t size,
                                const struct pb_desc *desc) 
 {
@@ -138,6 +138,13 @@ pb_malloc_buffer_create_buffer(struct pb_manager *mgr,
 
 
 static void
+pb_malloc_bufmgr_flush(struct pb_manager *mgr) 
+{
+   /* No-op */
+}
+
+
+static void
 pb_malloc_bufmgr_destroy(struct pb_manager *mgr) 
 {
    /* No-op */
@@ -146,8 +153,9 @@ pb_malloc_bufmgr_destroy(struct pb_manager *mgr)
 
 static struct pb_manager 
 pb_malloc_bufmgr = {
-   pb_malloc_buffer_create_buffer,
-   pb_malloc_bufmgr_destroy
+   pb_malloc_bufmgr_destroy,
+   pb_malloc_bufmgr_create_buffer,
+   pb_malloc_bufmgr_flush
 };
 
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
index 32867029ee7..cafbee045ae 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -69,13 +69,22 @@ struct pipe_winsys;
  */
 struct pb_manager
 {
+   void
+   (*destroy)( struct pb_manager *mgr );
+
    struct pb_buffer *
    (*create_buffer)( struct pb_manager *mgr, 
 	             size_t size,
 	             const struct pb_desc *desc);
 
+   /**
+    * Flush all temporary-held buffers.
+    * 
+    * Used mostly to aid debugging memory issues or to clean up resources when 
+    * the drivers are long lived.
+    */
    void
-   (*destroy)( struct pb_manager *mgr );
+   (*flush)( struct pb_manager *mgr );
 };
 
 
@@ -153,9 +162,6 @@ struct pb_manager *
 pb_cache_manager_create(struct pb_manager *provider, 
                      	unsigned usecs); 
 
-void
-pb_cache_flush(struct pb_manager *mgr);
-
 
 /** 
  * Fenced buffer manager.
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
index 2afaeafa1a1..c956924cc76 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_alt.c
@@ -76,6 +76,21 @@ pb_alt_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_alt_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_alt_manager *mgr = pb_alt_manager(_mgr);
+   
+   assert(mgr->provider1->flush);
+   if(mgr->provider1->flush)
+      mgr->provider1->flush(mgr->provider1);
+   
+   assert(mgr->provider2->flush);
+   if(mgr->provider2->flush)
+      mgr->provider2->flush(mgr->provider2);
+}
+
+
+static void
 pb_alt_manager_destroy(struct pb_manager *mgr)
 {
    FREE(mgr);
@@ -97,6 +112,7 @@ pb_alt_manager_create(struct pb_manager *provider1,
 
    mgr->base.destroy = pb_alt_manager_destroy;
    mgr->base.create_buffer = pb_alt_manager_create_buffer;
+   mgr->base.flush = pb_alt_manager_flush;
    mgr->provider1 = provider1;
    mgr->provider2 = provider2;
       
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
index 1ec422fb19e..8f118874ec3 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_cache.c
@@ -306,8 +306,8 @@ pb_cache_manager_create_buffer(struct pb_manager *_mgr,
 }
 
 
-void
-pb_cache_flush(struct pb_manager *_mgr)
+static void
+pb_cache_manager_flush(struct pb_manager *_mgr)
 {
    struct pb_cache_manager *mgr = pb_cache_manager(_mgr);
    struct list_head *curr, *next;
@@ -323,13 +323,17 @@ pb_cache_flush(struct pb_manager *_mgr)
       next = curr->next;
    }
    pipe_mutex_unlock(mgr->mutex);
+   
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
 }
 
 
 static void
 pb_cache_manager_destroy(struct pb_manager *mgr)
 {
-   pb_cache_flush(mgr);
+   pb_cache_manager_flush(mgr);
    FREE(mgr);
 }
 
@@ -349,6 +353,7 @@ pb_cache_manager_create(struct pb_manager *provider,
 
    mgr->base.destroy = pb_cache_manager_destroy;
    mgr->base.create_buffer = pb_cache_manager_create_buffer;
+   mgr->base.flush = pb_cache_manager_flush;
    mgr->provider = provider;
    mgr->usecs = usecs;
    LIST_INITHEAD(&mgr->delayed);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
index 5f1ed3e5a8a..1675e6e1822 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_debug.c
@@ -314,6 +314,16 @@ pb_debug_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_debug_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_debug_manager *mgr = pb_debug_manager(_mgr);
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
+}
+
+
+static void
 pb_debug_manager_destroy(struct pb_manager *_mgr)
 {
    struct pb_debug_manager *mgr = pb_debug_manager(_mgr);
@@ -336,6 +346,7 @@ pb_debug_manager_create(struct pb_manager *provider, size_t band_size)
 
    mgr->base.destroy = pb_debug_manager_destroy;
    mgr->base.create_buffer = pb_debug_manager_create_buffer;
+   mgr->base.flush = pb_debug_manager_flush;
    mgr->provider = provider;
    mgr->band_size = band_size;
       
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
index 8fc63ce648c..633ee70a75b 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -95,6 +95,19 @@ fenced_bufmgr_create_buffer(struct pb_manager *mgr,
 
 
 static void
+fenced_bufmgr_flush(struct pb_manager *mgr)
+{
+   struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
+
+   fenced_buffer_list_check_free(fenced_mgr->fenced_list, TRUE);
+
+   assert(fenced_mgr->provider->flush);
+   if(fenced_mgr->provider->flush)
+      fenced_mgr->provider->flush(fenced_mgr->provider);
+}
+
+
+static void
 fenced_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
@@ -123,6 +136,7 @@ fenced_bufmgr_create(struct pb_manager *provider,
 
    fenced_mgr->base.destroy = fenced_bufmgr_destroy;
    fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer;
+   fenced_mgr->base.flush = fenced_bufmgr_flush;
 
    fenced_mgr->provider = provider;
    fenced_mgr->fenced_list = fenced_buffer_list_create(winsys);
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
index e8c7f8e1f82..fe80ca30eea 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -200,6 +200,13 @@ mm_bufmgr_create_buffer(struct pb_manager *mgr,
 
 
 static void
+mm_bufmgr_flush(struct pb_manager *mgr)
+{
+   /* No-op */
+}
+
+
+static void
 mm_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct mm_pb_manager *mm = mm_pb_manager(mgr);
@@ -230,8 +237,9 @@ mm_bufmgr_create_from_buffer(struct pb_buffer *buffer,
    if (!mm)
       return NULL;
 
-   mm->base.create_buffer = mm_bufmgr_create_buffer;
    mm->base.destroy = mm_bufmgr_destroy;
+   mm->base.create_buffer = mm_bufmgr_create_buffer;
+   mm->base.flush = mm_bufmgr_flush;
 
    mm->size = size;
    mm->align2 = align2; /* 64-byte alignment */
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
index 3ef72c5bbb3..61ac291ed77 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -203,6 +203,13 @@ pool_bufmgr_create_buffer(struct pb_manager *mgr,
 
 
 static void
+pool_bufmgr_flush(struct pb_manager *mgr)
+{
+   /* No-op */
+}
+
+
+static void
 pool_bufmgr_destroy(struct pb_manager *mgr)
 {
    struct pool_pb_manager *pool = pool_pb_manager(mgr);
@@ -238,6 +245,7 @@ pool_bufmgr_create(struct pb_manager *provider,
 
    pool->base.destroy = pool_bufmgr_destroy;
    pool->base.create_buffer = pool_bufmgr_create_buffer;
+   pool->base.flush = pool_bufmgr_flush;
 
    LIST_INITHEAD(&pool->free);
 
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
index 8698c4cff62..2a801549202 100644
--- a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_slab.c
@@ -407,6 +407,17 @@ pb_slab_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_slab_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
+
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
+}
+
+
+static void
 pb_slab_manager_destroy(struct pb_manager *_mgr)
 {
    struct pb_slab_manager *mgr = pb_slab_manager(_mgr);
@@ -430,6 +441,7 @@ pb_slab_manager_create(struct pb_manager *provider,
 
    mgr->base.destroy = pb_slab_manager_destroy;
    mgr->base.create_buffer = pb_slab_manager_create_buffer;
+   mgr->base.flush = pb_slab_manager_flush;
 
    mgr->provider = provider;
    mgr->bufSize = bufSize;
@@ -466,6 +478,19 @@ pb_slab_range_manager_create_buffer(struct pb_manager *_mgr,
 
 
 static void
+pb_slab_range_manager_flush(struct pb_manager *_mgr)
+{
+   struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr);
+
+   /* Individual slabs don't hold any temporary buffers so no need to call them */
+   
+   assert(mgr->provider->flush);
+   if(mgr->provider->flush)
+      mgr->provider->flush(mgr->provider);
+}
+
+
+static void
 pb_slab_range_manager_destroy(struct pb_manager *_mgr)
 {
    struct pb_slab_range_manager *mgr = pb_slab_range_manager(_mgr);
@@ -499,6 +524,7 @@ pb_slab_range_manager_create(struct pb_manager *provider,
 
    mgr->base.destroy = pb_slab_range_manager_destroy;
    mgr->base.create_buffer = pb_slab_range_manager_create_buffer;
+   mgr->base.flush = pb_slab_range_manager_flush;
 
    mgr->provider = provider;
    mgr->minBufSize = minBufSize;
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.c b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
new file mode 100644
index 00000000000..534a23568d5
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.c
@@ -0,0 +1,365 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#include "util/u_memory.h"
+#include "pipe/p_debug.h"
+#include "rtasm_ppc.h"
+
+
+void
+ppc_init_func(struct ppc_function *p, unsigned max_inst)
+{
+    p->store = align_malloc(max_inst * PPC_INST_SIZE, 16);
+    p->num_inst = 0;
+    p->max_inst = max_inst;
+    p->vec_used = ~0;
+}
+
+
+void
+ppc_release_func(struct ppc_function *p)
+{
+    assert(p->num_inst <= p->max_inst);
+    if (p->store != NULL) {
+        align_free(p->store);
+    }
+    p->store = NULL;
+}
+
+
+/**
+ * Alloate a vector register.
+ * \return register index or -1 if none left.
+ */
+int
+ppc_allocate_vec_register(struct ppc_function *p, int reg)
+{
+   unsigned i;
+   for (i = 0; i < PPC_NUM_VEC_REGS; i++) {
+      const uint64_t mask = 1 << i;
+      if ((p->vec_used & mask) != 0) {
+         p->vec_used &= ~mask;
+         return i;
+      }
+   }
+
+   return -1;
+}
+
+
+/**
+ * Mark the given vector register as "unallocated".
+ */
+void
+ppc_release_vec_register(struct ppc_function *p, int reg)
+{
+   assert(reg < PPC_NUM_VEC_REGS);
+   assert((p->vec_used & (1 << reg)) == 0);
+
+   p->vec_used |= (1 << reg);
+}
+
+
+
+union vx_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned op2:11;
+   } inst;
+};
+
+union vxr_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned rC:1;
+      unsigned op2:10;
+   } inst;
+};
+
+union va_inst {
+   uint32_t bits;
+   struct {
+      unsigned op:6;
+      unsigned vD:5;
+      unsigned vA:5;
+      unsigned vB:5;
+      unsigned vC:5;
+      unsigned op2:6;
+   } inst;
+};
+
+
+static inline void
+emit_vx(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vx_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+static inline void
+emit_vxr(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB)
+{
+   union vxr_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.rC = 0;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+static inline void
+emit_va(struct ppc_function *p, uint op2, uint vD, uint vA, uint vB, uint vC)
+{
+   union va_inst inst;
+   inst.inst.op = 4;
+   inst.inst.vD = vD;
+   inst.inst.vA = vA;
+   inst.inst.vB = vB;
+   inst.inst.vC = vC;
+   inst.inst.op2 = op2;
+   p->store[p->num_inst++] = inst.bits;
+   assert(p->num_inst <= p->max_inst);
+};
+
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 10, vD, vA, vB);
+}
+
+/** vector float substract */
+void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 74, vD, vA, vB);
+}
+
+/** vector float min */
+void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1098, vD, vA, vB);
+}
+
+/** vector float max */
+void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1034, vD, vA, vB);
+}
+
+/** vector float mult add */
+void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 46, vD, vA, vB, vC);
+}
+
+/** vector float compare greater than */
+void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 710, vD, vA, vB);
+}
+
+/** vector float compare greater than or equal to */
+void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 454, vD, vA, vB);
+}
+
+/** vector float compare equal */
+void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vxr(p, 198, vD, vA, vB);
+}
+
+/** vector float 2^x */
+void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 394, vD, 0, vB);
+}
+
+/** vector float log2(x) */
+void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 458, vD, 0, vB);
+}
+
+/** vector float reciprocol */
+void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 266, vD, 0, vB);
+}
+
+/** vector float reciprocol sqrt estimate */
+void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 330, vD, 0, vB);
+}
+
+/** vector float round to negative infinity */
+void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 714, vD, 0, vB);
+}
+
+/** vector float round to positive infinity */
+void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 650, vD, 0, vB);
+}
+
+/** vector float round to nearest int */
+void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 522, vD, 0, vB);
+}
+
+/** vector float round to int toward zero */
+void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB)
+{
+   emit_vx(p, 586, vD, 0, vB);
+}
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1028, vD, vA, vB);
+}
+
+/** vector and complement */
+void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1092, vD, vA, vB);
+}
+
+/** vector or */
+void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1156, vD, vA, vB);
+}
+
+/** vector nor */
+void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1284, vD, vA, vB);
+}
+
+/** vector xor */
+void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB)
+{
+   emit_vx(p, 1220, vD, vA, vB);
+}
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 43, vD, vA, vB, vC);
+}
+
+/** vector select */
+void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC)
+{
+   emit_va(p, 42, vD, vA, vB, vC);
+}
+
+/** vector splat byte */
+void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 42, vD, imm, vB);
+}
+
+/** vector splat half word */
+void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 588, vD, imm, vB);
+}
+
+/** vector splat word */
+void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm)
+{
+   emit_vx(p, 652, vD, imm, vB);
+}
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc.h b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
new file mode 100644
index 00000000000..ed14e943df6
--- /dev/null
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc.h
@@ -0,0 +1,181 @@
+/**************************************************************************
+ *
+ * Copyright (C) 2008 Tungsten Graphics, Inc.   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * PPC code generation.
+ * \author Brian Paul
+ */
+
+
+#ifndef RTASM_PPC_H
+#define RTASM_PPC_H
+
+
+#include "pipe/p_compiler.h"
+
+
+#define PPC_INST_SIZE 4  /**< 4 bytes / instruction */
+
+#define PPC_NUM_VEC_REGS 32
+
+
+struct ppc_function
+{
+   uint32_t *store;  /**< instruction buffer */
+   uint num_inst;
+   uint max_inst;
+   uint32_t vec_used;   /** used/free vector registers bitmask */
+   uint32_t reg_used;   /** used/free general-purpose registers bitmask */
+};
+
+
+
+extern void ppc_init_func(struct ppc_function *p, unsigned max_inst);
+extern void ppc_release_func(struct ppc_function *p);
+
+extern int ppc_allocate_vec_register(struct ppc_function *p, int reg);
+extern void ppc_release_vec_register(struct ppc_function *p, int reg);
+
+
+/**
+ ** float vector arithmetic
+ **/
+
+/** vector float add */
+extern void
+ppc_vaddfp(struct ppc_function *p,uint vD, uint vA, uint vB);
+
+/** vector float substract */
+extern void
+ppc_vsubfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float min */
+extern void
+ppc_vminfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float max */
+extern void
+ppc_vmaxfp(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float mult add */
+extern void
+ppc_vmaddfp(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector float compare greater than */
+extern void
+ppc_vcmpgtfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare greater than or equal to */
+extern void
+ppc_vcmpgefpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float compare equal */
+extern void
+ppc_vcmpeqfpx(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector float 2^x */
+extern void
+ppc_vexptefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float log2(x) */
+extern void
+ppc_vlogefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol */
+extern void
+ppc_vrefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float reciprocol sqrt estimate */
+extern void
+ppc_vrsqrtefp(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to negative infinity */
+extern void
+ppc_vrfim(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to positive infinity */
+extern void
+ppc_vrfip(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to nearest int */
+extern void
+ppc_vrfin(struct ppc_function *p, uint vD, uint vB);
+
+/** vector float round to int toward zero */
+extern void
+ppc_vrfiz(struct ppc_function *p, uint vD, uint vB);
+
+
+
+/**
+ ** bitwise operations
+ **/
+
+
+/** vector and */
+extern void
+ppc_vand(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector and complement */
+extern void
+ppc_vandc(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector or */
+extern void
+ppc_vor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector nor */
+extern void
+ppc_vnor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+/** vector xor */
+extern void
+ppc_vxor(struct ppc_function *p, uint vD, uint vA, uint vB);
+
+
+/**
+ ** Vector shuffle / select / splat / etc
+ **/
+
+/** vector permute */
+extern void
+ppc_vperm(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector select */
+extern void
+ppc_vsel(struct ppc_function *p, uint vD, uint vA, uint vB, uint vC);
+
+/** vector splat byte */
+extern void
+ppc_vspltb(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat half word */
+extern void
+ppc_vsplthw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+/** vector splat word */
+extern void
+ppc_vspltw(struct ppc_function *p, uint vD, uint vB, uint imm);
+
+
+#endif /* RTASM_PPC_H */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
index a04cc6c4ff7..491141f1908 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.c
@@ -27,12 +27,16 @@
  * Real-time assembly generation interface for Cell B.E. SPEs.
  *
  * \author Ian Romanick <[email protected]>
+ * \author Brian Paul
  */
 
+
+#include <stdio.h>
 #include "pipe/p_compiler.h"
 #include "util/u_memory.h"
 #include "rtasm_ppc_spe.h"
 
+
 #ifdef GALLIUM_CELL
 /**
  * SPE instruction types
@@ -143,8 +147,25 @@ union spe_inst_RI18 {
 /*@}*/
 
 
+static void
+indent(const struct spe_function *p)
+{
+   int i;
+   for (i = 0; i < p->indent; i++) {
+      putchar(' ');
+   }
+}
+
+
+static const char *
+rem_prefix(const char *longname)
+{
+   return longname + 4;
+}
+
+
 static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB)
+		    unsigned rA, unsigned rB, const char *name)
 {
     union spe_inst_RR inst;
     inst.inst.op = op;
@@ -153,11 +174,15 @@ static void emit_RR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB);
+    }
 }
 
 
 static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
-		    unsigned rA, unsigned rB, unsigned rC)
+                     unsigned rA, unsigned rB, unsigned rC, const char *name)
 {
     union spe_inst_RRR inst;
     inst.inst.op = op;
@@ -167,11 +192,15 @@ static void emit_RRR(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rC = rC;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, $%d, $%d\n", rem_prefix(name), rT, rA, rB, rC);
+    }
 }
 
 
 static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI7 inst;
     inst.inst.op = op;
@@ -180,12 +209,16 @@ static void emit_RI7(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 
 static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
-		     unsigned rA, int imm)
+		     unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI8 inst;
     inst.inst.op = op;
@@ -194,12 +227,16 @@ static void emit_RI8(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 
 static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
-		      unsigned rA, int imm)
+		      unsigned rA, int imm, const char *name)
 {
     union spe_inst_RI10 inst;
     inst.inst.op = op;
@@ -208,11 +245,19 @@ static void emit_RI10(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       if (strcmp(name, "spe_lqd") == 0 ||
+           strcmp(name, "spe_stqd") == 0)
+          printf("%s\t$%d, 0x%x($%d)\n", rem_prefix(name), rT, imm, rA);
+       else
+          printf("%s\t$%d, $%d, 0x%x\n", rem_prefix(name), rT, rA, imm);
+    }
 }
 
 
 static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI16 inst;
     inst.inst.op = op;
@@ -220,11 +265,15 @@ static void emit_RI16(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
 static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
-		      int imm)
+		      int imm, const char *name)
 {
     union spe_inst_RI18 inst;
     inst.inst.op = op;
@@ -232,6 +281,10 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
     inst.inst.rT = rT;
     p->store[p->num_inst++] = inst.bits;
     assert(p->num_inst <= p->max_inst);
+    if (p->print) {
+       indent(p);
+       printf("%s\t$%d, 0x%x\n", rem_prefix(name), rT, imm);
+    }
 }
 
 
@@ -240,61 +293,61 @@ static void emit_RI18(struct spe_function *p, unsigned op, unsigned rT,
 #define EMIT_(_name, _op) \
 void _name (struct spe_function *p, unsigned rT) \
 { \
-    emit_RR(p, _op, rT, 0, 0); \
+   emit_RR(p, _op, rT, 0, 0, __FUNCTION__); \
 }
 
 #define EMIT_R(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA) \
 { \
-    emit_RR(p, _op, rT, rA, 0); \
+   emit_RR(p, _op, rT, rA, 0, __FUNCTION__);                 \
 }
 
 #define EMIT_RR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB) \
 { \
-    emit_RR(p, _op, rT, rA, rB); \
+   emit_RR(p, _op, rT, rA, rB, __FUNCTION__);                \
 }
 
 #define EMIT_RRR(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, unsigned rB, unsigned rC) \
 { \
-    emit_RRR(p, _op, rT, rA, rB, rC); \
+   emit_RRR(p, _op, rT, rA, rB, rC, __FUNCTION__);           \
 }
 
 #define EMIT_RI7(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI7(p, _op, rT, rA, imm); \
+   emit_RI7(p, _op, rT, rA, imm, __FUNCTION__);              \
 }
 
 #define EMIT_RI8(_name, _op, bias) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI8(p, _op, rT, rA, bias - imm); \
+   emit_RI8(p, _op, rT, rA, bias - imm, __FUNCTION__);       \
 }
 
 #define EMIT_RI10(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, unsigned rA, int imm) \
 { \
-    emit_RI10(p, _op, rT, rA, imm); \
+   emit_RI10(p, _op, rT, rA, imm, __FUNCTION__);             \
 }
 
 #define EMIT_RI16(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI16(p, _op, rT, imm); \
+   emit_RI16(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_RI18(_name, _op) \
 void _name (struct spe_function *p, unsigned rT, int imm) \
 { \
-    emit_RI18(p, _op, rT, imm); \
+   emit_RI18(p, _op, rT, imm, __FUNCTION__);                 \
 }
 
 #define EMIT_I16(_name, _op) \
 void _name (struct spe_function *p, int imm) \
 { \
-    emit_RI16(p, _op, 0, imm); \
+   emit_RI16(p, _op, 0, imm, __FUNCTION__);                  \
 }
 
 #include "rtasm_ppc_spe.h"
@@ -314,6 +367,9 @@ void spe_init_func(struct spe_function *p, unsigned code_size)
      */
     p->regs[0] = ~7;
     p->regs[1] = (1U << (80 - 64)) - 1;
+
+    p->print = false;
+    p->indent = 0;
 }
 
 
@@ -327,8 +383,15 @@ void spe_release_func(struct spe_function *p)
 }
 
 
+/** Return current code size in bytes. */
+unsigned spe_code_size(const struct spe_function *p)
+{
+   return p->num_inst * SPE_INST_SIZE;
+}
+
+
 /**
- * Alloate a SPE register.
+ * Allocate a SPE register.
  * \return register index or -1 if none left.
  */
 int spe_allocate_available_register(struct spe_function *p)
@@ -382,6 +445,32 @@ void spe_release_register(struct spe_function *p, int reg)
 }
 
 
+void
+spe_print_code(struct spe_function *p, boolean enable)
+{
+   p->print = enable;
+}
+
+
+void
+spe_indent(struct spe_function *p, int spaces)
+{
+   p->indent += spaces;
+}
+
+
+extern void
+spe_comment(struct spe_function *p, int rel_indent, const char *s)
+{
+   if (p->print) {
+      p->indent += rel_indent;
+      indent(p);
+      p->indent -= rel_indent;
+      printf("# %s\n", s);
+   }
+}
+
+
 /**
  * For branch instructions:
  * \param d  if 1, disable interupts if branch is taken
@@ -392,51 +481,51 @@ void spe_release_register(struct spe_function *p, int reg)
 /** Branch Indirect to address in rA */
 void spe_bi(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a8, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Interupt Return */
 void spe_iret(struct spe_function *p, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1aa, 0, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link on external data */
 void spe_bisled(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1ab, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect and set link.  Save PC in rT, jump to rA. */
 void spe_bisl(struct spe_function *p, unsigned rT, unsigned rA, int d,
 		int e)
 {
-    emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x1a9, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero word.  If rT.word[0]==0, jump to rA. */
 void spe_biz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x128, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero word.  If rT.word[0]!=0, jump to rA. */
 void spe_binz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x129, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if zero halfword.  If rT.halfword[1]==0, jump to rA. */
 void spe_bihz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12a, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 /** Branch indirect if non-zero halfword.  If rT.halfword[1]!=0, jump to rA. */
 void spe_bihnz(struct spe_function *p, unsigned rT, unsigned rA, int d, int e)
 {
-    emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4));
+   emit_RI7(p, 0x12b, rT, rA, (d << 5) | (e << 4), __FUNCTION__);
 }
 
 
@@ -505,7 +594,28 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
    }
    else {
       spe_ilhu(p, rT, i >> 16);
-      spe_iohl(p, rT, i & 0xffff);
+      if (i & 0xffff)
+         spe_iohl(p, rT, i & 0xffff);
+   }
+}
+
+void spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui)
+{
+   /* If the whole value is in the lower 18 bits, use ila, which
+    * doesn't sign-extend.  Otherwise, if the two halfwords of
+    * the constant are identical, use ilh.  Otherwise, we have
+    * to use ilhu followed by iohl.
+    */
+   if ((ui & 0xfffc0000) == ui) {
+      spe_ila(p, rT, ui);
+   }
+   else if ((ui >> 16) == (ui & 0xffff)) {
+      spe_ilh(p, rT, ui & 0xffff);
+   }
+   else {
+      spe_ilhu(p, rT, ui >> 16);
+      if (ui & 0xffff)
+         spe_iohl(p, rT, ui & 0xffff);
    }
 }
 
@@ -513,22 +623,29 @@ spe_load_int(struct spe_function *p, unsigned rT, int i)
 void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ila(p, rT, 66051);
+   /* Duplicate bytes 0, 1, 2, and 3 across the whole register */
+   spe_ila(p, rT, 0x00010203);
    spe_shufb(p, rT, rA, rA, rT);
 }
 
 
 void
-spe_complement(struct spe_function *p, unsigned rT)
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_nor(p, rT, rT, rT);
+   spe_nor(p, rT, rA, rA);
 }
 
 
 void
 spe_move(struct spe_function *p, unsigned rT, unsigned rA)
 {
-   spe_ori(p, rT, rA, 0);
+   /* Use different instructions depending on the instruction address
+    * to take advantage of the dual pipelines.
+    */
+   if (p->num_inst & 1)
+      spe_shlqbyi(p, rT, rA, 0);  /* odd pipe */
+   else
+      spe_ori(p, rT, rA, 0);  /* even pipe */
 }
 
 
@@ -539,4 +656,70 @@ spe_zero(struct spe_function *p, unsigned rT)
 }
 
 
+void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word)
+{
+   assert(word >= 0);
+   assert(word <= 3);
+
+   if (word == 0) {
+      int tmp1 = rT;
+      spe_ila(p, tmp1, 66051);
+      spe_shufb(p, rT, rA, rA, tmp1);
+   }
+   else {
+      /* XXX review this, we may not need the rotqbyi instruction */
+      int tmp1 = rT;
+      int tmp2 = spe_allocate_available_register(p);
+
+      spe_ila(p, tmp1, 66051);
+      spe_rotqbyi(p, tmp2, rA, 4 * word);
+      spe_shufb(p, rT, tmp2, tmp2, tmp1);
+
+      spe_release_register(p, tmp2);
+   }
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the smaller of the
+ * two, compositing them into the rT register.
+ * 
+ * The Float Compare Greater Than (fcgt) instruction will put 1s into
+ * compare_reg where rA > rB, and 0s where rA <= rB.
+ *
+ * Then the Select Bits (selb) instruction will take bits from rA where
+ * compare_reg is 0, and from rB where compare_reg is 1; i.e., from rA
+ * where rA <= rB and from rB where rB > rA, which is exactly the
+ * "min" operation.
+ *
+ * The compare_reg could in many cases be the same as rT, unless
+ * rT == rA || rt == rB.  But since this is common in constructions
+ * like "x = min(x, a)", we always allocate a new register to be safe.
+ */
+void 
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rA, rB, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
+/**
+ * For each 32-bit float element of rA and rB, choose the greater of the
+ * two, compositing them into the rT register.
+ * 
+ * The logic is similar to that of spe_float_min() above; the only
+ * difference is that the registers on spe_selb() have been reversed,
+ * so that the larger of the two is selected instead of the smaller.
+ */
+void 
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB)
+{
+   unsigned int compare_reg = spe_allocate_available_register(p);
+   spe_fcgt(p, compare_reg, rA, rB);
+   spe_selb(p, rT, rB, rA, compare_reg);
+   spe_release_register(p, compare_reg);
+}
+
 #endif /* GALLIUM_CELL */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
index d95e5aace34..61c7edeb604 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_ppc_spe.h
@@ -28,6 +28,7 @@
  * For details, see /opt/cell/sdk/docs/arch/SPU_ISA_v1.2_27Jan2007_pub.pdf
  *
  * \author Ian Romanick <[email protected]>
+ * \author Brian Paul
  */
 
 #ifndef RTASM_PPC_SPE_H
@@ -39,10 +40,10 @@
 /** number of general-purpose SIMD registers */
 #define SPE_NUM_REGS  128
 
-/** Return Address register */
+/** Return Address register (aka $lr / Link Register) */
 #define SPE_REG_RA  0
 
-/** Stack Pointer register */
+/** Stack Pointer register (aka $sp) */
 #define SPE_REG_SP  1
 
 
@@ -63,15 +64,25 @@ struct spe_function
      * spe_release_register
      */
     uint64_t regs[SPE_NUM_REGS / 64];
+
+    boolean print; /**< print/dump instructions as they're emitted? */
+    int indent;    /**< number of spaces to indent */
 };
 
+
 extern void spe_init_func(struct spe_function *p, unsigned code_size);
 extern void spe_release_func(struct spe_function *p);
+extern unsigned spe_code_size(const struct spe_function *p);
 
 extern int spe_allocate_available_register(struct spe_function *p);
 extern int spe_allocate_register(struct spe_function *p, int reg);
 extern void spe_release_register(struct spe_function *p, int reg);
 
+extern void spe_print_code(struct spe_function *p, boolean enable);
+extern void spe_indent(struct spe_function *p, int spaces);
+extern void spe_comment(struct spe_function *p, int rel_indent, const char *s);
+
+
 #endif /* RTASM_PPC_SPE_H */
 
 #ifndef EMIT_
@@ -292,13 +303,17 @@ spe_load_float(struct spe_function *p, unsigned rT, float x);
 extern void
 spe_load_int(struct spe_function *p, unsigned rT, int i);
 
+/** Load/splat immediate unsigned int into rT. */
+extern void
+spe_load_uint(struct spe_function *p, unsigned rT, unsigned int ui);
+
 /** Replicate word 0 of rA across rT. */
 extern void
 spe_splat(struct spe_function *p, unsigned rT, unsigned rA);
 
-/** Complement/invert all bits in rT. */
+/** rT = complement_all_bits(rA). */
 extern void
-spe_complement(struct spe_function *p, unsigned rT);
+spe_complement(struct spe_function *p, unsigned rT, unsigned rA);
 
 /** rT = rA. */
 extern void
@@ -308,6 +323,18 @@ spe_move(struct spe_function *p, unsigned rT, unsigned rA);
 extern void
 spe_zero(struct spe_function *p, unsigned rT);
 
+/** rT = splat(rA, word) */
+extern void
+spe_splat_word(struct spe_function *p, unsigned rT, unsigned rA, int word);
+
+/** rT = float min(rA, rB) */
+extern void
+spe_float_min(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
+/** rT = float max(rA, rB) */
+extern void
+spe_float_max(struct spe_function *p, unsigned rT, unsigned rA, unsigned rB);
+
 
 /* Floating-point instructions
  */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 6d4c081e04e..3bba9dcc076 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -240,7 +240,8 @@ static void emit_modrm( struct x86_function *p,
    /* Oh-oh we've stumbled into the SIB thing.
     */
    if (regmem.file == file_REG32 &&
-       regmem.idx == reg_SP) {
+       regmem.idx == reg_SP &&
+       regmem.mod != mod_REG) {
       emit_1ub(p, 0x24);		/* simplistic! */
    }
 
@@ -435,25 +436,70 @@ void x86_call( struct x86_function *p, struct x86_reg reg)
 }
 
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
 {
    DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
    emit_1ub(p, 0xb8 + dst.idx);
    emit_1i(p, imm);
 }
 
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+/**
+ * Immediate group 1 instructions.
+ */
+static INLINE void 
+x86_group1_imm( struct x86_function *p, 
+                unsigned op, struct x86_reg dst, int imm )
 {
-   DUMP_RI( dst, imm );
+   assert(dst.file == file_REG32);
    assert(dst.mod == mod_REG);
-   emit_1ub(p, 0x80);
-   emit_modrm_noreg(p, 0, dst);
-   emit_1ub(p, imm);
+   if(-0x80 <= imm && imm < 0x80) {
+      emit_1ub(p, 0x83);
+      emit_modrm_noreg(p, op, dst);
+      emit_1b(p, (char)imm);
+   }
+   else {
+      emit_1ub(p, 0x81);
+      emit_modrm_noreg(p, op, dst);
+      emit_1i(p, imm);
+   }
+}
+
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 0, dst, imm);
+}
+
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 1, dst, imm);
+}
+
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 4, dst, imm);
+}
+
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 5, dst, imm);
+}
+
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 6, dst, imm);
+}
+
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm )
+{
+   DUMP_RI( dst, imm );
+   x86_group1_imm(p, 7, dst, imm);
 }
 
 
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index af94577aaba..510aa1b0dec 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -152,12 +152,13 @@ void x86_jmp( struct x86_function *p, int label );
 /* void x86_call( struct x86_function *p, void (*label)() ); */
 void x86_call( struct x86_function *p, struct x86_reg reg);
 
-/* michal:
- * Temporary. As I need immediate operands, and dont want to mess with the codegen,
- * I load the immediate into general purpose register and use it.
- */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
-void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
+void x86_add_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_or_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_and_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_sub_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_xor_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_cmp_imm( struct x86_function *p, struct x86_reg dst, int imm );
 
 
 /* Macro for sse_shufps() and sse2_pshufd():
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
index 806a2bd4c52..c7155a93168 100644
--- a/src/gallium/auxiliary/tgsi/Makefile
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -4,6 +4,7 @@ include $(TOP)/configs/current
 LIBNAME = tgsi
 
 C_SOURCES = \
+	tgsi_sanity.c \
 	tgsi_build.c \
 	tgsi_dump.c \
 	tgsi_exec.c \
@@ -12,6 +13,7 @@ C_SOURCES = \
 	tgsi_parse.c \
 	tgsi_scan.c \
 	tgsi_sse2.c \
+	tgsi_text.c \
 	tgsi_transform.c \
 	tgsi_util.c
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_build.c b/src/gallium/auxiliary/tgsi/tgsi_build.c
index 74614d36884..38fcaf88293 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_build.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_build.c
@@ -793,10 +793,14 @@ tgsi_default_instruction_ext_nv( void )
    return instruction_ext_nv;
 }
 
-union token_u32
+
+/** test for inequality of 32-bit values pointed to by a and b */
+static INLINE boolean
+compare32(const void *a, const void *b)
 {
-   unsigned u32;
-};
+   return *((uint32_t *) a) != *((uint32_t *) b);
+}
+
 
 unsigned
 tgsi_compare_instruction_ext_nv(
@@ -805,7 +809,7 @@ tgsi_compare_instruction_ext_nv(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_nv
@@ -864,7 +868,7 @@ tgsi_compare_instruction_ext_label(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_label
@@ -905,7 +909,7 @@ tgsi_compare_instruction_ext_texture(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_instruction_ext_texture
@@ -1027,7 +1031,7 @@ tgsi_compare_src_register_ext_swz(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_swz
@@ -1095,7 +1099,7 @@ tgsi_compare_src_register_ext_mod(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_src_register_ext_mod
@@ -1241,7 +1245,7 @@ tgsi_compare_dst_register_ext_concode(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_concode
@@ -1299,7 +1303,7 @@ tgsi_compare_dst_register_ext_modulate(
 {
    a.Padding = b.Padding = 0;
    a.Extended = b.Extended = 0;
-   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+   return compare32(&a, &b);
 }
 
 struct tgsi_dst_register_ext_modulate
diff --git a/src/gallium/auxiliary/tgsi/tgsi_dump.c b/src/gallium/auxiliary/tgsi/tgsi_dump.c
index afc8ffa553c..3177f549523 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_dump.c
@@ -68,6 +68,7 @@ dump_enum(
 #define CHR(C)          ctx->printf( ctx, "%c", C )
 #define UIX(I)          ctx->printf( ctx, "0x%x", I )
 #define UID(I)          ctx->printf( ctx, "%u", I )
+#define INSTID(I)          ctx->printf( ctx, "% 3u", I )
 #define SID(I)          ctx->printf( ctx, "%d", I )
 #define FLT(F)          ctx->printf( ctx, "%10.4f", F )
 #define ENM(E,ENUMS)    dump_enum( ctx, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
@@ -315,8 +316,8 @@ iter_instruction(
    uint i;
    boolean first_reg = TRUE;
 
-   UID( instno );
-   CHR( ':' );
+   INSTID( instno );
+   TXT( ": " );
    TXT( tgsi_get_opcode_info( inst->Instruction.Opcode )->mnemonic );
 
    switch (inst->Instruction.Saturate) {
diff --git a/src/gallium/auxiliary/tgsi/tgsi_exec.c b/src/gallium/auxiliary/tgsi/tgsi_exec.c
index df002939c6b..1a5294eabc3 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_exec.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_exec.c
@@ -1674,6 +1674,7 @@ exec_declaration(
             break;
 
          default:
+            eval = NULL;
             assert( 0 );
          }
 
diff --git a/src/gallium/auxiliary/tgsi/tgsi_info.c b/src/gallium/auxiliary/tgsi/tgsi_info.c
index a4899cd4c2d..68c7a6b7f58 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_info.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_info.c
@@ -69,7 +69,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 1, 1, 0, 0, "COS" },
    { 1, 1, 0, 0, "DDX" },
    { 1, 1, 0, 0, "DDY" },
-   { 0, 1, 0, 0, "KILP" },
+   { 0, 0, 0, 0, "KILP" },
    { 1, 1, 0, 0, "PK2H" },
    { 1, 1, 0, 0, "PK2US" },
    { 1, 1, 0, 0, "PK4B" },
@@ -146,7 +146,7 @@ static const struct tgsi_opcode_info opcode_info[TGSI_OPCODE_LAST] =
    { 0, 1, 0, 0, "CALLNZ" },
    { 0, 1, 0, 0, "IFC" },
    { 0, 1, 0, 0, "BREAKC" },
-   { 0, 0, 0, 0, "KIL" },
+   { 0, 1, 0, 0, "KIL" },
    { 0, 0, 0, 0, "END" },
    { 1, 1, 0, 0, "SWZ" }
 };
diff --git a/src/gallium/auxiliary/tgsi/tgsi_parse.c b/src/gallium/auxiliary/tgsi/tgsi_parse.c
index 3757486ba9b..2cd56e413a5 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_parse.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_parse.c
@@ -88,16 +88,33 @@ tgsi_parse_end_of_tokens(
       1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
 }
 
+
+/**
+ * This function is used to avoid and work-around type punning/aliasing
+ * warnings.  The warnings seem harmless on x86 but on PPC they cause
+ * real failures.
+ */
+static INLINE void
+copy_token(void *dst, const void *src)
+{
+   memcpy(dst, src, 4);
+}
+
+
+/**
+ * Get next 4-byte token, return it at address specified by 'token'
+ */
 static void
 next_token(
    struct tgsi_parse_context *ctx,
    void *token )
 {
    assert( !tgsi_parse_end_of_tokens( ctx ) );
-
-   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+   copy_token(token, &ctx->Tokens[ctx->Position]);
+   ctx->Position++;
 }
 
+
 void
 tgsi_parse_token(
    struct tgsi_parse_context *ctx )
@@ -116,7 +133,7 @@ tgsi_parse_token(
       struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
 
       *decl = tgsi_default_full_declaration();
-      decl->Declaration = *(struct tgsi_declaration *) &token;
+      copy_token(&decl->Declaration, &token);
 
       next_token( ctx, &decl->DeclarationRange );
 
@@ -132,8 +149,7 @@ tgsi_parse_token(
       struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
 
       *imm = tgsi_default_full_immediate();
-      imm->Immediate = *(struct tgsi_immediate *) &token;
-
+      copy_token(&imm->Immediate, &token);
       assert( !imm->Immediate.Extended );
 
       switch (imm->Immediate.DataType) {
@@ -158,8 +174,7 @@ tgsi_parse_token(
       unsigned extended;
 
       *inst = tgsi_default_full_instruction();
-      inst->Instruction = *(struct tgsi_instruction *) &token;
-
+      copy_token(&inst->Instruction, &token);
       extended = inst->Instruction.Extended;
 
       while( extended ) {
@@ -169,18 +184,15 @@ tgsi_parse_token(
 
          switch( token.Type ) {
          case TGSI_INSTRUCTION_EXT_TYPE_NV:
-            inst->InstructionExtNv =
-               *(struct tgsi_instruction_ext_nv *) &token;
+            copy_token(&inst->InstructionExtNv, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
-            inst->InstructionExtLabel =
-               *(struct tgsi_instruction_ext_label *) &token;
+            copy_token(&inst->InstructionExtLabel, &token);
             break;
 
          case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
-            inst->InstructionExtTexture =
-               *(struct tgsi_instruction_ext_texture *) &token;
+            copy_token(&inst->InstructionExtTexture, &token);
             break;
 
          default:
@@ -212,13 +224,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
-               inst->FullDstRegisters[i].DstRegisterExtConcode =
-                  *(struct tgsi_dst_register_ext_concode *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtConcode,
+                          &token);
                break;
 
             case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
-               inst->FullDstRegisters[i].DstRegisterExtModulate =
-                  *(struct tgsi_dst_register_ext_modulate *) &token;
+               copy_token(&inst->FullDstRegisters[i].DstRegisterExtModulate,
+                          &token);
                break;
 
             default:
@@ -245,13 +257,13 @@ tgsi_parse_token(
 
             switch( token.Type ) {
             case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
-               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
-                  *(struct tgsi_src_register_ext_swz *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtSwz,
+                          &token);
                break;
 
             case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
-               inst->FullSrcRegisters[i].SrcRegisterExtMod =
-                  *(struct tgsi_src_register_ext_mod *) &token;
+               copy_token(&inst->FullSrcRegisters[i].SrcRegisterExtMod,
+                          &token);
                break;
 
             default:
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sanity.c b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
index c6590272969..11659247c0c 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sanity.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sanity.c
@@ -152,6 +152,12 @@ check_register_usage(
 {
    if (!check_file_name( ctx, file ))
       return FALSE;
+
+   if (index < 0 || index > MAX_REGISTERS) {
+      report_error( ctx, "%s[%i]: Invalid index %s", file_names[file], index, name );
+      return FALSE;
+   }
+
    if (indirect_access) {
       if (!is_any_register_declared( ctx, file ))
          report_error( ctx, "%s: Undeclared %s register", file_names[file], name );
@@ -174,12 +180,10 @@ iter_instruction(
    const struct tgsi_opcode_info *info;
    uint i;
 
-   /* There must be no other instructions after END.
-    */
-   if (ctx->index_of_END != ~0) {
-      report_error( ctx, "Unexpected instruction after END" );
-   }
-   else if (inst->Instruction.Opcode == TGSI_OPCODE_END) {
+   if (inst->Instruction.Opcode == TGSI_OPCODE_END) {
+      if (ctx->index_of_END != ~0) {
+         report_error( ctx, "Too many END instructions" );
+      }
       ctx->index_of_END = ctx->num_instructions;
    }
 
@@ -301,10 +305,10 @@ epilog(
    struct sanity_check_ctx *ctx = (struct sanity_check_ctx *) iter;
    uint file;
 
-   /* There must be an END instruction at the end.
+   /* There must be an END instruction somewhere.
     */
-   if (ctx->index_of_END == ~0 || ctx->index_of_END != ctx->num_instructions - 1) {
-      report_error( ctx, "Expected END at end of instruction sequence" );
+   if (ctx->index_of_END == ~0) {
+      report_error( ctx, "Missing END instruction" );
    }
 
    /* Check if all declared registers were used.
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
index 4681b29f52b..4fdad3a5c78 100644
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
@@ -28,6 +28,7 @@
 #include "pipe/p_debug.h"
 #include "pipe/p_shader_tokens.h"
 #include "util/u_math.h"
+#include "util/u_sse.h"
 #include "tgsi/tgsi_parse.h"
 #include "tgsi/tgsi_util.h"
 #include "tgsi_exec.h"
@@ -480,10 +481,31 @@ emit_coef_dady(
  * Function call helpers.
  */
 
+/**
+ * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
+ * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
+ * that the stack pointer is 16 byte aligned, as expected.
+ */
 static void
-emit_push_gp(
-   struct x86_function *func )
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_save,
+   unsigned xmm_dst,
+   void (PIPE_CDECL *code)() )
 {
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned i, n, xmm;
+   unsigned xmm_mask;
+   
+   /* Bitmask of the xmm registers to save */
+   xmm_mask = (1 << xmm_save) - 1;
+   xmm_mask &= ~(1 << xmm_dst);
+
+   sse_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_AX) );
@@ -493,12 +515,49 @@ emit_push_gp(
    x86_push(
       func,
       x86_make_reg( file_REG32, reg_DX) );
-}
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i))
+         ++n;
+   
+   x86_sub_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
+
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
+            make_xmm( xmm ) );
+         ++n;
+      }
+   
+   x86_lea(
+      func,
+      ecx,
+      get_temp( TEMP_R0, 0 ) );
+   
+   x86_push( func, ecx );
+   x86_mov_reg_imm( func, ecx, (unsigned long) code );
+   x86_call( func, ecx );
+   x86_pop(func, ecx );
+   
+   for(i = 0, n = 0; i < 8; ++i)
+      if(xmm_mask & (1 << i)) {
+         sse_movups(
+            func,
+            make_xmm( xmm ),
+            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
+         ++n;
+      }
+   
+   x86_add_imm(
+      func, 
+      x86_make_reg( file_REG32, reg_SP ),
+      n*16);
 
-static void
-x86_pop_gp(
-   struct x86_function *func )
-{
    /* Restore GP registers in a reverse order.
     */
    x86_pop(
@@ -510,39 +569,6 @@ x86_pop_gp(
    x86_pop(
       func,
       x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   void (PIPE_CDECL *code)() )
-{
-   sse_movaps(
-      func,
-      get_temp( TEMP_R0, 0 ),
-      make_xmm( xmm_dst ) );
-
-   emit_push_gp(
-      func );
-
-   {
-      struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-
-      x86_lea(
-         func,
-         ecx,
-         get_temp( TEMP_R0, 0 ) );
-
-      x86_push( func, ecx );
-      x86_mov_reg_imm( func, ecx, (unsigned long) code );
-      x86_call( func, ecx );
-      x86_pop(func, ecx ); 
-   }
-
-
-   x86_pop_gp(
-      func );
 
    sse_movaps(
       func,
@@ -553,6 +579,7 @@ emit_func_call_dst(
 static void
 emit_func_call_dst_src(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src,
    void (PIPE_CDECL *code)() )
@@ -564,10 +591,111 @@ emit_func_call_dst_src(
 
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       code );
 }
 
+/*
+ * Fast SSE2 implementation of special math functions.
+ */
+
+#define POLY0(x, c0) _mm_set1_ps(c0)
+#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
+#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
+#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
+#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
+#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
+
+#define EXP_POLY_DEGREE 3
+#define LOG_POLY_DEGREE 5
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+exp2f4(__m128 x)
+{
+   __m128i ipart;
+   __m128 fpart, expipart, expfpart;
+
+   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
+   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
+
+   /* ipart = int(x - 0.5) */
+   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
+
+   /* fpart = x - ipart */
+   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
+
+   /* expipart = (float) (1 << ipart) */
+   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
+
+   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
+#if EXP_POLY_DEGREE == 5
+   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
+#elif EXP_POLY_DEGREE == 4
+   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
+#elif EXP_POLY_DEGREE == 3
+   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
+#elif EXP_POLY_DEGREE == 2
+   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
+#else
+#error
+#endif
+
+   return _mm_mul_ps(expipart, expfpart);
+}
+
+/**
+ * See http://www.devmaster.net/forums/showthread.php?p=43580
+ */
+static INLINE __m128 
+log2f4(__m128 x)
+{
+   __m128i expmask = _mm_set1_epi32(0x7f800000);
+   __m128i mantmask = _mm_set1_epi32(0x007fffff);
+   __m128 one = _mm_set1_ps(1.0f);
+
+   __m128i i = _mm_castps_si128(x);
+
+   /* exp = (float) exponent(x) */
+   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
+
+   /* mant = (float) mantissa(x) */
+   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
+
+   __m128 logmant;
+
+   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
+    * These coefficients can be generate with 
+    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
+    */
+#if LOG_POLY_DEGREE == 6
+   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
+#elif LOG_POLY_DEGREE == 5
+   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
+#elif LOG_POLY_DEGREE == 4
+   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
+#elif LOG_POLY_DEGREE == 3
+   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
+#else
+#error
+#endif
+
+   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
+   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
+
+   return _mm_add_ps(logmant, exp);
+}
+
+static INLINE __m128
+powf4(__m128 x, __m128 y)
+{
+   return exp2f4(_mm_mul_ps(log2f4(x), y));
+}
+
+
 /**
  * Low-level instruction translators.
  */
@@ -610,38 +738,35 @@ cos4f(
 static void
 emit_cos(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save, 
       xmm_dst,
       cos4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 ex24f(
    float *store )
 {
-#if FAST_MATH
-   store[0] = util_fast_exp2( store[0] );
-   store[1] = util_fast_exp2( store[1] );
-   store[2] = util_fast_exp2( store[2] );
-   store[3] = util_fast_exp2( store[3] );
-#else
-   store[0] = powf( 2.0f, store[0] );
-   store[1] = powf( 2.0f, store[1] );
-   store[2] = powf( 2.0f, store[2] );
-   store[3] = powf( 2.0f, store[3] );
-#endif
+   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
 }
 
 static void
 emit_ex2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       ex24f );
 }
@@ -670,10 +795,12 @@ flr4f(
 static void
 emit_flr(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       flr4f );
 }
@@ -691,31 +818,35 @@ frc4f(
 static void
 emit_frc(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       frc4f );
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 lg24f(
    float *store )
 {
-   store[0] = util_fast_log2( store[0] );
-   store[1] = util_fast_log2( store[1] );
-   store[2] = util_fast_log2( store[2] );
-   store[3] = util_fast_log2( store[3] );
+   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
 }
 
 static void
 emit_lg2(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst )
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       lg24f );
 }
@@ -757,14 +888,14 @@ emit_neg(
 }
 
 static void PIPE_CDECL
+#if defined(PIPE_CC_GCC)
+__attribute__((force_align_arg_pointer))
+#endif
 pow4f(
    float *store )
 {
-#if FAST_MATH
-   store[0] = util_fast_pow( store[0], store[4] );
-   store[1] = util_fast_pow( store[1], store[5] );
-   store[2] = util_fast_pow( store[2], store[6] );
-   store[3] = util_fast_pow( store[3], store[7] );
+#if 1
+   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
 #else
    store[0] = powf( store[0], store[4] );
    store[1] = powf( store[1], store[5] );
@@ -776,11 +907,13 @@ pow4f(
 static void
 emit_pow(
    struct x86_function *func,
+   unsigned xmm_save, 
    unsigned xmm_dst,
    unsigned xmm_src )
 {
    emit_func_call_dst_src(
       func,
+      xmm_save,
       xmm_dst,
       xmm_src,
       pow4f );
@@ -873,10 +1006,12 @@ sin4f(
 
 static void
 emit_sin (struct x86_function *func,
+          unsigned xmm_save, 
           unsigned xmm_dst)
 {
    emit_func_call_dst(
       func,
+      xmm_save,
       xmm_dst,
       sin4f );
 }
@@ -1296,7 +1431,7 @@ emit_instruction(
                get_temp(
                   TGSI_EXEC_TEMP_MINUS_128_I,
                   TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 1, 2 );
+            emit_pow( func, 3, 1, 2 );
             FETCH( func, *inst, 0, 0, CHAN_X );
             sse_xorps(
                func,
@@ -1342,11 +1477,11 @@ emit_instruction(
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
             emit_MOV( func, 1, 0 );
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = ex2(floor(src.x)) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                emit_MOV( func, 2, 1 );
-               emit_ex2( func, 2 );
+               emit_ex2( func, 3, 2 );
                STORE( func, *inst, 2, 0, CHAN_X );
             }
             /* dst.y = src.x - floor(src.x) */
@@ -1358,7 +1493,7 @@ emit_instruction(
          }
          /* dst.z = ex2(src.x) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 0 );
+            emit_ex2( func, 3, 0 );
             STORE( func, *inst, 0, 0, CHAN_Z );
          }
       }
@@ -1376,21 +1511,21 @@ emit_instruction(
          FETCH( func, *inst, 0, 0, CHAN_X );
          emit_abs( func, 0 );
          emit_MOV( func, 1, 0 );
-         emit_lg2( func, 1 );
+         emit_lg2( func, 2, 1 );
          /* dst.z = lg2(abs(src.x)) */
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
             STORE( func, *inst, 1, 0, CHAN_Z );
          }
          if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
              IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 1 );
+            emit_flr( func, 2, 1 );
             /* dst.x = floor(lg2(abs(src.x))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
                STORE( func, *inst, 1, 0, CHAN_X );
             }
             /* dst.x = abs(src)/ex2(floor(lg2(abs(src.x)))) */
             if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 1 );
+               emit_ex2( func, 2, 1 );
                emit_rcp( func, 1, 1 );
                emit_mul( func, 0, 1 );
                STORE( func, *inst, 0, 0, CHAN_Y );
@@ -1580,7 +1715,7 @@ emit_instruction(
    /* TGSI_OPCODE_FRC */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0 );
+         emit_frc( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1593,7 +1728,7 @@ emit_instruction(
    /* TGSI_OPCODE_FLR */
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0 );
+         emit_flr( func, 0, 0 );
          STORE( func, *inst, 0, 0, chan_index );
       }
       break;
@@ -1605,7 +1740,7 @@ emit_instruction(
    case TGSI_OPCODE_EXPBASE2:
    /* TGSI_OPCODE_EX2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0 );
+      emit_ex2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1614,7 +1749,7 @@ emit_instruction(
    case TGSI_OPCODE_LOGBASE2:
    /* TGSI_OPCODE_LG2 */
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0 );
+      emit_lg2( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1624,7 +1759,7 @@ emit_instruction(
    /* TGSI_OPCODE_POW */
       FETCH( func, *inst, 0, 0, CHAN_X );
       FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 1 );
+      emit_pow( func, 0, 0, 1 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1715,7 +1850,7 @@ emit_instruction(
 
    case TGSI_OPCODE_COS:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0 );
+      emit_cos( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1774,7 +1909,7 @@ emit_instruction(
 
    case TGSI_OPCODE_SIN:
       FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0 );
+      emit_sin( func, 0, 0 );
       FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
          STORE( func, *inst, 0, 0, chan_index );
       }
@@ -1868,12 +2003,12 @@ emit_instruction(
    case TGSI_OPCODE_SCS:
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0 );
+         emit_cos( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_X );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
          FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0 );
+         emit_sin( func, 0, 0 );
          STORE( func, *inst, 0, 0, CHAN_Y );
       }
       IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
index b6cff281e6d..3ed8bdfdf33 100644
--- a/src/gallium/auxiliary/util/p_debug.c
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -306,6 +306,13 @@ debug_get_flags_option(const char *name,
    str = _debug_get_option(name);
    if(!str)
       result = dfault;
+   else if (!util_strcmp(str, "help")) {
+      result = dfault;
+      while (flags->name) {
+         debug_printf("%s: help for %s: %s [0x%lx]\n", __FUNCTION__, name, flags->name, flags->value);
+         flags++;
+      }
+   }
    else {
       result = 0;
       while( flags->name ) {
@@ -315,7 +322,12 @@ debug_get_flags_option(const char *name,
       }
    }
 
-   debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+   if (str) {
+      debug_printf("%s: %s = 0x%lx (%s)\n", __FUNCTION__, name, result, str);
+   }
+   else {
+      debug_printf("%s: %s = 0x%lx\n", __FUNCTION__, name, result);
+   }
 
    return result;
 }
diff --git a/src/gallium/auxiliary/util/p_debug_mem.c b/src/gallium/auxiliary/util/p_debug_mem.c
index ed18c6540ea..9511479cbbe 100644
--- a/src/gallium/auxiliary/util/p_debug_mem.c
+++ b/src/gallium/auxiliary/util/p_debug_mem.c
@@ -122,8 +122,12 @@ debug_malloc(const char *file, unsigned line, const char *function,
    struct debug_memory_footer *ftr;
    
    hdr = real_malloc(sizeof(*hdr) + size + sizeof(*ftr));
-   if(!hdr)
+   if(!hdr) {
+      debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n",
+                   file, line, function,
+                   (long unsigned)size);
       return NULL;
+   }
  
    hdr->no = last_no++;
    hdr->file = file;
@@ -219,8 +223,12 @@ debug_realloc(const char *file, unsigned line, const char *function,
 
    /* alloc new */
    new_hdr = real_malloc(sizeof(*new_hdr) + new_size + sizeof(*new_ftr));
-   if(!new_hdr)
+   if(!new_hdr) {
+      debug_printf("%s:%u:%s: out of memory when trying to allocate %lu bytes\n",
+                   file, line, function,
+                   (long unsigned)new_size);
       return NULL;
+   }
    new_hdr->no = old_hdr->no;
    new_hdr->file = old_hdr->file;
    new_hdr->line = old_hdr->line;
@@ -261,8 +269,19 @@ debug_memory_end(unsigned long start_no)
    for (; entry != &list; entry = entry->prev) {
       struct debug_memory_header *hdr;
       void *ptr;
+      struct debug_memory_footer *ftr;
+
       hdr = LIST_ENTRY(struct debug_memory_header, entry, head);
       ptr = data_from_header(hdr);
+      ftr = footer_from_header(hdr);
+
+      if(hdr->magic != DEBUG_MEMORY_MAGIC) {
+         debug_printf("%s:%u:%s: bad or corrupted memory %p\n",
+                      hdr->file, hdr->line, hdr->function,
+                      ptr);
+         debug_assert(0);
+      }
+
       if((start_no <= hdr->no && hdr->no < last_no) ||
 	 (last_no < start_no && (hdr->no < last_no || start_no <= hdr->no))) {
 	 debug_printf("%s:%u:%s: %u bytes at %p not freed\n",
@@ -270,7 +289,15 @@ debug_memory_end(unsigned long start_no)
 		      hdr->size, ptr);
 	 total_size += hdr->size;
       }
+
+      if(ftr->magic != DEBUG_MEMORY_MAGIC) {
+         debug_printf("%s:%u:%s: buffer overflow %p\n",
+                      hdr->file, hdr->line, hdr->function,
+                      ptr);
+         debug_assert(0);
+      }
    }
+
    if(total_size) {
       debug_printf("Total of %u KB of system memory apparently leaked\n",
 		   (total_size + 1023)/1024);
diff --git a/src/gallium/auxiliary/util/u_blit.c b/src/gallium/auxiliary/util/u_blit.c
index 9adf72944e6..d28201ac8dc 100644
--- a/src/gallium/auxiliary/util/u_blit.c
+++ b/src/gallium/auxiliary/util/u_blit.c
@@ -104,6 +104,7 @@ util_create_blit(struct pipe_context *pipe, struct cso_context *cso)
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
    ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
+   ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* samplers */
    memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/auxiliary/util/u_gen_mipmap.c b/src/gallium/auxiliary/util/u_gen_mipmap.c
index b19a649bbcd..9d305ad763a 100644
--- a/src/gallium/auxiliary/util/u_gen_mipmap.c
+++ b/src/gallium/auxiliary/util/u_gen_mipmap.c
@@ -725,6 +725,7 @@ util_create_gen_mipmap(struct pipe_context *pipe,
    ctx->rasterizer.cull_mode = PIPE_WINDING_NONE;
    ctx->rasterizer.bypass_clipping = 1;
    /*ctx->rasterizer.bypass_vs = 1;*/
+   ctx->rasterizer.gl_rasterization_rules = 1;
 
    /* sampler state */
    memset(&ctx->sampler, 0, sizeof(ctx->sampler));
diff --git a/src/gallium/auxiliary/util/u_math.h b/src/gallium/auxiliary/util/u_math.h
index 9b4ca393714..084655e6ddc 100644
--- a/src/gallium/auxiliary/util/u_math.h
+++ b/src/gallium/auxiliary/util/u_math.h
@@ -40,6 +40,7 @@
 
 
 #include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
 
 
 #ifdef __cplusplus
@@ -253,7 +254,7 @@ util_fast_exp2(float x)
 
 
 /**
- * Based on code from http://www.flipcode.com/totd/
+ * Based on code from http://www.flipcode.com/archives/Fast_log_Function.shtml
  */
 static INLINE float
 util_fast_log2(float val)
@@ -272,8 +273,10 @@ util_fast_log2(float val)
 static INLINE float
 util_fast_pow(float x, float y)
 {
-   /* XXX this test may need adjustment */
-   if (y >= 3.0 && -0.02f <= x && x <= 0.02f)
+   /* XXX these tests may need adjustment */
+   if (y >= 3.0f && (-0.02f <= x && x <= 0.02f))
+      return 0.0f;
+   if (y >= 50.0f && (-0.9f <= x && x <= 0.9f))
       return 0.0f;
    return util_fast_exp2(util_fast_log2(x) * y);
 }
diff --git a/src/gallium/auxiliary/util/u_sse.h b/src/gallium/auxiliary/util/u_sse.h
new file mode 100644
index 00000000000..0c8356cd057
--- /dev/null
+++ b/src/gallium/auxiliary/util/u_sse.h
@@ -0,0 +1,72 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * @file
+ * SSE intrinsics portability header.
+ * 
+ * Although the SSE intrinsics are support by all modern x86 and x86-64 
+ * compilers, there are some intrisincs missing in some implementations 
+ * (especially older MSVC versions). This header abstracts that away.
+ */
+
+#ifndef U_SSE_H_
+#define U_SSE_H_
+
+#include <xmmintrin.h>
+#include <emmintrin.h>
+
+
+/* MSVC before VC8 does not support the _mm_castxxx_yyy */
+#if defined(_MSC_VER) && _MSC_VER < 1500
+
+union __declspec(align(16)) m128_types {
+   __m128 m128;
+   __m128i m128i;
+   __m128d m128d;
+};
+
+static __inline __m128
+_mm_castsi128_ps(__m128i a)
+{
+   union m128_types u;
+   u.m128i = a;
+   return u.m128;
+}
+
+static __inline __m128i
+_mm_castps_si128(__m128 a)
+{
+   union m128_types u;
+   u.m128 = a;
+   return u.m128i;
+}
+
+#endif
+
+
+#endif /* U_SSE_H_ */
author	Ben Skeggs <[email protected]>	2008-09-30 11:57:45 +1000
committer	Ben Skeggs <[email protected]>	2008-09-30 11:57:45 +1000
commit	8cd046c9b1da31e4ff178816bdfe2ee3451a1553 (patch)
tree	6b9de732a9d75c3005a80b078af0c97b9cb6c0ab /src/gallium/auxiliary
parent	8c14bd6992e338da8e467775b0237c8db7cbe44d (diff)
parent	5dc8e67078be8b8c42a809311debd275ac7d64a7 (diff)