85 files changed, 26344 insertions, 0 deletions
diff --git a/src/gallium/auxiliary/Makefile b/src/gallium/auxiliary/Makefile
new file mode 100644
index 00000000000..da68498aa1f
--- /dev/null
+++ b/src/gallium/auxiliary/Makefile
@@ -0,0 +1,24 @@
+TOP = ../../..
+include $(TOP)/configs/current
+
+
+ifeq ($(CONFIG_NAME), linux-llvm)
+LLVM_DIR = llvm
+endif
+
+SUBDIRS = pipebuffer $(LLVM_DIR)
+
+
+default: subdirs
+
+
+subdirs:
+	@for dir in $(SUBDIRS) ; do \
+		if [ -d $$dir ] ; then \
+			(cd $$dir && $(MAKE)) || exit 1 ; \
+		fi \
+	done
+
+
+clean:
+	rm -f `find . -name \*.[oa]`
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.c b/src/gallium/auxiliary/cso_cache/cso_cache.c
new file mode 100644
index 00000000000..9e77e0774dd
--- /dev/null
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.c
@@ -0,0 +1,181 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/* Authors:  Zack Rusin <[email protected]>
+ */
+
+#include "cso_cache.h"
+#include "cso_hash.h"
+
+#if 1
+static unsigned hash_key(const void *key, unsigned key_size)
+{
+   unsigned *ikey = (unsigned *)key;
+   unsigned hash = 0, i;
+
+   assert(key_size % 4 == 0);
+
+   /* I'm sure this can be improved on:
+    */
+   for (i = 0; i < key_size/4; i++)
+      hash ^= ikey[i];
+
+   return hash;
+}
+#else
+static unsigned hash_key(const unsigned char *p, int n)
+{
+   unsigned h = 0;
+   unsigned g;
+
+   while (n--) {
+      h = (h << 4) + *p++;
+      if ((g = (h & 0xf0000000)) != 0)
+         h ^= g >> 23;
+      h &= ~g;
+   }
+   return h;
+}
+#endif
+
+unsigned cso_construct_key(void *item, int item_size)
+{
+   return hash_key((item), item_size);
+}
+
+static struct cso_hash *_cso_hash_for_type(struct cso_cache *sc, enum cso_cache_type type)
+{
+   struct cso_hash *hash = 0;
+
+   switch(type) {
+   case CSO_BLEND:
+      hash = sc->blend_hash;
+      break;
+   case CSO_SAMPLER:
+      hash = sc->sampler_hash;
+      break;
+   case CSO_DEPTH_STENCIL_ALPHA:
+      hash = sc->depth_stencil_hash;
+      break;
+   case CSO_RASTERIZER:
+      hash = sc->rasterizer_hash;
+      break;
+   case CSO_FRAGMENT_SHADER:
+      hash = sc->fs_hash;
+      break;
+   case CSO_VERTEX_SHADER:
+      hash = sc->vs_hash;
+      break;
+   }
+
+   return hash;
+}
+
+static int _cso_size_for_type(enum cso_cache_type type)
+{
+   switch(type) {
+   case CSO_BLEND:
+      return sizeof(struct pipe_blend_state);
+   case CSO_SAMPLER:
+      return sizeof(struct pipe_sampler_state);
+   case CSO_DEPTH_STENCIL_ALPHA:
+      return sizeof(struct pipe_depth_stencil_alpha_state);
+   case CSO_RASTERIZER:
+      return sizeof(struct pipe_rasterizer_state);
+   case CSO_FRAGMENT_SHADER:
+      return sizeof(struct pipe_shader_state);
+   case CSO_VERTEX_SHADER:
+      return sizeof(struct pipe_shader_state);
+   }
+   return 0;
+}
+
+struct cso_hash_iter
+cso_insert_state(struct cso_cache *sc,
+                 unsigned hash_key, enum cso_cache_type type,
+                 void *state)
+{
+   struct cso_hash *hash = _cso_hash_for_type(sc, type);
+   return cso_hash_insert(hash, hash_key, state);
+}
+
+struct cso_hash_iter
+cso_find_state(struct cso_cache *sc,
+               unsigned hash_key, enum cso_cache_type type)
+{
+   struct cso_hash *hash = _cso_hash_for_type(sc, type);
+
+   return cso_hash_find(hash, hash_key);
+}
+
+struct cso_hash_iter cso_find_state_template(struct cso_cache *sc,
+                                             unsigned hash_key, enum cso_cache_type type,
+                                             void *templ)
+{
+   struct cso_hash_iter iter = cso_find_state(sc, hash_key, type);
+   int size = _cso_size_for_type(type);
+   while (!cso_hash_iter_is_null(iter)) {
+      void *iter_data = cso_hash_iter_data(iter);
+      if (!memcmp(iter_data, templ, size))
+         return iter;
+      iter = cso_hash_iter_next(iter);
+   }
+   return iter;
+}
+
+void * cso_take_state(struct cso_cache *sc,
+                      unsigned hash_key, enum cso_cache_type type)
+{
+   struct cso_hash *hash = _cso_hash_for_type(sc, type);
+   return cso_hash_take(hash, hash_key);
+}
+
+struct cso_cache *cso_cache_create(void)
+{
+   struct cso_cache *sc = malloc(sizeof(struct cso_cache));
+
+   sc->blend_hash         = cso_hash_create();
+   sc->sampler_hash       = cso_hash_create();
+   sc->depth_stencil_hash = cso_hash_create();
+   sc->rasterizer_hash    = cso_hash_create();
+   sc->fs_hash            = cso_hash_create();
+   sc->vs_hash            = cso_hash_create();
+
+   return sc;
+}
+
+void cso_cache_delete(struct cso_cache *sc)
+{
+   assert(sc);
+   cso_hash_delete(sc->blend_hash);
+   cso_hash_delete(sc->sampler_hash);
+   cso_hash_delete(sc->depth_stencil_hash);
+   cso_hash_delete(sc->rasterizer_hash);
+   cso_hash_delete(sc->fs_hash);
+   cso_hash_delete(sc->vs_hash);
+   free(sc);
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_cache.h b/src/gallium/auxiliary/cso_cache/cso_cache.h
new file mode 100644
index 00000000000..116e2eaa2c8
--- /dev/null
+++ b/src/gallium/auxiliary/cso_cache/cso_cache.h
@@ -0,0 +1,107 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin <[email protected]>
+  */
+
+#ifndef CSO_CACHE_H
+#define CSO_CACHE_H
+
+#include "pipe/p_context.h"
+#include "pipe/p_state.h"
+
+
+struct cso_hash;
+
+struct cso_cache {
+   struct cso_hash *blend_hash;
+   struct cso_hash *depth_stencil_hash;
+   struct cso_hash *fs_hash;
+   struct cso_hash *vs_hash;
+   struct cso_hash *rasterizer_hash;
+   struct cso_hash *sampler_hash;
+};
+
+struct cso_blend {
+   struct pipe_blend_state state;
+   void   *data;
+};
+
+struct cso_depth_stencil_alpha {
+   struct pipe_depth_stencil_alpha_state state;
+   void *data;
+};
+
+struct cso_rasterizer {
+   struct pipe_rasterizer_state state;
+   void *data;
+};
+
+struct cso_fragment_shader {
+   struct pipe_shader_state state;
+   void *data;
+};
+
+struct cso_vertex_shader {
+   struct pipe_shader_state state;
+   void *data;
+};
+
+struct cso_sampler {
+   struct pipe_sampler_state state;
+   void *data;
+};
+
+
+enum cso_cache_type {
+   CSO_BLEND,
+   CSO_SAMPLER,
+   CSO_DEPTH_STENCIL_ALPHA,
+   CSO_RASTERIZER,
+   CSO_FRAGMENT_SHADER,
+   CSO_VERTEX_SHADER
+};
+
+unsigned cso_construct_key(void *item, int item_size);
+
+struct cso_cache *cso_cache_create(void);
+void cso_cache_delete(struct cso_cache *sc);
+
+struct cso_hash_iter cso_insert_state(struct cso_cache *sc,
+                                      unsigned hash_key, enum cso_cache_type type,
+                                      void *state);
+struct cso_hash_iter cso_find_state(struct cso_cache *sc,
+                                    unsigned hash_key, enum cso_cache_type type);
+struct cso_hash_iter cso_find_state_template(struct cso_cache *sc,
+                                             unsigned hash_key, enum cso_cache_type type,
+                                             void *templ);
+void * cso_take_state(struct cso_cache *sc, unsigned hash_key,
+                      enum cso_cache_type type);
+
+#endif
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.c b/src/gallium/auxiliary/cso_cache/cso_hash.c
new file mode 100644
index 00000000000..0338cb3b474
--- /dev/null
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.c
@@ -0,0 +1,388 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin <[email protected]>
+  */
+
+#include "cso_hash.h"
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <assert.h>
+
+#define MAX(a, b) ((a > b) ? (a) : (b))
+
+static const int MinNumBits = 4;
+
+static const unsigned char prime_deltas[] = {
+   0,  0,  1,  3,  1,  5,  3,  3,  1,  9,  7,  5,  3,  9, 25,  3,
+   1, 21,  3, 21,  7, 15,  9,  5,  3, 29, 15,  0,  0,  0,  0,  0
+};
+
+static int primeForNumBits(int numBits)
+{
+   return (1 << numBits) + prime_deltas[numBits];
+}
+
+/*
+    Returns the smallest integer n such that
+    primeForNumBits(n) >= hint.
+*/
+static int countBits(int hint)
+{
+   int numBits = 0;
+   int bits = hint;
+
+   while (bits > 1) {
+      bits >>= 1;
+      numBits++;
+   }
+
+   if (numBits >= (int)sizeof(prime_deltas)) {
+      numBits = sizeof(prime_deltas) - 1;
+   } else if (primeForNumBits(numBits) < hint) {
+      ++numBits;
+   }
+   return numBits;
+}
+
+struct cso_node {
+   struct cso_node *next;
+   unsigned key;
+   void *value;
+};
+
+struct cso_hash_data {
+   struct cso_node *fakeNext;
+   struct cso_node **buckets;
+   int size;
+   int nodeSize;
+   short userNumBits;
+   short numBits;
+   int numBuckets;
+};
+
+struct cso_hash {
+   union {
+      struct cso_hash_data *d;
+      struct cso_node      *e;
+   } data;
+};
+
+static void *cso_data_allocate_node(struct cso_hash_data *hash)
+{
+   return malloc(hash->nodeSize);
+}
+
+static void cso_data_free_node(struct cso_node *node)
+{
+   /* XXX still a leak here.
+    * Need to cast value ptr to original cso type, then free the
+    * driver-specific data hanging off of it.  For example:
+   struct cso_sampler *csamp = (struct cso_sampler *) node->value;
+   free(csamp->data);
+   */
+   free(node->value);
+   free(node);
+}
+
+static struct cso_node *
+cso_hash_create_node(struct cso_hash *hash,
+                      unsigned akey, void *avalue,
+                      struct cso_node **anextNode)
+{
+   struct cso_node *node = cso_data_allocate_node(hash->data.d);
+   node->key = akey;
+   node->value = avalue;
+
+   node->next = (struct cso_node*)(*anextNode);
+   *anextNode = node;
+   ++hash->data.d->size;
+   return node;
+}
+
+static void cso_data_rehash(struct cso_hash_data *hash, int hint)
+{
+   if (hint < 0) {
+      hint = countBits(-hint);
+      if (hint < MinNumBits)
+         hint = MinNumBits;
+      hash->userNumBits = hint;
+      while (primeForNumBits(hint) < (hash->size >> 1))
+         ++hint;
+   } else if (hint < MinNumBits) {
+      hint = MinNumBits;
+   }
+
+   if (hash->numBits != hint) {
+      struct cso_node *e = (struct cso_node *)(hash);
+      struct cso_node **oldBuckets = hash->buckets;
+      int oldNumBuckets = hash->numBuckets;
+      int  i = 0;
+
+      hash->numBits = hint;
+      hash->numBuckets = primeForNumBits(hint);
+      hash->buckets = malloc(sizeof(struct cso_node*) * hash->numBuckets);
+      for (i = 0; i < hash->numBuckets; ++i)
+         hash->buckets[i] = e;
+
+      for (i = 0; i < oldNumBuckets; ++i) {
+         struct cso_node *firstNode = oldBuckets[i];
+         while (firstNode != e) {
+            unsigned h = firstNode->key;
+            struct cso_node *lastNode = firstNode;
+            while (lastNode->next != e && lastNode->next->key == h)
+               lastNode = lastNode->next;
+
+            struct cso_node *afterLastNode = lastNode->next;
+            struct cso_node **beforeFirstNode = &hash->buckets[h % hash->numBuckets];
+            while (*beforeFirstNode != e)
+               beforeFirstNode = &(*beforeFirstNode)->next;
+            lastNode->next = *beforeFirstNode;
+            *beforeFirstNode = firstNode;
+            firstNode = afterLastNode;
+         }
+      }
+      free(oldBuckets);
+   }
+}
+
+static void cso_data_might_grow(struct cso_hash_data *hash)
+{
+   if (hash->size >= hash->numBuckets)
+      cso_data_rehash(hash, hash->numBits + 1);
+}
+
+static void cso_data_has_shrunk(struct cso_hash_data *hash)
+{
+   if (hash->size <= (hash->numBuckets >> 3) &&
+       hash->numBits > hash->userNumBits) {
+      int max = MAX(hash->numBits-2, hash->userNumBits);
+      cso_data_rehash(hash,  max);
+   }
+}
+
+static struct cso_node *cso_data_first_node(struct cso_hash_data *hash)
+{
+   struct cso_node *e = (struct cso_node *)(hash);
+   struct cso_node **bucket = hash->buckets;
+   int n = hash->numBuckets;
+   while (n--) {
+      if (*bucket != e)
+         return *bucket;
+      ++bucket;
+   }
+   return e;
+}
+
+static struct cso_node **cso_hash_find_node(struct cso_hash *hash, unsigned akey)
+{
+   struct cso_node **node;
+
+   if (hash->data.d->numBuckets) {
+      node = (struct cso_node **)(&hash->data.d->buckets[akey % hash->data.d->numBuckets]);
+      assert(*node == hash->data.e || (*node)->next);
+      while (*node != hash->data.e && (*node)->key != akey)
+         node = &(*node)->next;
+   } else {
+      node = (struct cso_node **)((const struct cso_node * const *)(&hash->data.e));
+   }
+   return node;
+}
+
+struct cso_hash_iter cso_hash_insert(struct cso_hash *hash,
+                                       unsigned key, void *data)
+{
+   cso_data_might_grow(hash->data.d);
+
+   struct cso_node **nextNode = cso_hash_find_node(hash, key);
+   struct cso_node *node = cso_hash_create_node(hash, key, data, nextNode);
+   struct cso_hash_iter iter = {hash, node};
+   return iter;
+}
+
+struct cso_hash * cso_hash_create(void)
+{
+   struct cso_hash *hash = malloc(sizeof(struct cso_hash));
+   hash->data.d = malloc(sizeof(struct cso_hash_data));
+   hash->data.d->fakeNext = 0;
+   hash->data.d->buckets = 0;
+   hash->data.d->size = 0;
+   hash->data.d->nodeSize = sizeof(struct cso_node);
+   hash->data.d->userNumBits = MinNumBits;
+   hash->data.d->numBits = 0;
+   hash->data.d->numBuckets = 0;
+
+   return hash;
+}
+
+void cso_hash_delete(struct cso_hash *hash)
+{
+   struct cso_node *e_for_x = (struct cso_node *)(hash->data.d);
+   struct cso_node **bucket = (struct cso_node **)(hash->data.d->buckets);
+   int n = hash->data.d->numBuckets;
+   while (n--) {
+      struct cso_node *cur = *bucket++;
+      while (cur != e_for_x) {
+         struct cso_node *next = cur->next;
+         cso_data_free_node(cur);
+         cur = next;
+      }
+   }
+   free(hash->data.d->buckets);
+   free(hash->data.d);
+   free(hash);
+}
+
+struct cso_hash_iter cso_hash_find(struct cso_hash *hash,
+                                     unsigned key)
+{
+   struct cso_node **nextNode = cso_hash_find_node(hash, key);
+   struct cso_hash_iter iter = {hash, *nextNode};
+   return iter;
+}
+
+unsigned cso_hash_iter_key(struct cso_hash_iter iter)
+{
+   if (!iter.node || iter.hash->data.e == iter.node)
+      return 0;
+   return iter.node->key;
+}
+
+void * cso_hash_iter_data(struct cso_hash_iter iter)
+{
+   if (!iter.node || iter.hash->data.e == iter.node)
+      return 0;
+   return iter.node->value;
+}
+
+static struct cso_node *cso_hash_data_next(struct cso_node *node)
+{
+   union {
+      struct cso_node *next;
+      struct cso_node *e;
+      struct cso_hash_data *d;
+   } a;
+   a.next = node->next;
+   if (!a.next) {
+      fprintf(stderr, "iterating beyond the last element\n");
+      return 0;
+   }
+   if (a.next->next)
+      return a.next;
+
+   int start = (node->key % a.d->numBuckets) + 1;
+   struct cso_node **bucket = a.d->buckets + start;
+   int n = a.d->numBuckets - start;
+   while (n--) {
+      if (*bucket != a.e)
+         return *bucket;
+      ++bucket;
+   }
+   return a.e;
+}
+
+
+static struct cso_node *cso_hash_data_prev(struct cso_node *node)
+{
+   union {
+      struct cso_node *e;
+      struct cso_hash_data *d;
+   } a;
+
+   a.e = node;
+   while (a.e->next)
+      a.e = a.e->next;
+
+   int start;
+   if (node == a.e)
+      start = a.d->numBuckets - 1;
+   else
+      start = node->key % a.d->numBuckets;
+
+   struct cso_node *sentinel = node;
+   struct cso_node **bucket = a.d->buckets + start;
+   while (start >= 0) {
+      if (*bucket != sentinel) {
+         struct cso_node *prev = *bucket;
+         while (prev->next != sentinel)
+            prev = prev->next;
+         return prev;
+      }
+
+      sentinel = a.e;
+      --bucket;
+      --start;
+   }
+   fprintf(stderr, "iterating backward beyond first element\n");
+   return a.e;
+}
+
+struct cso_hash_iter cso_hash_iter_next(struct cso_hash_iter iter)
+{
+   struct cso_hash_iter next = {iter.hash, cso_hash_data_next(iter.node)};
+   return next;
+}
+
+int cso_hash_iter_is_null(struct cso_hash_iter iter)
+{
+   if (!iter.node || iter.node == iter.hash->data.e)
+      return 1;
+   return 0;
+}
+
+void * cso_hash_take(struct cso_hash *hash,
+                      unsigned akey)
+{
+   struct cso_node **node = cso_hash_find_node(hash, akey);
+   if (*node != hash->data.e) {
+      void *t = (*node)->value;
+      struct cso_node *next = (*node)->next;
+      cso_data_free_node(*node);
+      *node = next;
+      --hash->data.d->size;
+      cso_data_has_shrunk(hash->data.d);
+      return t;
+   }
+   return 0;
+}
+
+struct cso_hash_iter cso_hash_iter_prev(struct cso_hash_iter iter)
+{
+   struct cso_hash_iter prev = {iter.hash,
+                                 cso_hash_data_prev(iter.node)};
+   return prev;
+}
+
+struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash)
+{
+   struct cso_hash_iter iter = {hash, cso_data_first_node(hash->data.d)};
+   return iter;
+}
diff --git a/src/gallium/auxiliary/cso_cache/cso_hash.h b/src/gallium/auxiliary/cso_cache/cso_hash.h
new file mode 100644
index 00000000000..b4aa111860f
--- /dev/null
+++ b/src/gallium/auxiliary/cso_cache/cso_hash.h
@@ -0,0 +1,62 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin <[email protected]>
+  */
+
+#ifndef CSO_HASH_H
+#define CSO_HASH_H
+
+struct cso_hash;
+struct cso_node;
+
+struct cso_hash_iter {
+   struct cso_hash *hash;
+   struct cso_node  *node;
+};
+
+struct cso_hash *cso_hash_create(void);
+void              cso_hash_delete(struct cso_hash *hash);
+
+struct cso_hash_iter cso_hash_insert(struct cso_hash *hash, unsigned key,
+                                     void *data);
+void  *cso_hash_take(struct cso_hash *hash, unsigned key);
+
+struct cso_hash_iter cso_hash_first_node(struct cso_hash *hash);
+struct cso_hash_iter cso_hash_find(struct cso_hash *hash, unsigned key);
+
+
+int       cso_hash_iter_is_null(struct cso_hash_iter iter);
+unsigned  cso_hash_iter_key(struct cso_hash_iter iter);
+void     *cso_hash_iter_data(struct cso_hash_iter iter);
+
+struct cso_hash_iter cso_hash_iter_next(struct cso_hash_iter iter);
+struct cso_hash_iter cso_hash_iter_prev(struct cso_hash_iter iter);
+
+#endif
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
new file mode 100644
index 00000000000..fe9b150f304
--- /dev/null
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -0,0 +1,2 @@
+default:
+	cd ../../../mesa ; make
diff --git a/src/gallium/auxiliary/draw/draw_clip.c b/src/gallium/auxiliary/draw/draw_clip.c
new file mode 100644
index 00000000000..e3051507eaf
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_clip.c
@@ -0,0 +1,488 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Clipping stage
+ *
+ * \author  Keith Whitwell <[email protected]>
+ */
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_context.h"
+#include "draw_private.h"
+
+
+#ifndef IS_NEGATIVE
+#define IS_NEGATIVE(X) ((X) < 0.0)
+#endif
+
+#ifndef DIFFERENT_SIGNS
+#define DIFFERENT_SIGNS(x, y) ((x) * (y) <= 0.0F && (x) - (y) != 0.0F)
+#endif
+
+#ifndef MAX_CLIPPED_VERTICES
+#define MAX_CLIPPED_VERTICES ((2 * (6 + PIPE_MAX_CLIP_PLANES))+1)
+#endif
+
+
+
+struct clipper {
+   struct draw_stage stage;      /**< base class */
+
+   /* Basically duplicate some of the flatshading logic here:
+    */
+   boolean flat;
+   uint num_color_attribs;
+   uint color_attribs[4];  /* front/back primary/secondary colors */
+
+   float (*plane)[4];
+};
+
+
+/* This is a bit confusing:
+ */
+static INLINE struct clipper *clipper_stage( struct draw_stage *stage )
+{
+   return (struct clipper *)stage;
+}
+
+
+#define LINTERP(T, OUT, IN) ((OUT) + (T) * ((IN) - (OUT)))
+
+
+/* All attributes are float[4], so this is easy:
+ */
+static void interp_attr( float *fdst,
+			 float t,
+			 const float *fin,
+			 const float *fout )
+{  
+   fdst[0] = LINTERP( t, fout[0], fin[0] );
+   fdst[1] = LINTERP( t, fout[1], fin[1] );
+   fdst[2] = LINTERP( t, fout[2], fin[2] );
+   fdst[3] = LINTERP( t, fout[3], fin[3] );
+}
+
+static void copy_colors( struct draw_stage *stage,
+			 struct vertex_header *dst,
+			 const struct vertex_header *src )
+{
+   const struct clipper *clipper = clipper_stage(stage);
+   uint i;
+   for (i = 0; i < clipper->num_color_attribs; i++) {
+      const uint attr = clipper->color_attribs[i];
+      COPY_4FV(dst->data[attr], src->data[attr]);
+   }
+}
+
+
+
+/* Interpolate between two vertices to produce a third.  
+ */
+static void interp( const struct clipper *clip,
+		    struct vertex_header *dst,
+		    float t,
+		    const struct vertex_header *out, 
+		    const struct vertex_header *in )
+{
+   const unsigned nr_attrs = clip->stage.draw->num_vs_outputs;
+   unsigned j;
+
+   /* Vertex header.
+    */
+   {
+      dst->clipmask = 0;
+      dst->edgeflag = 0;
+      dst->pad = 0;
+      dst->vertex_id = UNDEFINED_VERTEX_ID;
+   }
+
+   /* Clip coordinates:  interpolate normally
+    */
+   {
+      interp_attr(dst->clip, t, in->clip, out->clip);
+   }
+
+   /* Do the projective divide and insert window coordinates:
+    */
+   {
+      const float *pos = dst->clip;
+      const float *scale = clip->stage.draw->viewport.scale;
+      const float *trans = clip->stage.draw->viewport.translate;
+      const float oow = 1.0f / pos[3];
+
+      dst->data[0][0] = pos[0] * oow * scale[0] + trans[0];
+      dst->data[0][1] = pos[1] * oow * scale[1] + trans[1];
+      dst->data[0][2] = pos[2] * oow * scale[2] + trans[2];
+      dst->data[0][3] = oow;
+   }
+
+   /* Other attributes
+    * Note: start at 1 to skip winpos (data[0]) since we just computed
+    * it above.
+    */
+   for (j = 1; j < nr_attrs; j++) {
+      interp_attr(dst->data[j], t, in->data[j], out->data[j]);
+   }
+}
+
+
+static void emit_poly( struct draw_stage *stage,
+		       struct vertex_header **inlist,
+		       unsigned n,
+		       const struct prim_header *origPrim)
+{
+   struct prim_header header;
+   unsigned i;
+
+   /* later stages may need the determinant, but only the sign matters */
+   header.det = origPrim->det;
+
+   for (i = 2; i < n; i++) {
+      header.v[0] = inlist[i-1];
+      header.v[1] = inlist[i];
+      header.v[2] = inlist[0];	/* keep in v[2] for flatshading */
+	
+      {
+	 unsigned tmp1 = header.v[1]->edgeflag;
+	 unsigned tmp2 = header.v[2]->edgeflag;
+
+	 if (i != n-1) header.v[1]->edgeflag = 0;
+	 if (i != 2)   header.v[2]->edgeflag = 0;
+
+         header.edgeflags = ((header.v[0]->edgeflag << 0) | 
+                             (header.v[1]->edgeflag << 1) | 
+                             (header.v[2]->edgeflag << 2));
+
+	 stage->next->tri( stage->next, &header );
+
+	 header.v[1]->edgeflag = tmp1;
+	 header.v[2]->edgeflag = tmp2;
+      }
+   }
+}
+
+
+
+
+/* Clip a triangle against the viewport and user clip planes.
+ */
+static void
+do_clip_tri( struct draw_stage *stage, 
+	     struct prim_header *header,
+	     unsigned clipmask )
+{
+   struct clipper *clipper = clipper_stage( stage );
+   struct vertex_header *a[MAX_CLIPPED_VERTICES];
+   struct vertex_header *b[MAX_CLIPPED_VERTICES];
+   struct vertex_header **inlist = a;
+   struct vertex_header **outlist = b;
+   unsigned tmpnr = 0;
+   unsigned n = 3;
+   unsigned i;
+
+   inlist[0] = header->v[0];
+   inlist[1] = header->v[1];
+   inlist[2] = header->v[2];
+
+   while (clipmask && n >= 3) {
+      const unsigned plane_idx = ffs(clipmask)-1;
+      const float *plane = clipper->plane[plane_idx];
+      struct vertex_header *vert_prev = inlist[0];
+      float dp_prev = dot4( vert_prev->clip, plane );
+      unsigned outcount = 0;
+
+      clipmask &= ~(1<<plane_idx);
+
+      inlist[n] = inlist[0]; /* prevent rotation of vertices */
+
+      for (i = 1; i <= n; i++) {
+	 struct vertex_header *vert = inlist[i];
+
+	 float dp = dot4( vert->clip, plane );
+
+	 if (!IS_NEGATIVE(dp_prev)) {
+	    outlist[outcount++] = vert_prev;
+	 }
+
+	 if (DIFFERENT_SIGNS(dp, dp_prev)) {
+	    struct vertex_header *new_vert = clipper->stage.tmp[tmpnr++];
+	    outlist[outcount++] = new_vert;
+
+	    if (IS_NEGATIVE(dp)) {
+	       /* Going out of bounds.  Avoid division by zero as we
+		* know dp != dp_prev from DIFFERENT_SIGNS, above.
+		*/
+	       float t = dp / (dp - dp_prev);
+	       interp( clipper, new_vert, t, vert, vert_prev );
+	       
+	       /* Force edgeflag true in this case:
+		*/
+	       new_vert->edgeflag = 1;
+	    } else {
+	       /* Coming back in.
+		*/
+	       float t = dp_prev / (dp_prev - dp);
+	       interp( clipper, new_vert, t, vert_prev, vert );
+
+	       /* Copy starting vert's edgeflag:
+		*/
+	       new_vert->edgeflag = vert_prev->edgeflag;
+	    }
+	 }
+
+	 vert_prev = vert;
+	 dp_prev = dp;
+      }
+
+      {
+	 struct vertex_header **tmp = inlist;
+	 inlist = outlist;
+	 outlist = tmp;
+	 n = outcount;
+      }
+   }
+
+   /* If flat-shading, copy color to new provoking vertex.
+    */
+   if (clipper->flat && inlist[0] != header->v[2]) {
+      if (1) {
+	 inlist[0] = dup_vert(stage, inlist[0], tmpnr++);
+      }
+
+      copy_colors(stage, inlist[0], header->v[2]);
+   }
+
+
+
+   /* Emit the polygon as triangles to the setup stage:
+    */
+   if (n >= 3)
+      emit_poly( stage, inlist, n, header );
+}
+
+
+/* Clip a line against the viewport and user clip planes.
+ */
+static void
+do_clip_line( struct draw_stage *stage,
+	      struct prim_header *header,
+	      unsigned clipmask )
+{
+   const struct clipper *clipper = clipper_stage( stage );
+   struct vertex_header *v0 = header->v[0];
+   struct vertex_header *v1 = header->v[1];
+   const float *pos0 = v0->clip;
+   const float *pos1 = v1->clip;
+   float t0 = 0.0F;
+   float t1 = 0.0F;
+   struct prim_header newprim;
+
+   while (clipmask) {
+      const unsigned plane_idx = ffs(clipmask)-1;
+      const float *plane = clipper->plane[plane_idx];
+      const float dp0 = dot4( pos0, plane );
+      const float dp1 = dot4( pos1, plane );
+
+      if (dp1 < 0.0F) {
+	 float t = dp1 / (dp1 - dp0);
+         t1 = MAX2(t1, t);
+      } 
+
+      if (dp0 < 0.0F) {
+	 float t = dp0 / (dp0 - dp1);
+         t0 = MAX2(t0, t);
+      }
+
+      if (t0 + t1 >= 1.0F)
+	 return; /* discard */
+
+      clipmask &= ~(1 << plane_idx);  /* turn off this plane's bit */
+   }
+
+   if (v0->clipmask) {
+      interp( clipper, stage->tmp[0], t0, v0, v1 );
+
+      if (clipper->flat)
+	 copy_colors(stage, stage->tmp[0], v0);
+
+      newprim.v[0] = stage->tmp[0];
+   }
+   else {
+      newprim.v[0] = v0;
+   }
+
+   if (v1->clipmask) {
+      interp( clipper, stage->tmp[1], t1, v1, v0 );
+      newprim.v[1] = stage->tmp[1];
+   }
+   else {
+      newprim.v[1] = v1;
+   }
+
+   stage->next->line( stage->next, &newprim );
+}
+
+
+static void
+clip_point( struct draw_stage *stage, 
+	    struct prim_header *header )
+{
+   if (header->v[0]->clipmask == 0) 
+      stage->next->point( stage->next, header );
+}
+
+
+static void
+clip_line( struct draw_stage *stage,
+	   struct prim_header *header )
+{
+   unsigned clipmask = (header->v[0]->clipmask | 
+                        header->v[1]->clipmask);
+   
+   if (clipmask == 0) {
+      /* no clipping needed */
+      stage->next->line( stage->next, header );
+   }
+   else if ((header->v[0]->clipmask &
+             header->v[1]->clipmask) == 0) {
+      do_clip_line(stage, header, clipmask);
+   }
+   /* else, totally clipped */
+}
+
+
+static void
+clip_tri( struct draw_stage *stage,
+	  struct prim_header *header )
+{
+   unsigned clipmask = (header->v[0]->clipmask | 
+                        header->v[1]->clipmask | 
+                        header->v[2]->clipmask);
+   
+   if (clipmask == 0) {
+      /* no clipping needed */
+      stage->next->tri( stage->next, header );
+   }
+   else if ((header->v[0]->clipmask & 
+             header->v[1]->clipmask & 
+             header->v[2]->clipmask) == 0) {
+      do_clip_tri(stage, header, clipmask);
+   }
+}
+
+/* Update state.  Could further delay this until we hit the first
+ * primitive that really requires clipping.
+ */
+static void 
+clip_init_state( struct draw_stage *stage )
+{
+   struct clipper *clipper = clipper_stage( stage );
+
+   clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
+
+   if (clipper->flat) {
+      const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+      uint i;
+
+      clipper->num_color_attribs = 0;
+      for (i = 0; i < vs->num_outputs; i++) {
+	 if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+	     vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+	    clipper->color_attribs[clipper->num_color_attribs++] = i;
+	 }
+      }
+   }
+   
+   stage->tri = clip_tri;
+   stage->line = clip_line;
+}
+
+
+
+static void clip_first_tri( struct draw_stage *stage,
+			    struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->tri( stage, header );
+}
+
+static void clip_first_line( struct draw_stage *stage,
+			     struct prim_header *header )
+{
+   clip_init_state( stage );
+   stage->line( stage, header );
+}
+
+
+static void clip_flush( struct draw_stage *stage, 
+			     unsigned flags )
+{
+   stage->tri = clip_first_tri;
+   stage->line = clip_first_line;
+   stage->next->flush( stage->next, flags );
+}
+
+
+static void clip_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void clip_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Allocate a new clipper stage.
+ * \return pointer to new stage object
+ */
+struct draw_stage *draw_clip_stage( struct draw_context *draw )
+{
+   struct clipper *clipper = CALLOC_STRUCT(clipper);
+
+   draw_alloc_temp_verts( &clipper->stage, MAX_CLIPPED_VERTICES+1 );
+
+   clipper->stage.draw = draw;
+   clipper->stage.point = clip_point;
+   clipper->stage.line = clip_first_line;
+   clipper->stage.tri = clip_first_tri;
+   clipper->stage.flush = clip_flush;
+   clipper->stage.reset_stipple_counter = clip_reset_stipple_counter;
+   clipper->stage.destroy = clip_destroy;
+
+   clipper->plane = draw->plane;
+
+   return &clipper->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
new file mode 100644
index 00000000000..4be38303169
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -0,0 +1,293 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+
+#include "pipe/p_util.h"
+#include "draw_context.h"
+#include "draw_private.h"
+
+
+
+struct draw_context *draw_create( void )
+{
+   struct draw_context *draw = CALLOC_STRUCT( draw_context );
+
+#if defined(__i386__) || defined(__386__)
+   draw->use_sse = GETENV( "GALLIUM_NOSSE" ) == NULL;
+#else
+   draw->use_sse = FALSE;
+#endif
+
+   /* create pipeline stages */
+   draw->pipeline.wide      = draw_wide_stage( draw );
+   draw->pipeline.stipple   = draw_stipple_stage( draw );
+   draw->pipeline.unfilled  = draw_unfilled_stage( draw );
+   draw->pipeline.twoside   = draw_twoside_stage( draw );
+   draw->pipeline.offset    = draw_offset_stage( draw );
+   draw->pipeline.clip      = draw_clip_stage( draw );
+   draw->pipeline.flatshade = draw_flatshade_stage( draw );
+   draw->pipeline.cull      = draw_cull_stage( draw );
+   draw->pipeline.validate  = draw_validate_stage( draw );
+   draw->pipeline.first     = draw->pipeline.validate;
+
+   ASSIGN_4V( draw->plane[0], -1,  0,  0, 1 );
+   ASSIGN_4V( draw->plane[1],  1,  0,  0, 1 );
+   ASSIGN_4V( draw->plane[2],  0, -1,  0, 1 );
+   ASSIGN_4V( draw->plane[3],  0,  1,  0, 1 );
+   ASSIGN_4V( draw->plane[4],  0,  0,  1, 1 ); /* yes these are correct */
+   ASSIGN_4V( draw->plane[5],  0,  0, -1, 1 ); /* mesa's a bit wonky */
+   draw->nr_planes = 6;
+
+   /* Statically allocate maximum sized vertices for the cache - could be cleverer...
+    */
+   {
+      uint i;
+      const unsigned size = (MAX_VERTEX_SIZE + 0x0f) & ~0x0f;
+      char *tmp = align_malloc(Elements(draw->vcache.vertex) * size, 16);
+
+      for (i = 0; i < Elements(draw->vcache.vertex); i++)
+	 draw->vcache.vertex[i] = (struct vertex_header *)(tmp + i * size);
+   }
+
+   draw->shader_queue_flush = draw_vertex_shader_queue_flush;
+
+   draw->convert_wide_points = TRUE;
+   draw->convert_wide_lines = TRUE;
+
+   draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
+
+   draw_vertex_cache_invalidate( draw );
+   draw_set_mapped_element_buffer( draw, 0, NULL );
+
+   return draw;
+}
+
+
+void draw_destroy( struct draw_context *draw )
+{
+   draw->pipeline.wide->destroy( draw->pipeline.wide );
+   draw->pipeline.stipple->destroy( draw->pipeline.stipple );
+   draw->pipeline.unfilled->destroy( draw->pipeline.unfilled );
+   draw->pipeline.twoside->destroy( draw->pipeline.twoside );
+   draw->pipeline.offset->destroy( draw->pipeline.offset );
+   draw->pipeline.clip->destroy( draw->pipeline.clip );
+   draw->pipeline.flatshade->destroy( draw->pipeline.flatshade );
+   draw->pipeline.cull->destroy( draw->pipeline.cull );
+   draw->pipeline.validate->destroy( draw->pipeline.validate );
+   if (draw->pipeline.rasterize)
+      draw->pipeline.rasterize->destroy( draw->pipeline.rasterize );
+   tgsi_exec_machine_free_data(&draw->machine);
+   align_free( draw->vcache.vertex[0] ); /* Frees all the vertices. */
+   FREE( draw );
+}
+
+
+
+void draw_flush( struct draw_context *draw )
+{
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+}
+
+
+
+/**
+ * Register new primitive rasterization/rendering state.
+ * This causes the drawing pipeline to be rebuilt.
+ */
+void draw_set_rasterizer_state( struct draw_context *draw,
+                                const struct pipe_rasterizer_state *raster )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
+   draw->rasterizer = raster;
+}
+
+
+/** 
+ * Plug in the primitive rendering/rasterization stage (which is the last
+ * stage in the drawing pipeline).
+ * This is provided by the device driver.
+ */
+void draw_set_rasterize_stage( struct draw_context *draw,
+                               struct draw_stage *stage )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
+   draw->pipeline.rasterize = stage;
+}
+
+
+/**
+ * Set the draw module's clipping state.
+ */
+void draw_set_clip_state( struct draw_context *draw,
+                          const struct pipe_clip_state *clip )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
+   assert(clip->nr <= PIPE_MAX_CLIP_PLANES);
+   memcpy(&draw->plane[6], clip->ucp, clip->nr * sizeof(clip->ucp[0]));
+   draw->nr_planes = 6 + clip->nr;
+}
+
+
+/**
+ * Set the draw module's viewport state.
+ */
+void draw_set_viewport_state( struct draw_context *draw,
+                              const struct pipe_viewport_state *viewport )
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->viewport = *viewport; /* struct copy */
+}
+
+
+
+void
+draw_set_vertex_buffer(struct draw_context *draw,
+                       unsigned attr,
+                       const struct pipe_vertex_buffer *buffer)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   assert(attr < PIPE_ATTRIB_MAX);
+   draw->vertex_buffer[attr] = *buffer;
+}
+
+
+void
+draw_set_vertex_element(struct draw_context *draw,
+                        unsigned attr,
+                        const struct pipe_vertex_element *element)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   assert(attr < PIPE_ATTRIB_MAX);
+   draw->vertex_element[attr] = *element;
+}
+
+
+/**
+ * Tell drawing context where to find mapped vertex buffers.
+ */
+void
+draw_set_mapped_vertex_buffer(struct draw_context *draw,
+                              unsigned attr, const void *buffer)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   draw->user.vbuffer[attr] = buffer;
+}
+
+
+void
+draw_set_mapped_constant_buffer(struct draw_context *draw,
+                                const void *buffer)
+{
+   draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE/*STATE_CHANGE*/ );
+   draw->user.constants = buffer;
+}
+
+
+/**
+ * Tells the draw module whether to convert wide points (size != 1)
+ * into triangles.
+ */
+void
+draw_convert_wide_points(struct draw_context *draw, boolean enable)
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->convert_wide_points = enable;
+}
+
+
+/**
+ * Tells the draw module whether to convert wide lines (width != 1)
+ * into triangles.
+ */
+void
+draw_convert_wide_lines(struct draw_context *draw, boolean enable)
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+   draw->convert_wide_lines = enable;
+}
+
+
+/**
+ * Allocate space for temporary post-transform vertices, such as for clipping.
+ */
+void draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr )
+{
+   assert(!stage->tmp);
+
+   stage->nr_tmps = nr;
+
+   if (nr) {
+      ubyte *store = (ubyte *) MALLOC( MAX_VERTEX_SIZE * nr );
+      unsigned i;
+
+      stage->tmp = (struct vertex_header **) MALLOC( sizeof(struct vertex_header *) * nr );
+      
+      for (i = 0; i < nr; i++)
+	 stage->tmp[i] = (struct vertex_header *)(store + i * MAX_VERTEX_SIZE);
+   }
+}
+
+
+void draw_free_temp_verts( struct draw_stage *stage )
+{
+   if (stage->tmp) {
+      FREE( stage->tmp[0] );
+      FREE( stage->tmp );
+      stage->tmp = NULL;
+   }
+}
+
+
+boolean draw_use_sse(struct draw_context *draw)
+{
+   return (boolean) draw->use_sse;
+}
+
+
+void draw_reset_vertex_ids(struct draw_context *draw)
+{
+   struct draw_stage *stage = draw->pipeline.first;
+   
+   while (stage) {
+      unsigned i;
+
+      for (i = 0; i < stage->nr_tmps; i++)
+	 stage->tmp[i]->vertex_id = UNDEFINED_VERTEX_ID;
+
+      stage = stage->next;
+   }
+
+   draw_vertex_cache_reset_vertex_ids(draw);
+}
diff --git a/src/gallium/auxiliary/draw/draw_context.h b/src/gallium/auxiliary/draw/draw_context.h
new file mode 100644
index 00000000000..ddeb184497a
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_context.h
@@ -0,0 +1,142 @@
+
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Public interface into the drawing module.
+ */
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+
+#ifndef DRAW_CONTEXT_H
+#define DRAW_CONTEXT_H
+
+
+#include "pipe/p_state.h"
+
+
+struct vertex_buffer;
+struct vertex_info;
+struct draw_context;
+struct draw_stage;
+struct draw_vertex_shader;
+
+
+/**
+ * Clipmask flags
+ */
+/*@{*/
+#define CLIP_RIGHT_BIT   0x01
+#define CLIP_LEFT_BIT    0x02
+#define CLIP_TOP_BIT     0x04
+#define CLIP_BOTTOM_BIT  0x08
+#define CLIP_NEAR_BIT    0x10
+#define CLIP_FAR_BIT     0x20
+/*@}*/
+
+/**
+ * Bitshift for each clip flag
+ */
+/*@{*/
+#define CLIP_RIGHT_SHIFT 	0
+#define CLIP_LEFT_SHIFT 	1
+#define CLIP_TOP_SHIFT  	2
+#define CLIP_BOTTOM_SHIFT       3
+#define CLIP_NEAR_SHIFT  	4
+#define CLIP_FAR_SHIFT  	5
+/*@}*/
+
+
+struct draw_context *draw_create( void );
+
+void draw_destroy( struct draw_context *draw );
+
+void draw_set_viewport_state( struct draw_context *draw,
+                              const struct pipe_viewport_state *viewport );
+
+void draw_set_clip_state( struct draw_context *pipe,
+                          const struct pipe_clip_state *clip );
+
+void draw_set_rasterizer_state( struct draw_context *draw,
+                                const struct pipe_rasterizer_state *raster );
+
+void draw_set_rasterize_stage( struct draw_context *draw,
+                               struct draw_stage *stage );
+
+void draw_convert_wide_points(struct draw_context *draw, boolean enable);
+
+void draw_convert_wide_lines(struct draw_context *draw, boolean enable);
+
+
+struct draw_vertex_shader *
+draw_create_vertex_shader(struct draw_context *draw,
+                          const struct pipe_shader_state *shader);
+void draw_bind_vertex_shader(struct draw_context *draw,
+                             struct draw_vertex_shader *dvs);
+void draw_delete_vertex_shader(struct draw_context *draw,
+                               struct draw_vertex_shader *dvs);
+
+boolean draw_use_sse(struct draw_context *draw);
+
+void draw_set_vertex_buffer(struct draw_context *draw,
+			    unsigned attr,
+			    const struct pipe_vertex_buffer *buffer);
+
+void draw_set_vertex_element(struct draw_context *draw,
+			     unsigned attr,
+			     const struct pipe_vertex_element *element);
+
+void draw_set_mapped_element_buffer( struct draw_context *draw,
+                                     unsigned eltSize, void *elements );
+
+void draw_set_mapped_vertex_buffer(struct draw_context *draw,
+                                   unsigned attr, const void *buffer);
+
+void draw_set_mapped_constant_buffer(struct draw_context *draw,
+                                     const void *buffer);
+
+
+/***********************************************************************
+ * draw_prim.c 
+ */
+
+void draw_arrays(struct draw_context *draw, unsigned prim,
+		 unsigned start, unsigned count);
+
+void draw_flush(struct draw_context *draw);
+
+/***********************************************************************
+ * draw_debug.c 
+ */
+boolean draw_validate_prim( unsigned prim, unsigned length );
+unsigned draw_trim_prim( unsigned mode, unsigned count );
+
+
+
+#endif /* DRAW_CONTEXT_H */
diff --git a/src/gallium/auxiliary/draw/draw_cull.c b/src/gallium/auxiliary/draw/draw_cull.c
new file mode 100644
index 00000000000..8177b0ac86e
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_cull.c
@@ -0,0 +1,150 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Drawing stage for polygon culling
+ */
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "draw_private.h"
+
+
+struct cull_stage {
+   struct draw_stage stage;
+   unsigned winding;  /**< which winding(s) to cull (one of PIPE_WINDING_x) */
+};
+
+
+static INLINE struct cull_stage *cull_stage( struct draw_stage *stage )
+{
+   return (struct cull_stage *)stage;
+}
+
+
+
+
+static void cull_tri( struct draw_stage *stage,
+		      struct prim_header *header )
+{
+   /* Window coords: */
+   const float *v0 = header->v[0]->data[0];
+   const float *v1 = header->v[1]->data[0];
+   const float *v2 = header->v[2]->data[0];
+
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0] - v2[0];
+   const float ey = v0[1] - v2[1];
+   const float fx = v1[0] - v2[0];
+   const float fy = v1[1] - v2[1];
+   
+   /* det = cross(e,f).z */
+   header->det = ex * fy - ey * fx;
+
+   if (header->det != 0) {
+      /* if (det < 0 then Z points toward camera and triangle is 
+       * counter-clockwise winding.
+       */
+      unsigned winding = (header->det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
+
+      if ((winding & cull_stage(stage)->winding) == 0) {
+         /* triangle is not culled, pass to next stage */
+	 stage->next->tri( stage->next, header );
+      }
+   }
+}
+
+static void cull_first_tri( struct draw_stage *stage, 
+			    struct prim_header *header )
+{
+   struct cull_stage *cull = cull_stage(stage);
+
+   cull->winding = stage->draw->rasterizer->cull_mode;
+
+   stage->tri = cull_tri;
+   stage->tri( stage, header );
+}
+
+
+
+static void cull_line( struct draw_stage *stage,
+		       struct prim_header *header )
+{
+   stage->next->line( stage->next, header );
+}
+
+
+static void cull_point( struct draw_stage *stage,
+			struct prim_header *header )
+{
+   stage->next->point( stage->next, header );
+}
+
+
+static void cull_flush( struct draw_stage *stage, unsigned flags )
+{
+   stage->tri = cull_first_tri;
+   stage->next->flush( stage->next, flags );
+}
+
+static void cull_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void cull_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create a new polygon culling stage.
+ */
+struct draw_stage *draw_cull_stage( struct draw_context *draw )
+{
+   struct cull_stage *cull = CALLOC_STRUCT(cull_stage);
+
+   draw_alloc_temp_verts( &cull->stage, 0 );
+
+   cull->stage.draw = draw;
+   cull->stage.next = NULL;
+   cull->stage.point = cull_point;
+   cull->stage.line = cull_line;
+   cull->stage.tri = cull_first_tri;
+   cull->stage.flush = cull_flush;
+   cull->stage.reset_stipple_counter = cull_reset_stipple_counter;
+   cull->stage.destroy = cull_destroy;
+
+   return &cull->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_debug.c b/src/gallium/auxiliary/draw/draw_debug.c
new file mode 100644
index 00000000000..d6220b5f62c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_debug.c
@@ -0,0 +1,113 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+
+static void
+draw_prim_info(unsigned prim, unsigned *first, unsigned *incr)
+{
+   assert(prim >= PIPE_PRIM_POINTS);
+   assert(prim <= PIPE_PRIM_POLYGON);
+
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      *first = 1;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_LINES:
+      *first = 2;
+      *incr = 2;
+      break;
+   case PIPE_PRIM_LINE_STRIP:
+      *first = 2;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_LINE_LOOP:
+      *first = 2;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      *first = 3;
+      *incr = 3;
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      *first = 3;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_TRIANGLE_FAN:
+   case PIPE_PRIM_POLYGON:
+      *first = 3;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      *first = 4;
+      *incr = 4;
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      *first = 4;
+      *incr = 2;
+      break;
+   default:
+      assert(0);
+      *first = 1;
+      *incr = 1;
+      break;
+   }
+}
+
+
+unsigned 
+draw_trim_prim( unsigned mode, unsigned count )
+{
+   unsigned length, first, incr;
+
+   draw_prim_info( mode, &first, &incr );
+
+   if (count < first)
+      length = 0;
+   else
+      length = count - (count - first) % incr; 
+
+   return length;
+}
+
+
+boolean
+draw_validate_prim( unsigned mode, unsigned count )
+{
+   return (count > 0 &&
+           count == draw_trim_prim( mode, count ));
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_flatshade.c b/src/gallium/auxiliary/draw/draw_flatshade.c
new file mode 100644
index 00000000000..4398abbc60c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_flatshade.c
@@ -0,0 +1,205 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+/** subclass of draw_stage */
+struct flat_stage
+{
+   struct draw_stage stage;
+
+   uint num_color_attribs;
+   uint color_attribs[4];  /* front/back primary/secondary colors */
+};
+
+
+static INLINE struct flat_stage *
+flat_stage(struct draw_stage *stage)
+{
+   return (struct flat_stage *) stage;
+}
+
+
+/** Copy all the color attributes from 'src' vertex to 'dst' vertex */
+static INLINE void copy_colors( struct draw_stage *stage,
+                                struct vertex_header *dst,
+                                const struct vertex_header *src )
+{
+   const struct flat_stage *flat = flat_stage(stage);
+   uint i;
+   for (i = 0; i < flat->num_color_attribs; i++) {
+      const uint attr = flat->color_attribs[i];
+      COPY_4FV(dst->data[attr], src->data[attr]);
+   }
+}
+
+
+/** Copy all the color attributes from src vertex to dst0 & dst1 vertices */
+static INLINE void copy_colors2( struct draw_stage *stage,
+                                 struct vertex_header *dst0,
+                                 struct vertex_header *dst1,
+                                 const struct vertex_header *src )
+{
+   const struct flat_stage *flat = flat_stage(stage);
+   uint i;
+   for (i = 0; i < flat->num_color_attribs; i++) {
+      const uint attr = flat->color_attribs[i];
+      COPY_4FV(dst0->data[attr], src->data[attr]);
+      COPY_4FV(dst1->data[attr], src->data[attr]);
+   }
+}
+
+
+/**
+ * Flatshade tri.  Required for clipping and when unfilled tris are
+ * active, otherwise handled by hardware.
+ */
+static void flatshade_tri( struct draw_stage *stage,
+			   struct prim_header *header )
+{
+   struct prim_header tmp;
+
+   tmp.det = header->det;
+   tmp.edgeflags = header->edgeflags;
+   tmp.v[0] = dup_vert(stage, header->v[0], 0);
+   tmp.v[1] = dup_vert(stage, header->v[1], 1);
+   tmp.v[2] = header->v[2];
+
+   copy_colors2(stage, tmp.v[0], tmp.v[1], tmp.v[2]);
+   
+   stage->next->tri( stage->next, &tmp );
+}
+
+
+/**
+ * Flatshade line.  Required for clipping.
+ */
+static void flatshade_line( struct draw_stage *stage,
+			    struct prim_header *header )
+{
+   struct prim_header tmp;
+
+   tmp.v[0] = dup_vert(stage, header->v[0], 0);
+   tmp.v[1] = header->v[1];
+
+   copy_colors(stage, tmp.v[0], tmp.v[1]);
+   
+   stage->next->line( stage->next, &tmp );
+}
+
+
+static void flatshade_point( struct draw_stage *stage,
+                             struct prim_header *header )
+{
+   stage->next->point( stage->next, header );
+}
+
+
+static void flatshade_init_state( struct draw_stage *stage )
+{
+   struct flat_stage *flat = flat_stage(stage);
+   const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+   uint i;
+
+   /* Find which vertex shader outputs are colors, make a list */
+   flat->num_color_attribs = 0;
+   for (i = 0; i < vs->num_outputs; i++) {
+      if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR ||
+          vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+         flat->color_attribs[flat->num_color_attribs++] = i;
+      }
+   }
+
+   stage->line = flatshade_line;
+   stage->tri = flatshade_tri;
+}
+
+static void flatshade_first_tri( struct draw_stage *stage,
+				 struct prim_header *header )
+{
+   flatshade_init_state( stage );
+   stage->tri( stage, header );
+}
+
+static void flatshade_first_line( struct draw_stage *stage,
+				  struct prim_header *header )
+{
+   flatshade_init_state( stage );
+   stage->line( stage, header );
+}
+
+
+static void flatshade_flush( struct draw_stage *stage, 
+			     unsigned flags )
+{
+   stage->tri = flatshade_first_tri;
+   stage->line = flatshade_first_line;
+   stage->next->flush( stage->next, flags );
+}
+
+
+static void flatshade_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void flatshade_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create flatshading drawing stage.
+ */
+struct draw_stage *draw_flatshade_stage( struct draw_context *draw )
+{
+   struct flat_stage *flatshade = CALLOC_STRUCT(flat_stage);
+
+   draw_alloc_temp_verts( &flatshade->stage, 2 );
+
+   flatshade->stage.draw = draw;
+   flatshade->stage.next = NULL;
+   flatshade->stage.point = flatshade_point;
+   flatshade->stage.line = flatshade_first_line;
+   flatshade->stage.tri = flatshade_first_tri;
+   flatshade->stage.flush = flatshade_flush;
+   flatshade->stage.reset_stipple_counter = flatshade_reset_stipple_counter;
+   flatshade->stage.destroy = flatshade_destroy;
+
+   return &flatshade->stage;
+}
+
+
diff --git a/src/gallium/auxiliary/draw/draw_offset.c b/src/gallium/auxiliary/draw/draw_offset.c
new file mode 100644
index 00000000000..dbc676deae4
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_offset.c
@@ -0,0 +1,186 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  polygon offset state
+ *
+ * \author  Keith Whitwell <[email protected]>
+ * \author  Brian Paul
+ */
+
+#include "pipe/p_util.h"
+#include "draw_private.h"
+
+
+
+struct offset_stage {
+   struct draw_stage stage;
+
+   float scale;
+   float units;
+};
+
+
+
+static INLINE struct offset_stage *offset_stage( struct draw_stage *stage )
+{
+   return (struct offset_stage *) stage;
+}
+
+
+
+
+
+/**
+ * Offset tri Z.  Some hardware can handle this, but not usually when
+ * doing unfilled rendering.
+ */
+static void do_offset_tri( struct draw_stage *stage,
+			   struct prim_header *header )
+{
+   struct offset_stage *offset = offset_stage(stage);   
+   float inv_det = 1.0f / header->det;
+
+   /* Window coords:
+    */
+   float *v0 = header->v[0]->data[0];
+   float *v1 = header->v[1]->data[0];
+   float *v2 = header->v[2]->data[0];
+
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   float ex = v0[0] - v2[0];
+   float ey = v0[1] - v2[1];
+   float ez = v0[2] - v2[2];
+   float fx = v1[0] - v2[0];
+   float fy = v1[1] - v2[1];
+   float fz = v1[2] - v2[2];
+
+   /* (a,b) = cross(e,f).xy */
+   float a = ey*fz - ez*fy;
+   float b = ez*fx - ex*fz;
+
+   float dzdx = FABSF(a * inv_det);
+   float dzdy = FABSF(b * inv_det);
+
+   float zoffset = offset->units + MAX2(dzdx, dzdy) * offset->scale;
+
+   /*
+    * Note: we're applying the offset and clamping per-vertex.
+    * Ideally, the offset is applied per-fragment prior to fragment shading.
+    */
+   v0[2] = CLAMP(v0[2] + zoffset, 0.0f, 1.0f);
+   v1[2] = CLAMP(v1[2] + zoffset, 0.0f, 1.0f);
+   v2[2] = CLAMP(v2[2] + zoffset, 0.0f, 1.0f);
+
+   stage->next->tri( stage->next, header );
+}
+
+
+static void offset_tri( struct draw_stage *stage,
+			struct prim_header *header )
+{
+   struct prim_header tmp;
+
+   tmp.det = header->det;
+   tmp.edgeflags = header->edgeflags;
+   tmp.v[0] = dup_vert(stage, header->v[0], 0);
+   tmp.v[1] = dup_vert(stage, header->v[1], 1);
+   tmp.v[2] = dup_vert(stage, header->v[2], 2);
+
+   do_offset_tri( stage, &tmp );
+}
+
+
+static void offset_first_tri( struct draw_stage *stage, 
+			      struct prim_header *header )
+{
+   struct offset_stage *offset = offset_stage(stage);
+   float mrd = 1.0f / 65535.0f; /* XXX this depends on depthbuffer bits! */
+
+   offset->units = stage->draw->rasterizer->offset_units * mrd;
+   offset->scale = stage->draw->rasterizer->offset_scale;
+
+   stage->tri = offset_tri;
+   stage->tri( stage, header );
+}
+
+
+static void offset_line( struct draw_stage *stage,
+			 struct prim_header *header )
+{
+   stage->next->line( stage->next, header );
+}
+
+
+static void offset_point( struct draw_stage *stage,
+			  struct prim_header *header )
+{
+   stage->next->point( stage->next, header );
+}
+
+
+static void offset_flush( struct draw_stage *stage,
+			  unsigned flags )
+{
+   stage->tri = offset_first_tri;
+   stage->next->flush( stage->next, flags );
+}
+
+
+static void offset_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void offset_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create polygon offset drawing stage.
+ */
+struct draw_stage *draw_offset_stage( struct draw_context *draw )
+{
+   struct offset_stage *offset = CALLOC_STRUCT(offset_stage);
+
+   draw_alloc_temp_verts( &offset->stage, 3 );
+
+   offset->stage.draw = draw;
+   offset->stage.next = NULL;
+   offset->stage.point = offset_point;
+   offset->stage.line = offset_line;
+   offset->stage.tri = offset_first_tri;
+   offset->stage.flush = offset_flush;
+   offset->stage.reset_stipple_counter = offset_reset_stipple_counter;
+   offset->stage.destroy = offset_destroy;
+
+   return &offset->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_prim.c b/src/gallium/auxiliary/draw/draw_prim.c
new file mode 100644
index 00000000000..51e2242719a
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_prim.c
@@ -0,0 +1,482 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "pipe/p_debug.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+
+#define RP_NONE  0
+#define RP_POINT 1
+#define RP_LINE  2
+#define RP_TRI   3
+
+
+static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {
+   RP_POINT,
+   RP_LINE,
+   RP_LINE,
+   RP_LINE,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI,
+   RP_TRI
+};
+
+
+static void draw_prim_queue_flush( struct draw_context *draw )
+{
+   unsigned i;
+
+   if (0)
+      debug_printf("Flushing with %d prims, %d verts\n",
+                   draw->pq.queue_nr, draw->vs.queue_nr);
+
+   assert (draw->pq.queue_nr != 0);
+
+   /* NOTE: we cannot save draw->pipeline->first in a local var because
+    * draw->pipeline->first is often changed by the first call to tri(),
+    * line(), etc.
+    */
+   if (draw->rasterizer->line_stipple_enable) {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) {
+	    if (draw->pq.queue[i].reset_line_stipple)
+	       draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	    
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 }
+	 break;
+      case RP_POINT:
+	 draw->pipeline.first->reset_stipple_counter( draw->pipeline.first );
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      }
+   }
+   else {
+      switch (draw->reduced_prim) {
+      case RP_TRI:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->tri( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_LINE:
+	 for (i = 0; i < draw->pq.queue_nr; i++) 
+	    draw->pipeline.first->line( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      case RP_POINT:
+	 for (i = 0; i < draw->pq.queue_nr; i++)
+	    draw->pipeline.first->point( draw->pipeline.first, &draw->pq.queue[i] );
+	 break;
+      }
+   }
+
+   draw->pq.queue_nr = 0;   
+   draw_vertex_cache_unreference( draw );
+}
+
+
+
+void draw_do_flush( struct draw_context *draw, unsigned flags )
+{
+   if (0)
+      debug_printf("Flushing with %d verts, %d prims\n",
+                   draw->vs.queue_nr,
+                   draw->pq.queue_nr );
+
+
+   if (flags >= DRAW_FLUSH_SHADER_QUEUE) {
+      if (draw->vs.queue_nr)
+         (*draw->shader_queue_flush)(draw);
+
+      if (flags >= DRAW_FLUSH_PRIM_QUEUE) {
+	 if (draw->pq.queue_nr)
+	    draw_prim_queue_flush(draw);
+
+	 if (flags >= DRAW_FLUSH_VERTEX_CACHE) {
+            draw_vertex_cache_invalidate(draw);
+
+	    if (flags >= DRAW_FLUSH_STATE_CHANGE) {
+               draw->pipeline.first->flush( draw->pipeline.first, flags );
+               draw->pipeline.first = draw->pipeline.validate;
+               draw->reduced_prim = ~0;
+	    }
+	 }
+      }    
+   }
+}
+
+
+
+/* Return a pointer to a freshly queued primitive header.  Ensure that
+ * there is room in the vertex cache for a maximum of "nr_verts" new
+ * vertices.  Flush primitive and/or vertex queues if necessary to
+ * make space.
+ */
+static struct prim_header *get_queued_prim( struct draw_context *draw,
+					    unsigned nr_verts )
+{
+   if (!draw_vertex_cache_check_space( draw, nr_verts )) {
+//      debug_printf("v");
+      draw_do_flush( draw, DRAW_FLUSH_VERTEX_CACHE );
+   }
+   else if (draw->pq.queue_nr == PRIM_QUEUE_LENGTH) {
+//      debug_printf("p");
+      draw_do_flush( draw, DRAW_FLUSH_PRIM_QUEUE );
+   }
+
+   assert(draw->pq.queue_nr < PRIM_QUEUE_LENGTH);
+
+   return &draw->pq.queue[draw->pq.queue_nr++];
+}
+
+
+
+/**
+ * Add a point to the primitive queue.
+ * \param i0  index into user's vertex arrays
+ */
+static void do_point( struct draw_context *draw,
+		      unsigned i0 )
+{
+   struct prim_header *prim = get_queued_prim( draw, 1 );
+   
+   prim->reset_line_stipple = 0;
+   prim->edgeflags = 1;
+   prim->pad = 0;
+   prim->v[0] = draw->vcache.get_vertex( draw, i0 );
+}
+
+
+/**
+ * Add a line to the primitive queue.
+ * \param i0  index into user's vertex arrays
+ * \param i1  index into user's vertex arrays
+ */
+static void do_line( struct draw_context *draw,
+		     boolean reset_stipple,
+		     unsigned i0,
+		     unsigned i1 )
+{
+   struct prim_header *prim = get_queued_prim( draw, 2 );
+   
+   prim->reset_line_stipple = reset_stipple;
+   prim->edgeflags = 1;
+   prim->pad = 0;
+   prim->v[0] = draw->vcache.get_vertex( draw, i0 );
+   prim->v[1] = draw->vcache.get_vertex( draw, i1 );
+}
+
+/**
+ * Add a triangle to the primitive queue.
+ */
+static void do_triangle( struct draw_context *draw,
+			 unsigned i0,
+			 unsigned i1,
+			 unsigned i2 )
+{
+   struct prim_header *prim = get_queued_prim( draw, 3 );
+   
+   prim->reset_line_stipple = 1;
+   prim->edgeflags = ~0;
+   prim->pad = 0;
+   prim->v[0] = draw->vcache.get_vertex( draw, i0 );
+   prim->v[1] = draw->vcache.get_vertex( draw, i1 );
+   prim->v[2] = draw->vcache.get_vertex( draw, i2 );
+}
+			  
+static void do_ef_triangle( struct draw_context *draw,
+			    boolean reset_stipple,
+			    unsigned ef_mask,
+			    unsigned i0,
+			    unsigned i1,
+			    unsigned i2 )
+{
+   struct prim_header *prim = get_queued_prim( draw, 3 );
+   struct vertex_header *v0 = draw->vcache.get_vertex( draw, i0 );
+   struct vertex_header *v1 = draw->vcache.get_vertex( draw, i1 );
+   struct vertex_header *v2 = draw->vcache.get_vertex( draw, i2 );
+
+   prim->reset_line_stipple = reset_stipple;
+
+   prim->edgeflags = ef_mask & ((v0->edgeflag << 0) | 
+				(v1->edgeflag << 1) | 
+				(v2->edgeflag << 2));
+   prim->pad = 0;
+   prim->v[0] = v0;
+   prim->v[1] = v1;
+   prim->v[2] = v2;
+}
+
+
+static void do_ef_quad( struct draw_context *draw,
+		     unsigned v0,
+		     unsigned v1,
+		     unsigned v2,
+		     unsigned v3 )
+{
+   const unsigned omitEdge2 = ~(1 << 1);
+   const unsigned omitEdge3 = ~(1 << 2);
+   do_ef_triangle( draw, 1, omitEdge2, v0, v1, v3 );
+   do_ef_triangle( draw, 0, omitEdge3, v1, v2, v3 );
+}
+
+static void do_quad( struct draw_context *draw,
+		     unsigned v0,
+		     unsigned v1,
+		     unsigned v2,
+		     unsigned v3 )
+{
+   do_triangle( draw, v0, v1, v3 );
+   do_triangle( draw, v1, v2, v3 );
+}
+
+
+/**
+ * Main entrypoint to draw some number of points/lines/triangles
+ */
+static void
+draw_prim( struct draw_context *draw, 
+	   unsigned prim, unsigned start, unsigned count )
+{
+   unsigned i;
+   boolean unfilled = (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+		       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL);
+
+//   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count );
+
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < count; i ++) {
+	 do_point( draw,
+		   start + i );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 0; i+1 < count; i += 2) {
+	 do_line( draw, 
+		  TRUE,
+		  start + i + 0,
+		  start + i + 1);
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:  
+      if (count >= 2) {
+	 for (i = 1; i < count; i++) {
+	    do_line( draw, 
+		     i == 1, 	/* XXX: only if vb not split */
+		     start + i - 1,
+		     start + i );
+	 }
+
+	 do_line( draw, 
+		  0,
+		  start + count - 1,
+		  start + 0 );
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      for (i = 1; i < count; i++) {
+	 do_line( draw,
+		  i == 1,
+		  start + i - 1,
+		  start + i );
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      if (unfilled) {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_ef_triangle( draw,
+			    1, 
+			    ~0,
+			    start + i + 0,
+			    start + i + 1,
+			    start + i + 2 );
+	 }
+      } 
+      else {
+	 for (i = 0; i+2 < count; i += 3) {
+	    do_triangle( draw,
+			 start + i + 0,
+			 start + i + 1,
+			 start + i + 2 );
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      for (i = 0; i+2 < count; i++) {
+	 if (i & 1) {
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 0,
+			 start + i + 2 );
+	 }
+	 else {
+	    do_triangle( draw,
+			 start + i + 0,
+			 start + i + 1,
+			 start + i + 2 );
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (count >= 3) {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + 0,
+			 start + i + 1,
+			 start + i + 2 );
+	 }
+      }
+      break;
+
+
+   case PIPE_PRIM_QUADS:
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_ef_quad( draw,
+			start + i + 0,
+			start + i + 1,
+			start + i + 2,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 4) {
+	    do_quad( draw,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 2,
+		     start + i + 3);
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      if (unfilled) {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_ef_quad( draw,
+			start + i + 2,
+			start + i + 0,
+			start + i + 1,
+			start + i + 3);
+	 }
+      }
+      else {
+	 for (i = 0; i+3 < count; i += 2) {
+	    do_quad( draw,
+		     start + i + 2,
+		     start + i + 0,
+		     start + i + 1,
+		     start + i + 3);
+	 }
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      if (unfilled) {
+	 unsigned ef_mask = (1<<2) | (1<<0);
+
+	 for (i = 0; i+2 < count; i++) {
+
+            if (i + 3 >= count)
+	       ef_mask |= (1<<1);
+
+	    do_ef_triangle( draw,
+			    i == 0,
+			    ef_mask,
+			    start + i + 1,
+			    start + i + 2,
+			    start + 0);
+
+	    ef_mask &= ~(1<<2);
+	 }
+      }
+      else {
+	 for (i = 0; i+2 < count; i++) {
+	    do_triangle( draw,
+			 start + i + 1,
+			 start + i + 2,
+			 start + 0);
+	 }
+      }
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+}
+
+
+
+
+/**
+ * Draw vertex arrays
+ * This is the main entrypoint into the drawing module.
+ * \param prim  one of PIPE_PRIM_x
+ * \param start  index of first vertex to draw
+ * \param count  number of vertices to draw
+ */
+void
+draw_arrays(struct draw_context *draw, unsigned prim,
+            unsigned start, unsigned count)
+{
+   if (reduced_prim[prim] != draw->reduced_prim) {
+      draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+      draw->reduced_prim = reduced_prim[prim];
+   }
+
+   /* drawing done here: */
+   draw_prim(draw, prim, start, count);
+}
+
+
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
new file mode 100644
index 00000000000..bc11259cb21
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -0,0 +1,354 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Private data structures, etc for the draw module.
+ */
+
+
+/**
+ * Authors:
+ * Keith Whitwell <[email protected]>
+ * Brian Paul
+ */
+
+
+#ifndef DRAW_PRIVATE_H
+#define DRAW_PRIVATE_H
+
+
+#include "pipe/p_state.h"
+#include "pipe/p_defines.h"
+
+#include "x86/rtasm/x86sse.h"
+#include "tgsi/exec/tgsi_exec.h"
+
+
+struct gallivm_prog;
+struct gallivm_cpu_engine;
+
+/**
+ * Basic vertex info.
+ * Carry some useful information around with the vertices in the prim pipe.  
+ */
+struct vertex_header {
+   unsigned clipmask:12;
+   unsigned edgeflag:1;
+   unsigned pad:3;
+   unsigned vertex_id:16;
+
+   float clip[4];
+
+   float data[][4];		/* Note variable size */
+};
+
+/* NOTE: It should match vertex_id size above */
+#define UNDEFINED_VERTEX_ID 0xffff
+
+/* XXX This is too large */
+#define MAX_VERTEX_SIZE ((2 + PIPE_MAX_SHADER_OUTPUTS) * 4 * sizeof(float))
+
+
+
+/**
+ * Basic info for a point/line/triangle primitive.
+ */
+struct prim_header {
+   float det;                 /**< front/back face determinant */
+   unsigned reset_line_stipple:1;
+   unsigned edgeflags:3;
+   unsigned pad:28;
+   struct vertex_header *v[3];  /**< 1 to 3 vertex pointers */
+};
+
+
+
+struct draw_context;
+
+/**
+ * Base class for all primitive drawing stages.
+ */
+struct draw_stage
+{
+   struct draw_context *draw;   /**< parent context */
+
+   struct draw_stage *next;     /**< next stage in pipeline */
+
+   struct vertex_header **tmp;  /**< temp vert storage, such as for clipping */
+   unsigned nr_tmps;
+
+   void (*point)( struct draw_stage *,
+		  struct prim_header * );
+
+   void (*line)( struct draw_stage *,
+		 struct prim_header * );
+
+   void (*tri)( struct draw_stage *,
+		struct prim_header * );
+
+   void (*flush)( struct draw_stage *,
+		  unsigned flags );
+
+   void (*reset_stipple_counter)( struct draw_stage * );
+
+   void (*destroy)( struct draw_stage * );
+};
+
+
+#define PRIM_QUEUE_LENGTH      16
+#define VCACHE_SIZE            32
+#define VCACHE_OVERFLOW        4
+#define VS_QUEUE_LENGTH        (VCACHE_SIZE + VCACHE_OVERFLOW + 1)	/* can never fill up */
+
+/**
+ * Private version of the compiled vertex_shader
+ */
+struct draw_vertex_shader {
+
+   /* This member will disappear shortly:
+    */
+   const struct pipe_shader_state   *state;
+
+   void (*prepare)( struct draw_vertex_shader *shader,
+		    struct draw_context *draw );
+
+   /* Run the shader - this interface will get cleaned up in the
+    * future:
+    */
+   void (*run)( struct draw_vertex_shader *shader,
+		struct draw_context *draw,
+		const unsigned *elts, 
+		unsigned count,
+		struct vertex_header *vOut[] );
+		    
+
+   void (*delete)( struct draw_vertex_shader * );
+};
+
+
+/* Internal function for vertex fetch.
+ */
+typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*full_fetch_func)( struct draw_context *draw,
+				 struct tgsi_exec_machine *machine,
+				 const unsigned *elts,
+				 unsigned count );
+
+
+
+/**
+ * Private context for the drawing module.
+ */
+struct draw_context
+{
+   /** Drawing/primitive pipeline stages */
+   struct {
+      struct draw_stage *first;  /**< one of the following */
+
+      struct draw_stage *validate; 
+
+      /* stages (in logical order) */
+      struct draw_stage *flatshade;
+      struct draw_stage *clip;
+      struct draw_stage *cull;
+      struct draw_stage *twoside;
+      struct draw_stage *offset;
+      struct draw_stage *unfilled;
+      struct draw_stage *stipple;
+      struct draw_stage *wide;
+      struct draw_stage *rasterize;
+   } pipeline;
+
+   /* pipe state that we need: */
+   const struct pipe_rasterizer_state *rasterizer;
+   struct pipe_viewport_state viewport;
+   struct pipe_vertex_buffer vertex_buffer[PIPE_ATTRIB_MAX];
+   struct pipe_vertex_element vertex_element[PIPE_ATTRIB_MAX];
+   struct draw_vertex_shader *vertex_shader;
+
+   uint num_vs_outputs;  /**< convenience, from vertex_shader */
+
+   /* user-space vertex data, buffers */
+   struct {
+      /** vertex element/index buffer (ex: glDrawElements) */
+      const void *elts;
+      /** bytes per index (0, 1, 2 or 4) */
+      unsigned eltSize;
+
+      /** vertex arrays */
+      const void *vbuffer[PIPE_ATTRIB_MAX];
+
+      /** constant buffer (for vertex shader) */
+      const void *constants;
+   } user;
+
+   /* Clip derived state:
+    */
+   float plane[12][4];
+   unsigned nr_planes;
+
+   boolean convert_wide_points; /**< convert wide points to tris? */
+   boolean convert_wide_lines;  /**< convert side lines to tris? */
+   boolean use_sse;
+
+   unsigned reduced_prim;
+
+   /** TGSI program interpreter runtime state */
+   struct tgsi_exec_machine machine;
+
+   /* Vertex fetch internal state
+    */
+   struct {
+      const ubyte *src_ptr[PIPE_ATTRIB_MAX];
+      unsigned pitch[PIPE_ATTRIB_MAX];
+      fetch_func fetch[PIPE_ATTRIB_MAX];
+      unsigned nr_attrs;
+      full_fetch_func fetch_func;
+   } vertex_fetch;
+
+   /* Post-tnl vertex cache:
+    */
+   struct {
+      unsigned referenced;  /**< bitfield */
+      unsigned idx[VCACHE_SIZE + VCACHE_OVERFLOW];
+      struct vertex_header *vertex[VCACHE_SIZE + VCACHE_OVERFLOW];
+      unsigned overflow;
+
+      /** To find space in the vertex cache: */
+      struct vertex_header *(*get_vertex)( struct draw_context *draw,
+                                           unsigned i );
+   } vcache;
+
+   /* Vertex shader queue:
+    */
+   struct {
+      struct {
+	 unsigned elt;   /**< index into the user's vertex arrays */
+	 struct vertex_header *dest; /**< points into vcache.vertex[] array */
+      } queue[VS_QUEUE_LENGTH];
+      unsigned queue_nr;
+   } vs;
+
+   /**
+    * Run the vertex shader on all vertices in the vertex queue.
+    */
+   void (*shader_queue_flush)(struct draw_context *draw);
+
+   /* Prim pipeline queue:
+    */
+   struct {
+      /* Need to queue up primitives until their vertices have been
+       * transformed by a vs queue flush.
+       */
+      struct prim_header queue[PRIM_QUEUE_LENGTH];
+      unsigned queue_nr;
+   } pq;
+
+
+   /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private.
+    */
+   struct gallivm_cpu_engine *engine;   
+   void *driver_private;
+};
+
+
+
+extern struct draw_stage *draw_unfilled_stage( struct draw_context *context );
+extern struct draw_stage *draw_twoside_stage( struct draw_context *context );
+extern struct draw_stage *draw_offset_stage( struct draw_context *context );
+extern struct draw_stage *draw_clip_stage( struct draw_context *context );
+extern struct draw_stage *draw_flatshade_stage( struct draw_context *context );
+extern struct draw_stage *draw_cull_stage( struct draw_context *context );
+extern struct draw_stage *draw_stipple_stage( struct draw_context *context );
+extern struct draw_stage *draw_wide_stage( struct draw_context *context );
+extern struct draw_stage *draw_validate_stage( struct draw_context *context );
+
+
+extern void draw_free_temp_verts( struct draw_stage *stage );
+
+extern void draw_alloc_temp_verts( struct draw_stage *stage, unsigned nr );
+
+extern void draw_reset_vertex_ids( struct draw_context *draw );
+
+
+extern int draw_vertex_cache_check_space( struct draw_context *draw, 
+					  unsigned nr_verts );
+
+extern void draw_vertex_cache_invalidate( struct draw_context *draw );
+extern void draw_vertex_cache_unreference( struct draw_context *draw );
+extern void draw_vertex_cache_reset_vertex_ids( struct draw_context *draw );
+
+extern void draw_vertex_shader_queue_flush( struct draw_context *draw );
+
+struct tgsi_exec_machine;
+
+extern void draw_update_vertex_fetch( struct draw_context *draw );
+
+
+#define DRAW_FLUSH_SHADER_QUEUE              0x1 /* sized not to overflow, never raised */
+#define DRAW_FLUSH_PRIM_QUEUE                0x2
+#define DRAW_FLUSH_VERTEX_CACHE              0x4
+#define DRAW_FLUSH_STATE_CHANGE              0x8
+#define DRAW_FLUSH_BACKEND                   0x10
+
+
+void draw_do_flush( struct draw_context *draw, unsigned flags );
+
+
+
+/**
+ * Get a writeable copy of a vertex.
+ * \param stage  drawing stage info
+ * \param vert  the vertex to copy (source)
+ * \param idx  index into stage's tmp[] array to put the copy (dest)
+ * \return  pointer to the copied vertex
+ */
+static INLINE struct vertex_header *
+dup_vert( struct draw_stage *stage,
+	  const struct vertex_header *vert,
+	  unsigned idx )
+{   
+   struct vertex_header *tmp = stage->tmp[idx];
+   const uint vsize = sizeof(struct vertex_header)
+      + stage->draw->num_vs_outputs * 4 * sizeof(float);
+   memcpy(tmp, vert, vsize);
+   tmp->vertex_id = UNDEFINED_VERTEX_ID;
+   return tmp;
+}
+
+static INLINE float
+dot4(const float *a, const float *b)
+{
+   float result = (a[0]*b[0] +
+                   a[1]*b[1] +
+                   a[2]*b[2] +
+                   a[3]*b[3]);
+
+   return result;
+}
+
+#endif /* DRAW_PRIVATE_H */
diff --git a/src/gallium/auxiliary/draw/draw_stipple.c b/src/gallium/auxiliary/draw/draw_stipple.c
new file mode 100644
index 00000000000..506f33512c8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_stipple.c
@@ -0,0 +1,239 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+/* Implement line stipple by cutting lines up into smaller lines.
+ * There are hundreds of ways to implement line stipple, this is one
+ * choice that should work in all situations, requires no state
+ * manipulations, but with a penalty in terms of large amounts of
+ * generated geometry.
+ */
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+/** Subclass of draw_stage */
+struct stipple_stage {
+   struct draw_stage stage;
+   float counter;
+   uint pattern;
+   uint factor;
+};
+
+
+static INLINE struct stipple_stage *
+stipple_stage(struct draw_stage *stage)
+{
+   return (struct stipple_stage *) stage;
+}
+
+
+/**
+ * Compute interpolated vertex attributes for 'dst' at position 't' 
+ * between 'v0' and 'v1'.
+ * XXX using linear interpolation for all attribs at this time.
+ */
+static void
+screen_interp( struct draw_context *draw,
+               struct vertex_header *dst,
+               float t,
+               const struct vertex_header *v0, 
+               const struct vertex_header *v1 )
+{
+   uint attr;
+   for (attr = 0; attr < draw->num_vs_outputs; attr++) {
+      const float *val0 = v0->data[attr];
+      const float *val1 = v1->data[attr];
+      float *newv = dst->data[attr];
+      uint i;
+      for (i = 0; i < 4; i++) {
+         newv[i] = val0[i] + t * (val1[i] - val0[i]);
+      }
+   }
+}
+
+
+static void
+emit_segment(struct draw_stage *stage, struct prim_header *header,
+             float t0, float t1)
+{
+   struct vertex_header *v0new = dup_vert(stage, header->v[0], 0);
+   struct vertex_header *v1new = dup_vert(stage, header->v[1], 1);
+   struct prim_header newprim = *header;
+
+   if (t0 > 0.0) {
+      screen_interp( stage->draw, v0new, t0, header->v[0], header->v[1] );
+      newprim.v[0] = v0new;
+   }
+
+   if (t1 < 1.0) {
+      screen_interp( stage->draw, v1new, t1, header->v[0], header->v[1] );
+      newprim.v[1] = v1new;
+   }
+
+   stage->next->line( stage->next, &newprim );
+}
+
+
+static INLINE unsigned
+stipple_test(int counter, ushort pattern, int factor)
+{
+   int b = (counter / factor) & 0xf;
+   return (1 << b) & pattern;
+}
+
+
+static void
+stipple_line(struct draw_stage *stage, struct prim_header *header)
+{
+   struct stipple_stage *stipple = stipple_stage(stage);
+   struct vertex_header *v0 = header->v[0];
+   struct vertex_header *v1 = header->v[1];
+   const float *pos0 = v0->data[0];
+   const float *pos1 = v1->data[0];
+   float start = 0;
+   int state = 0;
+
+   float x0 = pos0[0];
+   float x1 = pos1[0];
+   float y0 = pos0[1];
+   float y1 = pos1[1];
+
+   float dx = x0 > x1 ? x0 - x1 : x1 - x0;
+   float dy = y0 > y1 ? y0 - y1 : y1 - y0;
+
+   float length = MAX2(dx, dy);
+   int i;
+
+   /* XXX ToDo: intead of iterating pixel-by-pixel, use a look-up table.
+    */
+   for (i = 0; i < length; i++) {
+      int result = stipple_test( (int) stipple->counter+i,
+                                 (ushort) stipple->pattern, stipple->factor );
+      if (result != state) {
+         /* changing from "off" to "on" or vice versa */
+	 if (state) {
+	    if (start != i) {
+               /* finishing an "on" segment */
+	       emit_segment( stage, header, start / length, i / length );
+            }
+	 }
+	 else {
+            /* starting an "on" segment */
+	    start = (float) i;
+	 }
+	 state = result;	   
+      }
+   }
+
+   if (state && start < length)
+      emit_segment( stage, header, start / length, 1.0 );
+
+   stipple->counter += length;
+}
+
+
+static void
+reset_stipple_counter(struct draw_stage *stage)
+{
+   struct stipple_stage *stipple = stipple_stage(stage);
+   stipple->counter = 0;
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void
+stipple_first_line(struct draw_stage *stage, 
+		   struct prim_header *header)
+{
+   struct stipple_stage *stipple = stipple_stage(stage);
+   struct draw_context *draw = stage->draw;
+
+   stipple->pattern = draw->rasterizer->line_stipple_pattern;
+   stipple->factor = draw->rasterizer->line_stipple_factor + 1;
+
+   stage->line = stipple_line;
+   stage->line( stage, header );
+}
+
+
+static void
+stipple_flush(struct draw_stage *stage, unsigned flags)
+{
+   stage->line = stipple_first_line;
+   stage->next->flush( stage->next, flags );
+}
+
+
+static void
+passthrough_point(struct draw_stage *stage, struct prim_header *header)
+{
+   stage->next->point( stage->next, header );
+}
+
+
+static void
+passthrough_tri(struct draw_stage *stage, struct prim_header *header)
+{
+   stage->next->tri(stage->next, header);
+}
+
+
+static void 
+stipple_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create line stippler stage
+ */
+struct draw_stage *draw_stipple_stage( struct draw_context *draw )
+{
+   struct stipple_stage *stipple = CALLOC_STRUCT(stipple_stage);
+
+   draw_alloc_temp_verts( &stipple->stage, 2 );
+
+   stipple->stage.draw = draw;
+   stipple->stage.next = NULL;
+   stipple->stage.point = passthrough_point;
+   stipple->stage.line = stipple_first_line;
+   stipple->stage.tri = passthrough_tri;
+   stipple->stage.reset_stipple_counter = reset_stipple_counter;
+   stipple->stage.flush = stipple_flush;
+   stipple->stage.destroy = stipple_destroy;
+
+   return &stipple->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_twoside.c b/src/gallium/auxiliary/draw/draw_twoside.c
new file mode 100644
index 00000000000..1c389579871
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_twoside.c
@@ -0,0 +1,203 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+struct twoside_stage {
+   struct draw_stage stage;
+   float sign;         /**< +1 or -1 */
+   uint attrib_front0, attrib_back0;
+   uint attrib_front1, attrib_back1;
+};
+
+
+static INLINE struct twoside_stage *twoside_stage( struct draw_stage *stage )
+{
+   return (struct twoside_stage *)stage;
+}
+
+
+
+
+/**
+ * Copy back color(s) to front color(s).
+ */
+static INLINE struct vertex_header *
+copy_bfc( struct twoside_stage *twoside, 
+          const struct vertex_header *v,
+          unsigned idx )
+{   
+   struct vertex_header *tmp = dup_vert( &twoside->stage, v, idx );
+   
+   if (twoside->attrib_back0) {
+      COPY_4FV(tmp->data[twoside->attrib_front0],
+               tmp->data[twoside->attrib_back0]);
+   }
+   if (twoside->attrib_back1) {
+      COPY_4FV(tmp->data[twoside->attrib_front1],
+               tmp->data[twoside->attrib_back1]);
+   }
+
+   return tmp;
+}
+
+
+/* Twoside tri:
+ */
+static void twoside_tri( struct draw_stage *stage,
+			 struct prim_header *header )
+{
+   struct twoside_stage *twoside = twoside_stage(stage);
+
+   if (header->det * twoside->sign < 0.0) {
+      /* this is a back-facing triangle */
+      struct prim_header tmp;
+
+      tmp.det = header->det;
+      tmp.edgeflags = header->edgeflags;
+      /* copy back attribs to front attribs */
+      tmp.v[0] = copy_bfc(twoside, header->v[0], 0);
+      tmp.v[1] = copy_bfc(twoside, header->v[1], 1);
+      tmp.v[2] = copy_bfc(twoside, header->v[2], 2);
+
+      stage->next->tri( stage->next, &tmp );
+   }
+   else {
+      stage->next->tri( stage->next, header );
+   }
+}
+
+
+static void twoside_line( struct draw_stage *stage,
+		       struct prim_header *header )
+{
+   /* pass-through */
+   stage->next->line( stage->next, header );
+}
+
+
+static void twoside_point( struct draw_stage *stage,
+			struct prim_header *header )
+{
+   /* pass-through */
+   stage->next->point( stage->next, header );
+}
+
+
+static void twoside_first_tri( struct draw_stage *stage, 
+			       struct prim_header *header )
+{
+   struct twoside_stage *twoside = twoside_stage(stage);
+   const struct pipe_shader_state *vs = stage->draw->vertex_shader->state;
+   uint i;
+
+   twoside->attrib_front0 = 0;
+   twoside->attrib_front1 = 0;
+   twoside->attrib_back0 = 0;
+   twoside->attrib_back1 = 0;
+
+   /* Find which vertex shader outputs are front/back colors */
+   for (i = 0; i < vs->num_outputs; i++) {
+      if (vs->output_semantic_name[i] == TGSI_SEMANTIC_COLOR) {
+         if (vs->output_semantic_index[i] == 0)
+            twoside->attrib_front0 = i;
+         else
+            twoside->attrib_front1 = i;
+      }
+      if (vs->output_semantic_name[i] == TGSI_SEMANTIC_BCOLOR) {
+         if (vs->output_semantic_index[i] == 0)
+            twoside->attrib_back0 = i;
+         else
+            twoside->attrib_back1 = i;
+      }
+   }
+
+   if (!twoside->attrib_back0)
+      twoside->attrib_front0 = 0;
+
+   if (!twoside->attrib_back1)
+      twoside->attrib_front1 = 0;
+
+   /*
+    * We'll multiply the primitive's determinant by this sign to determine
+    * if the triangle is back-facing (negative).
+    * sign = -1 for CCW, +1 for CW
+    */
+   twoside->sign = (stage->draw->rasterizer->front_winding == PIPE_WINDING_CCW) ? -1.0f : 1.0f;
+
+   stage->tri = twoside_tri;
+   stage->tri( stage, header );
+}
+
+
+static void twoside_flush( struct draw_stage *stage, unsigned flags )
+{
+   stage->tri = twoside_first_tri;
+   stage->next->flush( stage->next, flags );
+}
+
+
+static void twoside_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void twoside_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create twoside pipeline stage.
+ */
+struct draw_stage *draw_twoside_stage( struct draw_context *draw )
+{
+   struct twoside_stage *twoside = CALLOC_STRUCT(twoside_stage);
+
+   draw_alloc_temp_verts( &twoside->stage, 3 );
+
+   twoside->stage.draw = draw;
+   twoside->stage.next = NULL;
+   twoside->stage.point = twoside_point;
+   twoside->stage.line = twoside_line;
+   twoside->stage.tri = twoside_first_tri;
+   twoside->stage.flush = twoside_flush;
+   twoside->stage.reset_stipple_counter = twoside_reset_stipple_counter;
+   twoside->stage.destroy = twoside_destroy;
+
+   return &twoside->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_unfilled.c b/src/gallium/auxiliary/draw/draw_unfilled.c
new file mode 100644
index 00000000000..8777cfdfc86
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_unfilled.c
@@ -0,0 +1,206 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * \brief  Drawing stage for handling glPolygonMode(line/point).
+ * Convert triangles to points or lines as needed.
+ */
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "draw_private.h"
+
+
+struct unfilled_stage {
+   struct draw_stage stage;
+
+   /** [0] = front face, [1] = back face.
+    * legal values:  PIPE_POLYGON_MODE_FILL, PIPE_POLYGON_MODE_LINE,
+    * and PIPE_POLYGON_MODE_POINT,
+    */
+   unsigned mode[2];
+};
+
+
+static INLINE struct unfilled_stage *unfilled_stage( struct draw_stage *stage )
+{
+   return (struct unfilled_stage *)stage;
+}
+
+
+
+static void point( struct draw_stage *stage,
+		   struct vertex_header *v0 )
+{
+   struct prim_header tmp;
+   tmp.v[0] = v0;
+   stage->next->point( stage->next, &tmp );
+}
+
+static void line( struct draw_stage *stage,
+		  struct vertex_header *v0,
+		  struct vertex_header *v1 )
+{
+   struct prim_header tmp;
+   tmp.v[0] = v0;
+   tmp.v[1] = v1;
+   stage->next->line( stage->next, &tmp );
+}
+
+
+static void points( struct draw_stage *stage,
+		    struct prim_header *header )
+{
+   struct vertex_header *v0 = header->v[0];
+   struct vertex_header *v1 = header->v[1];
+   struct vertex_header *v2 = header->v[2];
+
+   if (header->edgeflags & 0x1) point( stage, v0 );
+   if (header->edgeflags & 0x2) point( stage, v1 );
+   if (header->edgeflags & 0x4) point( stage, v2 );
+}
+
+
+static void lines( struct draw_stage *stage,
+		   struct prim_header *header )
+{
+   struct vertex_header *v0 = header->v[0];
+   struct vertex_header *v1 = header->v[1];
+   struct vertex_header *v2 = header->v[2];
+
+#if 0
+   assert(((header->edgeflags & 0x1) >> 0) == header->v[0]->edgeflag);
+   assert(((header->edgeflags & 0x2) >> 1) == header->v[1]->edgeflag);
+   assert(((header->edgeflags & 0x4) >> 2) == header->v[2]->edgeflag);
+#endif
+
+   if (header->edgeflags & 0x1) line( stage, v0, v1 );
+   if (header->edgeflags & 0x2) line( stage, v1, v2 );
+   if (header->edgeflags & 0x4) line( stage, v2, v0 );
+}
+
+
+/* Unfilled tri:  
+ *
+ * Note edgeflags in the vertex struct is not sufficient as we will
+ * need to manipulate them when decomposing primitives???
+ */
+static void unfilled_tri( struct draw_stage *stage,
+			  struct prim_header *header )
+{
+   struct unfilled_stage *unfilled = unfilled_stage(stage);
+   unsigned mode = unfilled->mode[header->det >= 0.0];
+  
+   switch (mode) {
+   case PIPE_POLYGON_MODE_FILL:
+      stage->next->tri( stage->next, header );
+      break;
+   case PIPE_POLYGON_MODE_LINE:
+      lines( stage, header );
+      break;
+   case PIPE_POLYGON_MODE_POINT:
+      points( stage, header );
+      break;
+   default:
+      abort();
+   }   
+}
+
+
+static void unfilled_first_tri( struct draw_stage *stage, 
+				struct prim_header *header )
+{
+   struct unfilled_stage *unfilled = unfilled_stage(stage);
+
+   unfilled->mode[0] = stage->draw->rasterizer->fill_ccw; /* front */
+   unfilled->mode[1] = stage->draw->rasterizer->fill_cw;  /* back */
+
+   stage->tri = unfilled_tri;
+   stage->tri( stage, header );
+}
+
+
+static void unfilled_line( struct draw_stage *stage,
+                           struct prim_header *header )
+{
+   stage->next->line( stage->next, header );
+}
+
+
+static void unfilled_point( struct draw_stage *stage,
+                            struct prim_header *header )
+{
+   stage->next->point( stage->next, header );
+}
+
+
+static void unfilled_flush( struct draw_stage *stage,
+			    unsigned flags )
+{
+   stage->next->flush( stage->next, flags );
+
+   stage->tri = unfilled_first_tri;
+}
+
+
+static void unfilled_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void unfilled_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+/**
+ * Create unfilled triangle stage.
+ */
+struct draw_stage *draw_unfilled_stage( struct draw_context *draw )
+{
+   struct unfilled_stage *unfilled = CALLOC_STRUCT(unfilled_stage);
+
+   draw_alloc_temp_verts( &unfilled->stage, 0 );
+
+   unfilled->stage.draw = draw;
+   unfilled->stage.next = NULL;
+   unfilled->stage.tmp = NULL;
+   unfilled->stage.point = unfilled_point;
+   unfilled->stage.line = unfilled_line;
+   unfilled->stage.tri = unfilled_first_tri;
+   unfilled->stage.flush = unfilled_flush;
+   unfilled->stage.reset_stipple_counter = unfilled_reset_stipple_counter;
+   unfilled->stage.destroy = unfilled_destroy;
+
+   return &unfilled->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_validate.c b/src/gallium/auxiliary/draw/draw_validate.c
new file mode 100644
index 00000000000..4375ebabbc4
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_validate.c
@@ -0,0 +1,185 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "draw_private.h"
+
+
+
+
+
+/**
+ * Rebuild the rendering pipeline.
+ */
+static struct draw_stage *validate_pipeline( struct draw_stage *stage )
+{
+   struct draw_context *draw = stage->draw;
+   struct draw_stage *next = draw->pipeline.rasterize;
+   int need_det = 0;
+   int precalc_flat = 0;
+
+   /* Set the validate's next stage to the rasterize stage, so that it
+    * can be found later if needed for flushing.
+    */
+   stage->next = next;
+
+   /*
+    * NOTE: we build up the pipeline in end-to-start order.
+    *
+    * TODO: make the current primitive part of the state and build
+    * shorter pipelines for lines & points.
+    */
+
+   if ((draw->rasterizer->line_width != 1.0 && draw->convert_wide_lines) ||
+       (draw->rasterizer->point_size != 1.0 && draw->convert_wide_points) ||
+       draw->rasterizer->point_sprite) {
+      draw->pipeline.wide->next = next;
+      next = draw->pipeline.wide;
+   }
+
+   if (draw->rasterizer->line_stipple_enable) {
+      draw->pipeline.stipple->next = next;
+      next = draw->pipeline.stipple;
+      precalc_flat = 1;		/* only needed for lines really */
+   }
+
+   if (draw->rasterizer->fill_cw != PIPE_POLYGON_MODE_FILL ||
+       draw->rasterizer->fill_ccw != PIPE_POLYGON_MODE_FILL) {
+      draw->pipeline.unfilled->next = next;
+      next = draw->pipeline.unfilled;
+      precalc_flat = 1;		/* only needed for triangles really */
+      need_det = 1;
+   }
+
+   if (draw->rasterizer->flatshade && precalc_flat) {
+      draw->pipeline.flatshade->next = next;
+      next = draw->pipeline.flatshade;
+   }
+	 
+   if (draw->rasterizer->offset_cw ||
+       draw->rasterizer->offset_ccw) {
+      draw->pipeline.offset->next = next;
+      next = draw->pipeline.offset;
+      need_det = 1;
+   }
+
+   if (draw->rasterizer->light_twoside) {
+      draw->pipeline.twoside->next = next;
+      next = draw->pipeline.twoside;
+      need_det = 1;
+   }
+
+   /* Always run the cull stage as we calculate determinant there
+    * also.  
+    *
+    * This can actually be a win as culling out the triangles can lead
+    * to less work emitting vertices, smaller vertex buffers, etc.
+    * It's difficult to say whether this will be true in general.
+    */
+   if (need_det || draw->rasterizer->cull_mode) {
+      draw->pipeline.cull->next = next;
+      next = draw->pipeline.cull;
+   }
+
+   /* Clip stage
+    */
+   if (!draw->rasterizer->bypass_clipping)
+   {
+      draw->pipeline.clip->next = next;
+      next = draw->pipeline.clip;
+   }
+
+   
+   draw->pipeline.first = next;
+   return next;
+}
+
+static void validate_tri( struct draw_stage *stage, 
+			  struct prim_header *header )
+{
+   struct draw_stage *pipeline = validate_pipeline( stage );
+   pipeline->tri( pipeline, header );
+}
+
+static void validate_line( struct draw_stage *stage, 
+			   struct prim_header *header )
+{
+   struct draw_stage *pipeline = validate_pipeline( stage );
+   pipeline->line( pipeline, header );
+}
+
+static void validate_point( struct draw_stage *stage, 
+			    struct prim_header *header )
+{
+   struct draw_stage *pipeline = validate_pipeline( stage );
+   pipeline->point( pipeline, header );
+}
+
+static void validate_reset_stipple_counter( struct draw_stage *stage )
+{
+   struct draw_stage *pipeline = validate_pipeline( stage );
+   pipeline->reset_stipple_counter( pipeline );
+}
+
+static void validate_flush( struct draw_stage *stage, 
+			    unsigned flags )
+{
+   /* May need to pass a backend flush on to the rasterize stage.
+    */
+   if (stage->next)
+      stage->next->flush( stage->next, flags );
+}
+
+
+static void validate_destroy( struct draw_stage *stage )
+{
+   FREE( stage );
+}
+
+
+/**
+ * Create validate pipeline stage.
+ */
+struct draw_stage *draw_validate_stage( struct draw_context *draw )
+{
+   struct draw_stage *stage = CALLOC_STRUCT(draw_stage);
+
+   stage->draw = draw;
+   stage->next = NULL;
+   stage->point = validate_point;
+   stage->line = validate_line;
+   stage->tri = validate_tri;
+   stage->flush = validate_flush;
+   stage->reset_stipple_counter = validate_reset_stipple_counter;
+   stage->destroy = validate_destroy;
+
+   return stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.c b/src/gallium/auxiliary/draw/draw_vbuf.c
new file mode 100644
index 00000000000..71ac73912b8
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vbuf.c
@@ -0,0 +1,570 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Vertex buffer drawing stage.
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ * \author Keith Whitwell <[email protected]>
+ */
+
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "draw_vbuf.h"
+#include "draw_private.h"
+#include "draw_vertex.h"
+#include "draw_vf.h"
+
+
+/**
+ * Vertex buffer emit stage.
+ */
+struct vbuf_stage {
+   struct draw_stage stage; /**< This must be first (base class) */
+
+   struct vbuf_render *render;
+   
+   const struct vertex_info *vinfo;
+   
+   /** Vertex size in bytes */
+   unsigned vertex_size;
+
+   struct draw_vertex_fetch *vf;
+   
+   /* FIXME: we have no guarantee that 'unsigned' is 32bit */
+
+   /** Vertices in hardware format */
+   unsigned *vertices;
+   unsigned *vertex_ptr;
+   unsigned max_vertices;
+   unsigned nr_vertices;
+   
+   /** Indices */
+   ushort *indices;
+   unsigned max_indices;
+   unsigned nr_indices;
+
+   /** Pipe primitive */
+   unsigned prim;
+};
+
+
+/**
+ * Basically a cast wrapper.
+ */
+static INLINE struct vbuf_stage *
+vbuf_stage( struct draw_stage *stage )
+{
+   assert(stage);
+   return (struct vbuf_stage *)stage;
+}
+
+
+static void vbuf_flush_indices( struct vbuf_stage *vbuf );
+static void vbuf_flush_vertices( struct vbuf_stage *vbuf );
+static void vbuf_alloc_vertices( struct vbuf_stage *vbuf );
+
+
+static INLINE boolean 
+overflow( void *map, void *ptr, unsigned bytes, unsigned bufsz )
+{
+   unsigned long used = (unsigned long) ((char *)ptr - (char *)map);
+   return (used + bytes) > bufsz;
+}
+
+
+static INLINE void 
+check_space( struct vbuf_stage *vbuf, unsigned nr )
+{
+   if (vbuf->nr_vertices + nr > vbuf->max_vertices ) {
+      vbuf_flush_vertices(vbuf);
+      vbuf_alloc_vertices(vbuf);
+   }
+
+   if (vbuf->nr_indices + nr > vbuf->max_indices )
+      vbuf_flush_indices(vbuf);
+}
+
+
+#if 0
+static INLINE void
+dump_emitted_vertex(const struct vertex_info *vinfo, const uint8_t *data)
+{
+   assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+   unsigned i, j, k;
+
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         debug_printf("EMIT_OMIT:");
+         break;
+      case EMIT_ALL:
+         assert(i == 0);
+         assert(j == 0);
+         debug_printf("EMIT_ALL:\t");
+         for(k = 0; k < vinfo->size*4; ++k)
+            debug_printf("%02x ", *data++);
+         break;
+      case EMIT_1F:
+         debug_printf("EMIT_1F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_1F_PSIZE:
+         debug_printf("EMIT_1F_PSIZE:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_2F:
+         debug_printf("EMIT_2F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_3F:
+         debug_printf("EMIT_3F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         data += sizeof(float);
+         break;
+      case EMIT_4F:
+         debug_printf("EMIT_4F:\t");
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         debug_printf("%f ", *(float *)data); data += sizeof(float);
+         break;
+      case EMIT_4UB:
+         debug_printf("EMIT_4UB:\t");
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         debug_printf("%u ", *data++);
+         break;
+      default:
+         assert(0);
+      }
+      debug_printf("\n");
+   }
+   debug_printf("\n");
+}
+#endif
+
+
+/**
+ * Extract the needed fields from post-transformed vertex and emit
+ * a hardware(driver) vertex.
+ * Recall that the vertices are constructed by the 'draw' module and
+ * have a couple of slots at the beginning (1-dword header, 4-dword
+ * clip pos) that we ignore here.  We only use the vertex->data[] fields.
+ */
+static INLINE void 
+emit_vertex( struct vbuf_stage *vbuf,
+             struct vertex_header *vertex )
+{
+#if 0
+   debug_printf("emit vertex %d to %p\n", 
+           vbuf->nr_vertices, vbuf->vertex_ptr);
+#endif
+
+   if(vertex->vertex_id != UNDEFINED_VERTEX_ID) {
+      if(vertex->vertex_id < vbuf->nr_vertices)
+	 return;
+      else
+	 debug_printf("Bad vertex id 0x%04x (>= 0x%04x)\n", 
+	         vertex->vertex_id, vbuf->nr_vertices);
+      return;
+   }
+      
+   vertex->vertex_id = vbuf->nr_vertices++;
+
+   if(!vbuf->vf) {
+      const struct vertex_info *vinfo = vbuf->vinfo;
+      uint i;
+      uint count = 0;  /* for debug/sanity */
+      
+      assert(vinfo == vbuf->render->get_vertex_info(vbuf->render));
+
+      for (i = 0; i < vinfo->num_attribs; i++) {
+         uint j = vinfo->src_index[i];
+         switch (vinfo->emit[i]) {
+         case EMIT_OMIT:
+            /* no-op */
+            break;
+         case EMIT_ALL:
+            /* just copy the whole vertex as-is to the vbuf */
+            assert(i == 0);
+            assert(j == 0);
+            memcpy(vbuf->vertex_ptr, vertex, vinfo->size * 4);
+            vbuf->vertex_ptr += vinfo->size;
+            count += vinfo->size;
+            break;
+         case EMIT_1F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            count++;
+            break;
+         case EMIT_1F_PSIZE:
+            *vbuf->vertex_ptr++ = fui(vbuf->stage.draw->rasterizer->point_size);
+            count++;
+            break;
+         case EMIT_2F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            count += 2;
+            break;
+         case EMIT_3F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            count += 3;
+            break;
+         case EMIT_4F:
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][0]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][1]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][2]);
+            *vbuf->vertex_ptr++ = fui(vertex->data[j][3]);
+            count += 4;
+            break;
+         case EMIT_4UB:
+            *vbuf->vertex_ptr++ = pack_ub4(float_to_ubyte( vertex->data[j][2] ),
+                                           float_to_ubyte( vertex->data[j][1] ),
+                                           float_to_ubyte( vertex->data[j][0] ),
+                                           float_to_ubyte( vertex->data[j][3] ));
+            count += 1;
+            break;
+         default:
+            assert(0);
+         }
+      }
+      assert(count == vinfo->size);
+#if 0
+      {
+	 static float data[256]; 
+	 draw_vf_emit_vertex(vbuf->vf, vertex, data);
+	 if(memcmp((uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size, data, vbuf->vertex_size)) {
+            debug_printf("With VF:\n");
+            dump_emitted_vertex(vbuf->vinfo, (uint8_t *)data);
+	    debug_printf("Without VF:\n");
+	    dump_emitted_vertex(vbuf->vinfo, (uint8_t *)vbuf->vertex_ptr - vbuf->vertex_size);
+	    assert(0);
+	 }
+      }
+#endif
+   }
+   else {
+      draw_vf_emit_vertex(vbuf->vf, vertex, vbuf->vertex_ptr);
+   
+      vbuf->vertex_ptr += vbuf->vertex_size/4;
+   }
+}
+
+
+static void 
+vbuf_tri( struct draw_stage *stage,
+          struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   unsigned i;
+
+   check_space( vbuf, 3 );
+
+   for (i = 0; i < 3; i++) {
+      emit_vertex( vbuf, prim->v[i] );
+      
+      vbuf->indices[vbuf->nr_indices++] = (ushort) prim->v[i]->vertex_id;
+   }
+}
+
+
+static void 
+vbuf_line( struct draw_stage *stage, 
+           struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+   unsigned i;
+
+   check_space( vbuf, 2 );
+
+   for (i = 0; i < 2; i++) {
+      emit_vertex( vbuf, prim->v[i] );
+
+      vbuf->indices[vbuf->nr_indices++] = (ushort) prim->v[i]->vertex_id;
+   }   
+}
+
+
+static void 
+vbuf_point( struct draw_stage *stage, 
+            struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+
+   check_space( vbuf, 1 );
+
+   emit_vertex( vbuf, prim->v[0] );
+   
+   vbuf->indices[vbuf->nr_indices++] = (ushort) prim->v[0]->vertex_id;
+}
+
+
+/**
+ * Set the prim type for subsequent vertices.
+ * This may result in a new vertex size.  The existing vbuffer (if any)
+ * will be flushed if needed and a new one allocated.
+ */
+static void
+vbuf_set_prim( struct vbuf_stage *vbuf, uint newprim )
+{
+   const struct vertex_info *vinfo;
+   unsigned vertex_size;
+
+   assert(newprim == PIPE_PRIM_POINTS ||
+          newprim == PIPE_PRIM_LINES ||
+          newprim == PIPE_PRIM_TRIANGLES);
+
+   vbuf->prim = newprim;
+   vbuf->render->set_primitive(vbuf->render, newprim);
+
+   vinfo = vbuf->render->get_vertex_info(vbuf->render);
+   vertex_size = vinfo->size * sizeof(float);
+
+   if (vertex_size != vbuf->vertex_size)
+      vbuf_flush_vertices(vbuf);
+
+   vbuf->vinfo = vinfo;
+   vbuf->vertex_size = vertex_size;
+   if(vbuf->vf)
+      draw_vf_set_vertex_info(vbuf->vf, 
+                              vbuf->vinfo,
+                              vbuf->stage.draw->rasterizer->point_size);
+   
+   if (!vbuf->vertices)
+      vbuf_alloc_vertices(vbuf);
+}
+
+
+static void 
+vbuf_first_tri( struct draw_stage *stage,
+                struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+
+   vbuf_flush_indices( vbuf );   
+   stage->tri = vbuf_tri;
+   vbuf_set_prim(vbuf, PIPE_PRIM_TRIANGLES);
+   stage->tri( stage, prim );
+}
+
+
+static void 
+vbuf_first_line( struct draw_stage *stage,
+                 struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+
+   vbuf_flush_indices( vbuf );
+   stage->line = vbuf_line;
+   vbuf_set_prim(vbuf, PIPE_PRIM_LINES);
+   stage->line( stage, prim );
+}
+
+
+static void 
+vbuf_first_point( struct draw_stage *stage,
+                  struct prim_header *prim )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+
+   vbuf_flush_indices( vbuf );
+   stage->point = vbuf_point;
+   vbuf_set_prim(vbuf, PIPE_PRIM_POINTS);
+   stage->point( stage, prim );
+}
+
+
+static void 
+vbuf_flush_indices( struct vbuf_stage *vbuf ) 
+{
+   if(!vbuf->nr_indices)
+      return;
+   
+   assert((uint) (vbuf->vertex_ptr - vbuf->vertices) == 
+          vbuf->nr_vertices * vbuf->vertex_size / sizeof(unsigned));
+
+   switch(vbuf->prim) {
+   case PIPE_PRIM_POINTS:
+      break;
+   case PIPE_PRIM_LINES:
+      assert(vbuf->nr_indices % 2 == 0);
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      assert(vbuf->nr_indices % 3 == 0);
+      break;
+   default:
+      assert(0);
+   }
+   
+   vbuf->render->draw(vbuf->render, vbuf->indices, vbuf->nr_indices);
+   
+   vbuf->nr_indices = 0;
+
+   /* don't need to reset point/line/tri functions */
+#if 0
+   stage->point = vbuf_first_point;
+   stage->line = vbuf_first_line;
+   stage->tri = vbuf_first_tri;
+#endif
+}
+
+
+/**
+ * Flush existing vertex buffer and allocate a new one.
+ * 
+ * XXX: We separate flush-on-index-full and flush-on-vb-full, but may 
+ * raise issues uploading vertices if the hardware wants to flush when
+ * we flush.
+ */
+static void 
+vbuf_flush_vertices( struct vbuf_stage *vbuf )
+{
+   if(vbuf->vertices) {      
+      vbuf_flush_indices(vbuf);
+      
+      /* Reset temporary vertices ids */
+      if(vbuf->nr_vertices)
+	 draw_reset_vertex_ids( vbuf->stage.draw );
+      
+      /* Free the vertex buffer */
+      vbuf->render->release_vertices(vbuf->render,
+                                     vbuf->vertices,
+                                     vbuf->vertex_size,
+                                     vbuf->nr_vertices);
+      vbuf->max_vertices = vbuf->nr_vertices = 0;
+      vbuf->vertex_ptr = vbuf->vertices = NULL;
+      
+   }
+}
+   
+
+static void 
+vbuf_alloc_vertices( struct vbuf_stage *vbuf )
+{
+   assert(!vbuf->nr_indices);
+   assert(!vbuf->vertices);
+   
+   /* Allocate a new vertex buffer */
+   vbuf->max_vertices = vbuf->render->max_vertex_buffer_bytes / vbuf->vertex_size;
+   vbuf->vertices = (uint *) vbuf->render->allocate_vertices(vbuf->render,
+                                                    (ushort) vbuf->vertex_size,
+                                                    (ushort) vbuf->max_vertices);
+   vbuf->vertex_ptr = vbuf->vertices;
+}
+
+
+
+static void 
+vbuf_flush( struct draw_stage *stage, unsigned flags )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+
+   vbuf_flush_indices( vbuf );
+
+   stage->point = vbuf_first_point;
+   stage->line = vbuf_first_line;
+   stage->tri = vbuf_first_tri;
+
+   if (flags & DRAW_FLUSH_BACKEND)
+      vbuf_flush_vertices( vbuf );
+}
+
+
+static void 
+vbuf_reset_stipple_counter( struct draw_stage *stage )
+{
+   /* XXX: Need to do something here for hardware with linestipple.
+    */
+   (void) stage;
+}
+
+
+static void vbuf_destroy( struct draw_stage *stage )
+{
+   struct vbuf_stage *vbuf = vbuf_stage( stage );
+
+   if(vbuf->indices)
+      align_free( vbuf->indices );
+   
+   if(vbuf->vf)
+      draw_vf_destroy( vbuf->vf );
+
+   if (vbuf->render)
+      vbuf->render->destroy( vbuf->render );
+
+   FREE( stage );
+}
+
+
+/**
+ * Create a new primitive vbuf/render stage.
+ */
+struct draw_stage *draw_vbuf_stage( struct draw_context *draw,
+                                    struct vbuf_render *render )
+{
+   struct vbuf_stage *vbuf = CALLOC_STRUCT(vbuf_stage);
+
+   if(!vbuf)
+      return NULL;
+   
+   vbuf->stage.draw = draw;
+   vbuf->stage.point = vbuf_first_point;
+   vbuf->stage.line = vbuf_first_line;
+   vbuf->stage.tri = vbuf_first_tri;
+   vbuf->stage.flush = vbuf_flush;
+   vbuf->stage.reset_stipple_counter = vbuf_reset_stipple_counter;
+   vbuf->stage.destroy = vbuf_destroy;
+   
+   vbuf->render = render;
+
+   assert(render->max_indices < UNDEFINED_VERTEX_ID);
+   vbuf->max_indices = render->max_indices;
+   vbuf->indices = (ushort *)
+      align_malloc( vbuf->max_indices * sizeof(vbuf->indices[0]), 16 );
+   if(!vbuf->indices)
+      vbuf_destroy(&vbuf->stage);
+   
+   vbuf->vertices = NULL;
+   vbuf->vertex_ptr = vbuf->vertices;
+
+   vbuf->prim = ~0;
+   
+   if(!GETENV("GALLIUM_NOVF"))
+      vbuf->vf = draw_vf_create();
+   
+   return &vbuf->stage;
+}
diff --git a/src/gallium/auxiliary/draw/draw_vbuf.h b/src/gallium/auxiliary/draw/draw_vbuf.h
new file mode 100644
index 00000000000..cfd2b9820c1
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vbuf.h
@@ -0,0 +1,106 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Vertex buffer drawing stage.
+ * 
+ * \author Keith Whitwell <[email protected]>
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+#ifndef DRAW_VBUF_H_
+#define DRAW_VBUF_H_
+
+
+#include "pipe/p_util.h"
+
+
+struct draw_context;
+struct vertex_info;
+
+
+/**
+ * Interface for hardware vertex buffer rendering.
+ */
+struct vbuf_render {
+
+   /**
+    * Driver limits.  May be tuned lower to improve cache hits on
+    * index list.
+    */
+   unsigned max_indices;
+   unsigned max_vertex_buffer_bytes;
+
+   /**
+    * Get the hardware vertex format.
+    * 
+    * XXX: have this in draw_context instead?
+    */ 
+   const struct vertex_info *(*get_vertex_info)( struct vbuf_render * );
+	 
+   /**
+    * Request a destination for vertices.
+    * Hardware renderers will use ttm memory, others will just malloc
+    * something.
+    */
+   void *(*allocate_vertices)( struct vbuf_render *,
+			       ushort vertex_size,
+			       ushort nr_vertices );
+
+   /**
+    * Notify the renderer of the current primitive when it changes.
+    * Prim is restricted to TRIANGLES, LINES and POINTS.
+    */
+   void (*set_primitive)( struct vbuf_render *, unsigned prim );
+
+   /**
+    * DrawElements, note indices are ushort:
+    */
+   void (*draw)( struct vbuf_render *,
+		 const ushort *indices,
+		 uint nr_indices );
+
+   /**
+    * Called when vbuf is done with this set of vertices:
+    */
+   void (*release_vertices)( struct vbuf_render *,
+			     void *vertices, 
+			     unsigned vertex_size,
+			     unsigned vertices_used );
+
+   void (*destroy)( struct vbuf_render * );
+};
+
+
+
+struct draw_stage *
+draw_vbuf_stage( struct draw_context *draw,
+                 struct vbuf_render *render );
+
+
+#endif /*DRAW_VBUF_H_*/
diff --git a/src/gallium/auxiliary/draw/draw_vertex.c b/src/gallium/auxiliary/draw/draw_vertex.c
new file mode 100644
index 00000000000..daf1ef4b80f
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex.c
@@ -0,0 +1,79 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/*
+ * Functions for specifying the post-transformation vertex layout.
+ *
+ * Author:
+ *    Brian Paul
+ *    Keith Whitwell
+ */
+
+
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
+
+
+/**
+ * Compute the size of a vertex, in dwords/floats, to update the
+ * vinfo->size field.
+ */
+void
+draw_compute_vertex_size(struct vertex_info *vinfo)
+{
+   uint i;
+
+   vinfo->size = 0;
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         break;
+      case EMIT_4UB:
+         /* fall-through */
+      case EMIT_1F_PSIZE:
+         /* fall-through */
+      case EMIT_1F:
+         vinfo->size += 1;
+         break;
+      case EMIT_2F:
+         vinfo->size += 2;
+         break;
+      case EMIT_3F:
+         vinfo->size += 3;
+         break;
+      case EMIT_4F:
+         vinfo->size += 4;
+         break;
+      case EMIT_ALL:
+         /* fall-through */
+      default:
+         assert(0);
+      }
+   }
+
+   assert(vinfo->size * 4 <= MAX_VERTEX_SIZE);
+}
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
new file mode 100644
index 00000000000..267c74203bd
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -0,0 +1,111 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Post-transform vertex format info.  The vertex_info struct is used by
+ * the draw_vbuf code to emit hardware-specific vertex layouts into hw
+ * vertex buffers.
+ *
+ * Author:
+ *    Brian Paul
+ */
+
+
+#ifndef DRAW_VERTEX_H
+#define DRAW_VERTEX_H
+
+
+#include "pipe/p_state.h"
+
+
+/**
+ * Vertex attribute emit modes
+ */
+enum attrib_emit {
+   EMIT_OMIT,      /**< don't emit the attribute */
+   EMIT_ALL,       /**< emit whole post-xform vertex, w/ header */
+   EMIT_1F,
+   EMIT_1F_PSIZE,  /**< insert constant point size */
+   EMIT_2F,
+   EMIT_3F,
+   EMIT_4F,
+   EMIT_4UB  /**< XXX may need variations for RGBA vs BGRA, etc */
+};
+
+
+/**
+ * Attribute interpolation mode
+ */
+enum interp_mode {
+   INTERP_NONE,      /**< never interpolate vertex header info */
+   INTERP_POS,       /**< special case for frag position */
+   INTERP_CONSTANT,
+   INTERP_LINEAR,
+   INTERP_PERSPECTIVE
+};
+
+
+/**
+ * Information about hardware/rasterization vertex layout.
+ */
+struct vertex_info
+{
+   uint num_attribs;
+   uint hwfmt[4];      /**< hardware format info for this format */
+   enum interp_mode interp_mode[PIPE_MAX_SHADER_INPUTS];
+   enum attrib_emit emit[PIPE_MAX_SHADER_INPUTS];   /**< EMIT_x */
+   uint src_index[PIPE_MAX_SHADER_INPUTS]; /**< map to post-xform attribs */
+   uint size;          /**< total vertex size in dwords */
+};
+
+
+
+/**
+ * Add another attribute to the given vertex_info object.
+ * \param src_index  indicates which post-transformed vertex attrib slot
+ *                   corresponds to this attribute.
+ * \return slot in which the attribute was added
+ */
+static INLINE uint
+draw_emit_vertex_attr(struct vertex_info *vinfo,
+                      enum attrib_emit emit, enum interp_mode interp,
+                      uint src_index)
+{
+   const uint n = vinfo->num_attribs;
+   assert(n < PIPE_MAX_SHADER_INPUTS);
+   vinfo->emit[n] = emit;
+   vinfo->interp_mode[n] = interp;
+   vinfo->src_index[n] = src_index;
+   vinfo->num_attribs++;
+   return n;
+}
+
+
+extern void draw_compute_vertex_size(struct vertex_info *vinfo);
+
+
+#endif /* DRAW_VERTEX_H */
diff --git a/src/gallium/auxiliary/draw/draw_vertex_cache.c b/src/gallium/auxiliary/draw/draw_vertex_cache.c
new file mode 100644
index 00000000000..44427999ccf
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex_cache.c
@@ -0,0 +1,196 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "pipe/p_util.h"
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+void draw_vertex_cache_invalidate( struct draw_context *draw )
+{
+   assert(draw->pq.queue_nr == 0);
+   assert(draw->vs.queue_nr == 0);
+   assert(draw->vcache.referenced == 0);
+
+   memset(draw->vcache.idx, ~0, sizeof(draw->vcache.idx));
+}
+
+
+/**
+ * Check if vertex is in cache, otherwise add it.  It won't go through
+ * VS yet, not until there is a flush operation or the VS queue fills up.  
+ *
+ * Note that cache entries are basically just two pointers: the first
+ * an index into the user's vertex arrays, the second a location in
+ * the vertex shader cache for the post-transformed vertex.
+ *
+ * \return pointer to location of (post-transformed) vertex header in the cache
+ */
+static struct vertex_header *get_vertex( struct draw_context *draw,
+					 unsigned i )
+{
+   unsigned slot = (i + (i>>5)) % VCACHE_SIZE;
+   
+   assert(slot < 32); /* so we don't exceed the bitfield size below */
+
+   /* Cache miss?
+    */
+   if (draw->vcache.idx[slot] != i) {
+
+      /* If slot is in use, use the overflow area:
+       */
+      if (draw->vcache.referenced & (1 << slot)) {
+	 slot = VCACHE_SIZE + draw->vcache.overflow++;
+      }
+
+      assert(slot < Elements(draw->vcache.idx));
+
+      draw->vcache.idx[slot] = i;
+
+      /* Add to vertex shader queue:
+       */
+      assert(draw->vs.queue_nr < VS_QUEUE_LENGTH);
+      draw->vs.queue[draw->vs.queue_nr].dest = draw->vcache.vertex[slot];
+      draw->vs.queue[draw->vs.queue_nr].elt = i;
+      draw->vs.queue_nr++;
+
+      /* Need to set the vertex's edge flag here.  If we're being called
+       * by do_ef_triangle(), that function needs edge flag info!
+       */
+      draw->vcache.vertex[slot]->clipmask = 0;
+      draw->vcache.vertex[slot]->edgeflag = 1; /*XXX use user's edge flag! */
+      draw->vcache.vertex[slot]->pad = 0;
+      draw->vcache.vertex[slot]->vertex_id = UNDEFINED_VERTEX_ID;
+   }
+
+
+   /* primitive flushing may have cleared the bitfield but did not
+    * clear the idx[] array values.  Set the bit now.  This fixes a
+    * bug found when drawing long triangle fans.
+    */
+   draw->vcache.referenced |= (1 << slot);
+   return draw->vcache.vertex[slot];
+}
+
+
+static struct vertex_header *get_uint_elt_vertex( struct draw_context *draw,
+                                                  unsigned i )
+{
+   const unsigned *elts = (const unsigned *) draw->user.elts;
+   return get_vertex( draw, elts[i] );
+}
+
+
+static struct vertex_header *get_ushort_elt_vertex( struct draw_context *draw,
+						    unsigned i )
+{
+   const ushort *elts = (const ushort *) draw->user.elts;
+   return get_vertex( draw, elts[i] );
+}
+
+
+static struct vertex_header *get_ubyte_elt_vertex( struct draw_context *draw,
+                                                   unsigned i )
+{
+   const ubyte *elts = (const ubyte *) draw->user.elts;
+   return get_vertex( draw, elts[i] );
+}
+
+
+void draw_vertex_cache_reset_vertex_ids( struct draw_context *draw )
+{
+   unsigned i;
+
+   for (i = 0; i < Elements(draw->vcache.vertex); i++)
+      draw->vcache.vertex[i]->vertex_id = UNDEFINED_VERTEX_ID;
+}
+
+
+void draw_vertex_cache_unreference( struct draw_context *draw )
+{
+   draw->vcache.referenced = 0;
+   draw->vcache.overflow = 0;
+}
+
+
+int draw_vertex_cache_check_space( struct draw_context *draw,
+				   unsigned nr_verts )
+{
+   if (draw->vcache.overflow + nr_verts < VCACHE_OVERFLOW) {
+      /* The vs queue is sized so that this can never happen:
+       */
+      assert(draw->vs.queue_nr + nr_verts < VS_QUEUE_LENGTH);
+      return TRUE;
+   }
+   else
+      return FALSE;
+}
+
+
+
+/**
+ * Tell the drawing context about the index/element buffer to use
+ * (ala glDrawElements)
+ * If no element buffer is to be used (i.e. glDrawArrays) then this
+ * should be called with eltSize=0 and elements=NULL.
+ *
+ * \param draw  the drawing context
+ * \param eltSize  size of each element (1, 2 or 4 bytes)
+ * \param elements  the element buffer ptr
+ */
+void
+draw_set_mapped_element_buffer( struct draw_context *draw,
+                                unsigned eltSize, void *elements )
+{
+//   draw_statechange( draw );
+
+   /* choose the get_vertex() function to use */
+   switch (eltSize) {
+   case 0:
+      draw->vcache.get_vertex = get_vertex;
+      break;
+   case 1:
+      draw->vcache.get_vertex = get_ubyte_elt_vertex;
+      break;
+   case 2:
+      draw->vcache.get_vertex = get_ushort_elt_vertex;
+      break;
+   case 4:
+      draw->vcache.get_vertex = get_uint_elt_vertex;
+      break;
+   default:
+      assert(0);
+   }
+   draw->user.elts = elements;
+   draw->user.eltSize = eltSize;
+}
+
diff --git a/src/gallium/auxiliary/draw/draw_vertex_fetch.c b/src/gallium/auxiliary/draw/draw_vertex_fetch.c
new file mode 100644
index 00000000000..e13df04605c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex_fetch.c
@@ -0,0 +1,510 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+#include "draw_context.h"
+
+
+#define DRAW_DBG 0
+
+
+/**
+ * Fetch a float[4] vertex attribute from memory, doing format/type
+ * conversion as needed.
+ *
+ * This is probably needed/dupliocated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define FETCH_ATTRIB( NAME, SZ, CVT )			\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   static const float defaults[4] = { 0,0,0,1 };	\
+   int i;						\
+							\
+   for (i = 0; i < SZ; i++) {				\
+      attrib[i] = CVT;					\
+   }							\
+							\
+   for (; i < 4; i++) {					\
+      attrib[i] = defaults[i];				\
+   }							\
+}
+
+#define CVT_64_FLOAT   (float) ((double *) ptr)[i]
+#define CVT_32_FLOAT   ((float *) ptr)[i]
+
+#define CVT_8_USCALED  (float) ((unsigned char *) ptr)[i]
+#define CVT_16_USCALED (float) ((unsigned short *) ptr)[i]
+#define CVT_32_USCALED (float) ((unsigned int *) ptr)[i]
+
+#define CVT_8_SSCALED  (float) ((char *) ptr)[i]
+#define CVT_16_SSCALED (float) ((short *) ptr)[i]
+#define CVT_32_SSCALED (float) ((int *) ptr)[i]
+
+#define CVT_8_UNORM    (float) ((unsigned char *) ptr)[i] / 255.0f
+#define CVT_16_UNORM   (float) ((unsigned short *) ptr)[i] / 65535.0f
+#define CVT_32_UNORM   (float) ((unsigned int *) ptr)[i] / 4294967295.0f
+
+#define CVT_8_SNORM    (float) ((char *) ptr)[i] / 127.0f
+#define CVT_16_SNORM   (float) ((short *) ptr)[i] / 32767.0f
+#define CVT_32_SNORM   (float) ((int *) ptr)[i] / 2147483647.0f
+
+FETCH_ATTRIB( R64G64B64A64_FLOAT,   4, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64B64_FLOAT,      3, CVT_64_FLOAT )
+FETCH_ATTRIB( R64G64_FLOAT,         2, CVT_64_FLOAT )
+FETCH_ATTRIB( R64_FLOAT,            1, CVT_64_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_FLOAT,   4, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32B32_FLOAT,      3, CVT_32_FLOAT )
+FETCH_ATTRIB( R32G32_FLOAT,         2, CVT_32_FLOAT )
+FETCH_ATTRIB( R32_FLOAT,            1, CVT_32_FLOAT )
+
+FETCH_ATTRIB( R32G32B32A32_USCALED, 4, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32B32_USCALED,    3, CVT_32_USCALED )
+FETCH_ATTRIB( R32G32_USCALED,       2, CVT_32_USCALED )
+FETCH_ATTRIB( R32_USCALED,          1, CVT_32_USCALED )
+
+FETCH_ATTRIB( R32G32B32A32_SSCALED, 4, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32B32_SSCALED,    3, CVT_32_SSCALED )
+FETCH_ATTRIB( R32G32_SSCALED,       2, CVT_32_SSCALED )
+FETCH_ATTRIB( R32_SSCALED,          1, CVT_32_SSCALED )
+
+FETCH_ATTRIB( R32G32B32A32_UNORM, 4, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32B32_UNORM,    3, CVT_32_UNORM )
+FETCH_ATTRIB( R32G32_UNORM,       2, CVT_32_UNORM )
+FETCH_ATTRIB( R32_UNORM,          1, CVT_32_UNORM )
+
+FETCH_ATTRIB( R32G32B32A32_SNORM, 4, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32B32_SNORM,    3, CVT_32_SNORM )
+FETCH_ATTRIB( R32G32_SNORM,       2, CVT_32_SNORM )
+FETCH_ATTRIB( R32_SNORM,          1, CVT_32_SNORM )
+
+FETCH_ATTRIB( R16G16B16A16_USCALED, 4, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16B16_USCALED,    3, CVT_16_USCALED )
+FETCH_ATTRIB( R16G16_USCALED,       2, CVT_16_USCALED )
+FETCH_ATTRIB( R16_USCALED,          1, CVT_16_USCALED )
+
+FETCH_ATTRIB( R16G16B16A16_SSCALED, 4, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16B16_SSCALED,    3, CVT_16_SSCALED )
+FETCH_ATTRIB( R16G16_SSCALED,       2, CVT_16_SSCALED )
+FETCH_ATTRIB( R16_SSCALED,          1, CVT_16_SSCALED )
+
+FETCH_ATTRIB( R16G16B16A16_UNORM, 4, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16B16_UNORM,    3, CVT_16_UNORM )
+FETCH_ATTRIB( R16G16_UNORM,       2, CVT_16_UNORM )
+FETCH_ATTRIB( R16_UNORM,          1, CVT_16_UNORM )
+
+FETCH_ATTRIB( R16G16B16A16_SNORM, 4, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16B16_SNORM,    3, CVT_16_SNORM )
+FETCH_ATTRIB( R16G16_SNORM,       2, CVT_16_SNORM )
+FETCH_ATTRIB( R16_SNORM,          1, CVT_16_SNORM )
+
+FETCH_ATTRIB( R8G8B8A8_USCALED,   4, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8B8_USCALED,     3, CVT_8_USCALED )
+FETCH_ATTRIB( R8G8_USCALED,       2, CVT_8_USCALED )
+FETCH_ATTRIB( R8_USCALED,         1, CVT_8_USCALED )
+
+FETCH_ATTRIB( R8G8B8A8_SSCALED,  4, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8B8_SSCALED,    3, CVT_8_SSCALED )
+FETCH_ATTRIB( R8G8_SSCALED,      2, CVT_8_SSCALED )
+FETCH_ATTRIB( R8_SSCALED,        1, CVT_8_SSCALED )
+
+FETCH_ATTRIB( R8G8B8A8_UNORM,  4, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8B8_UNORM,    3, CVT_8_UNORM )
+FETCH_ATTRIB( R8G8_UNORM,      2, CVT_8_UNORM )
+FETCH_ATTRIB( R8_UNORM,        1, CVT_8_UNORM )
+
+FETCH_ATTRIB( R8G8B8A8_SNORM,  4, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8B8_SNORM,    3, CVT_8_SNORM )
+FETCH_ATTRIB( R8G8_SNORM,      2, CVT_8_SNORM )
+FETCH_ATTRIB( R8_SNORM,        1, CVT_8_SNORM )
+
+FETCH_ATTRIB( A8R8G8B8_UNORM,       4, CVT_8_UNORM )
+//FETCH_ATTRIB( R8G8B8A8_UNORM,       4, CVT_8_UNORM )
+
+
+
+static fetch_func get_fetch_func( enum pipe_format format )
+{
+#if 0
+   {
+      char tmp[80];
+      pf_sprint_name(tmp, format);
+      debug_printf("%s: %s\n", __FUNCTION__, tmp);
+   }
+#endif
+
+   switch (format) {
+   case PIPE_FORMAT_R64_FLOAT:
+      return fetch_R64_FLOAT;
+   case PIPE_FORMAT_R64G64_FLOAT:
+      return fetch_R64G64_FLOAT;
+   case PIPE_FORMAT_R64G64B64_FLOAT:
+      return fetch_R64G64B64_FLOAT;
+   case PIPE_FORMAT_R64G64B64A64_FLOAT:
+      return fetch_R64G64B64A64_FLOAT;
+
+   case PIPE_FORMAT_R32_FLOAT:
+      return fetch_R32_FLOAT;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      return fetch_R32G32_FLOAT;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      return fetch_R32G32B32_FLOAT;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      return fetch_R32G32B32A32_FLOAT;
+
+   case PIPE_FORMAT_R32_UNORM:
+      return fetch_R32_UNORM;
+   case PIPE_FORMAT_R32G32_UNORM:
+      return fetch_R32G32_UNORM;
+   case PIPE_FORMAT_R32G32B32_UNORM:
+      return fetch_R32G32B32_UNORM;
+   case PIPE_FORMAT_R32G32B32A32_UNORM:
+      return fetch_R32G32B32A32_UNORM;
+
+   case PIPE_FORMAT_R32_USCALED:
+      return fetch_R32_USCALED;
+   case PIPE_FORMAT_R32G32_USCALED:
+      return fetch_R32G32_USCALED;
+   case PIPE_FORMAT_R32G32B32_USCALED:
+      return fetch_R32G32B32_USCALED;
+   case PIPE_FORMAT_R32G32B32A32_USCALED:
+      return fetch_R32G32B32A32_USCALED;
+
+   case PIPE_FORMAT_R32_SNORM:
+      return fetch_R32_SNORM;
+   case PIPE_FORMAT_R32G32_SNORM:
+      return fetch_R32G32_SNORM;
+   case PIPE_FORMAT_R32G32B32_SNORM:
+      return fetch_R32G32B32_SNORM;
+   case PIPE_FORMAT_R32G32B32A32_SNORM:
+      return fetch_R32G32B32A32_SNORM;
+
+   case PIPE_FORMAT_R32_SSCALED:
+      return fetch_R32_SSCALED;
+   case PIPE_FORMAT_R32G32_SSCALED:
+      return fetch_R32G32_SSCALED;
+   case PIPE_FORMAT_R32G32B32_SSCALED:
+      return fetch_R32G32B32_SSCALED;
+   case PIPE_FORMAT_R32G32B32A32_SSCALED:
+      return fetch_R32G32B32A32_SSCALED;
+
+   case PIPE_FORMAT_R16_UNORM:
+      return fetch_R16_UNORM;
+   case PIPE_FORMAT_R16G16_UNORM:
+      return fetch_R16G16_UNORM;
+   case PIPE_FORMAT_R16G16B16_UNORM:
+      return fetch_R16G16B16_UNORM;
+   case PIPE_FORMAT_R16G16B16A16_UNORM:
+      return fetch_R16G16B16A16_UNORM;
+
+   case PIPE_FORMAT_R16_USCALED:
+      return fetch_R16_USCALED;
+   case PIPE_FORMAT_R16G16_USCALED:
+      return fetch_R16G16_USCALED;
+   case PIPE_FORMAT_R16G16B16_USCALED:
+      return fetch_R16G16B16_USCALED;
+   case PIPE_FORMAT_R16G16B16A16_USCALED:
+      return fetch_R16G16B16A16_USCALED;
+
+   case PIPE_FORMAT_R16_SNORM:
+      return fetch_R16_SNORM;
+   case PIPE_FORMAT_R16G16_SNORM:
+      return fetch_R16G16_SNORM;
+   case PIPE_FORMAT_R16G16B16_SNORM:
+      return fetch_R16G16B16_SNORM;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      return fetch_R16G16B16A16_SNORM;
+
+   case PIPE_FORMAT_R16_SSCALED:
+      return fetch_R16_SSCALED;
+   case PIPE_FORMAT_R16G16_SSCALED:
+      return fetch_R16G16_SSCALED;
+   case PIPE_FORMAT_R16G16B16_SSCALED:
+      return fetch_R16G16B16_SSCALED;
+   case PIPE_FORMAT_R16G16B16A16_SSCALED:
+      return fetch_R16G16B16A16_SSCALED;
+
+   case PIPE_FORMAT_R8_UNORM:
+      return fetch_R8_UNORM;
+   case PIPE_FORMAT_R8G8_UNORM:
+      return fetch_R8G8_UNORM;
+   case PIPE_FORMAT_R8G8B8_UNORM:
+      return fetch_R8G8B8_UNORM;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      return fetch_R8G8B8A8_UNORM;
+
+   case PIPE_FORMAT_R8_USCALED:
+      return fetch_R8_USCALED;
+   case PIPE_FORMAT_R8G8_USCALED:
+      return fetch_R8G8_USCALED;
+   case PIPE_FORMAT_R8G8B8_USCALED:
+      return fetch_R8G8B8_USCALED;
+   case PIPE_FORMAT_R8G8B8A8_USCALED:
+      return fetch_R8G8B8A8_USCALED;
+
+   case PIPE_FORMAT_R8_SNORM:
+      return fetch_R8_SNORM;
+   case PIPE_FORMAT_R8G8_SNORM:
+      return fetch_R8G8_SNORM;
+   case PIPE_FORMAT_R8G8B8_SNORM:
+      return fetch_R8G8B8_SNORM;
+   case PIPE_FORMAT_R8G8B8A8_SNORM:
+      return fetch_R8G8B8A8_SNORM;
+
+   case PIPE_FORMAT_R8_SSCALED:
+      return fetch_R8_SSCALED;
+   case PIPE_FORMAT_R8G8_SSCALED:
+      return fetch_R8G8_SSCALED;
+   case PIPE_FORMAT_R8G8B8_SSCALED:
+      return fetch_R8G8B8_SSCALED;
+   case PIPE_FORMAT_R8G8B8A8_SSCALED:
+      return fetch_R8G8B8A8_SSCALED;
+
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      return fetch_A8R8G8B8_UNORM;
+
+   case 0:
+      return NULL;		/* not sure why this is needed */
+
+   default:
+      assert(0);
+      return NULL;
+   }
+}
+
+
+static void 
+transpose_4x4( float *out, const float *in )
+{
+   /* This can be achieved in 12 sse instructions, plus the final
+    * stores I guess.  This is probably a bit more than that - maybe
+    * 32 or so?
+    */
+   out[0] = in[0];  out[1] = in[4];  out[2] = in[8];   out[3] = in[12];
+   out[4] = in[1];  out[5] = in[5];  out[6] = in[9];   out[7] = in[13];
+   out[8] = in[2];  out[9] = in[6];  out[10] = in[10]; out[11] = in[14];
+   out[12] = in[3]; out[13] = in[7]; out[14] = in[11]; out[15] = in[15];
+}
+
+
+
+static void fetch_xyz_rgb( struct draw_context *draw,
+			   struct tgsi_exec_machine *machine,
+			   const unsigned *elts,
+			   unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   assert(count <= 4);
+
+//   debug_printf("%s\n", __FUNCTION__);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+
+
+static void fetch_xyz_rgb_st( struct draw_context *draw,
+			      struct tgsi_exec_machine *machine,
+			      const unsigned *elts,
+			      unsigned count )
+{
+   const unsigned *pitch   = draw->vertex_fetch.pitch;
+   const ubyte **src       = draw->vertex_fetch.src_ptr;
+   int i;
+
+   assert(count <= 4);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+
+   for (i = 0; i < 4; i++) {
+      {
+	 const float *in = (const float *)(src[0] + elts[i] * pitch[0]);
+	 float *out = &machine->Inputs[0].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[1] + elts[i] * pitch[1]);
+	 float *out = &machine->Inputs[1].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = in[2];
+ 	 out[12] = 1.0f;
+      }
+
+      {
+	 const float *in = (const float *)(src[2] + elts[i] * pitch[2]);
+	 float *out = &machine->Inputs[2].xyzw[0].f[i];
+	 out[0] = in[0];
+	 out[4] = in[1];
+	 out[8] = 0.0f;
+ 	 out[12] = 1.0f;
+      }
+   }
+}
+
+
+
+
+/**
+ * Fetch vertex attributes for 'count' vertices.
+ */
+static void generic_vertex_fetch( struct draw_context *draw,
+				  struct tgsi_exec_machine *machine,
+				  const unsigned *elts,
+				  unsigned count )
+{
+   unsigned nr_attrs = draw->vertex_fetch.nr_attrs;
+   unsigned attr;
+
+   assert(count <= 4);
+
+//   debug_printf("%s %d\n", __FUNCTION__, count);
+
+   /* loop over vertex attributes (vertex shader inputs)
+    */
+   for (attr = 0; attr < nr_attrs; attr++) {
+
+      const unsigned pitch   = draw->vertex_fetch.pitch[attr];
+      const ubyte *src = draw->vertex_fetch.src_ptr[attr];
+      const fetch_func fetch = draw->vertex_fetch.fetch[attr];
+      unsigned i;
+      float p[4][4];
+
+
+      /* Fetch four attributes for four vertices.  
+       * 
+       * Could fetch directly into AOS format, but this is meant to be
+       * a prototype for an sse implementation, which would have
+       * difficulties doing that.
+       */
+      for (i = 0; i < count; i++) 
+	 fetch( src + elts[i] * pitch, p[i] );
+
+      /* Be nice and zero out any missing vertices: 
+       */
+      for ( ; i < 4; i++) 
+	 p[i][0] = p[i][1] = p[i][2] = p[i][3] = 0;
+      
+      /* Transpose/swizzle into sse-friendly format.  Currently
+       * assuming that all vertex shader inputs are float[4], but this
+       * isn't true -- if the vertex shader only wants tex0.xy, we
+       * could optimize for that.
+       *
+       * To do so fully without codegen would probably require an
+       * excessive number of fetch functions, but we could at least
+       * minimize the transpose step:
+       */
+      transpose_4x4( (float *)&machine->Inputs[attr].xyzw[0].f[0], (float *)p );
+   }
+}
+
+
+			       
+void draw_update_vertex_fetch( struct draw_context *draw )
+{
+   unsigned nr_attrs, i;
+
+//   debug_printf("%s\n", __FUNCTION__);
+   
+   /* this may happend during context init */
+   if (!draw->vertex_shader)
+      return;
+
+   nr_attrs = draw->vertex_shader->state->num_inputs;
+
+   for (i = 0; i < nr_attrs; i++) {
+      unsigned buf = draw->vertex_element[i].vertex_buffer_index;
+      enum pipe_format format  = draw->vertex_element[i].src_format;
+
+      draw->vertex_fetch.src_ptr[i] = (const ubyte *) draw->user.vbuffer[buf] + 
+						       draw->vertex_buffer[buf].buffer_offset + 
+						       draw->vertex_element[i].src_offset;
+
+      draw->vertex_fetch.pitch[i] = draw->vertex_buffer[buf].pitch;
+      draw->vertex_fetch.fetch[i] = get_fetch_func( format );
+   }
+
+   draw->vertex_fetch.nr_attrs = nr_attrs;
+
+   draw->vertex_fetch.fetch_func = generic_vertex_fetch;
+
+   switch (nr_attrs) {
+   case 2:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb;
+      break;
+   case 3:
+      if (draw->vertex_element[0].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[1].src_format == PIPE_FORMAT_R32G32B32_FLOAT &&
+	  draw->vertex_element[2].src_format == PIPE_FORMAT_R32G32_FLOAT)
+	 draw->vertex_fetch.fetch_func = fetch_xyz_rgb_st;
+      break;
+   default:
+      break;
+   }
+
+}
diff --git a/src/gallium/auxiliary/draw/draw_vertex_shader.c b/src/gallium/auxiliary/draw/draw_vertex_shader.c
new file mode 100644
index 00000000000..f68f6e32440
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vertex_shader.c
@@ -0,0 +1,126 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  *   Brian Paul
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+#include "draw_context.h"
+#include "draw_vs.h"
+
+
+
+/**
+ * Run the vertex shader on all vertices in the vertex queue.
+ * Called by the draw module when the vertx cache needs to be flushed.
+ */
+void
+draw_vertex_shader_queue_flush(struct draw_context *draw)
+{
+   struct draw_vertex_shader *shader = draw->vertex_shader;
+   unsigned i;
+
+   assert(draw->vs.queue_nr != 0);
+
+   /* XXX: do this on statechange: 
+    */
+   shader->prepare( shader, draw );
+
+//   fprintf(stderr, " q(%d) ", draw->vs.queue_nr );
+
+   /* run vertex shader on vertex cache entries, four per invokation */
+   for (i = 0; i < draw->vs.queue_nr; i += 4) {
+      struct vertex_header *dests[4];
+      unsigned elts[4];
+      int j, n = MIN2(4, draw->vs.queue_nr - i);
+
+      for (j = 0; j < n; j++) {
+         elts[j] = draw->vs.queue[i + j].elt;
+         dests[j] = draw->vs.queue[i + j].dest;
+      }
+
+      for ( ; j < 4; j++) {
+	 elts[j] = elts[0];
+	 dests[j] = dests[0];
+      }
+
+      assert(n > 0);
+      assert(n <= 4);
+
+      shader->run(shader, draw, elts, n, dests);
+   }
+
+   draw->vs.queue_nr = 0;
+}
+
+
+struct draw_vertex_shader *
+draw_create_vertex_shader(struct draw_context *draw,
+                          const struct pipe_shader_state *shader)
+{
+   struct draw_vertex_shader *vs;
+
+   vs = draw_create_vs_llvm( draw, shader );
+   if (vs)
+      return vs;
+
+   vs = draw_create_vs_sse( draw, shader );
+   if (vs)
+      return vs;
+
+   vs = draw_create_vs_exec( draw, shader );
+   assert(vs);
+   return vs;
+}
+
+
+void
+draw_bind_vertex_shader(struct draw_context *draw,
+                        struct draw_vertex_shader *dvs)
+{
+   draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
+
+   draw->vertex_shader = dvs;
+   draw->num_vs_outputs = dvs->state->num_outputs;
+
+   tgsi_exec_machine_init(&draw->machine);
+
+   dvs->prepare( dvs, draw );
+}
+
+
+void
+draw_delete_vertex_shader(struct draw_context *draw,
+                          struct draw_vertex_shader *dvs)
+{
+   dvs->delete( dvs );
+}
diff --git a/src/gallium/auxiliary/draw/draw_vf.c b/src/gallium/auxiliary/draw/draw_vf.c
new file mode 100644
index 00000000000..dc3a5ecd219
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf.c
@@ -0,0 +1,428 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <[email protected]>
+ */
+
+
+#include <stddef.h>
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+#define DRAW_VF_DBG 0
+
+
+/* TODO: remove this */
+extern void 
+_mesa_exec_free( void *addr );
+
+
+static boolean match_fastpath( struct draw_vertex_fetch *vf,
+				 const struct draw_vf_fastpath *fp)
+{
+   unsigned j;
+
+   if (vf->attr_count != fp->attr_count) 
+      return FALSE;
+
+   for (j = 0; j < vf->attr_count; j++) 
+      if (vf->attr[j].format != fp->attr[j].format ||
+	  vf->attr[j].inputsize != fp->attr[j].size ||
+	  vf->attr[j].vertoffset != fp->attr[j].offset) 
+	 return FALSE;
+      
+   if (fp->match_strides) {
+      if (vf->vertex_stride != fp->vertex_stride)
+	 return FALSE;
+
+      for (j = 0; j < vf->attr_count; j++) 
+	 if (vf->attr[j].inputstride != fp->attr[j].stride) 
+	    return FALSE;
+   }
+   
+   return TRUE;
+}
+
+static boolean search_fastpath_emit( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp = vf->fastpath;
+
+   for ( ; fp ; fp = fp->next) {
+      if (match_fastpath(vf, fp)) {
+         vf->emit = fp->func;
+	 return TRUE;
+      }
+   }
+
+   return FALSE;
+}
+
+void draw_vf_register_fastpath( struct draw_vertex_fetch *vf,
+			     boolean match_strides )
+{
+   struct draw_vf_fastpath *fastpath = CALLOC_STRUCT(draw_vf_fastpath);
+   unsigned i;
+
+   fastpath->vertex_stride = vf->vertex_stride;
+   fastpath->attr_count = vf->attr_count;
+   fastpath->match_strides = match_strides;
+   fastpath->func = vf->emit;
+   fastpath->attr = (struct draw_vf_attr_type *)
+      MALLOC(vf->attr_count * sizeof(fastpath->attr[0]));
+
+   for (i = 0; i < vf->attr_count; i++) {
+      fastpath->attr[i].format = vf->attr[i].format;
+      fastpath->attr[i].stride = vf->attr[i].inputstride;
+      fastpath->attr[i].size = vf->attr[i].inputsize;
+      fastpath->attr[i].offset = vf->attr[i].vertoffset;
+   }
+
+   fastpath->next = vf->fastpath;
+   vf->fastpath = fastpath;
+}
+
+
+
+
+/***********************************************************************
+ * Build codegen functions or return generic ones:
+ */
+static void choose_emit_func( struct draw_vertex_fetch *vf, 
+			      unsigned count, 
+			      uint8_t *dest)
+{
+   vf->emit = NULL;
+   
+   /* Does this match an existing (hardwired, codegen or known-bad)
+    * fastpath?
+    */
+   if (search_fastpath_emit(vf)) {
+      /* Use this result.  If it is null, then it is already known
+       * that the current state will fail for codegen and there is no
+       * point trying again.
+       */
+   }
+   else if (vf->codegen_emit) {
+      vf->codegen_emit( vf );
+   }
+
+   if (!vf->emit) {
+      draw_vf_generate_hardwired_emit(vf);
+   }
+
+   /* Otherwise use the generic version:
+    */
+   if (!vf->emit)
+      vf->emit = draw_vf_generic_emit;
+
+   vf->emit( vf, count, dest );
+}
+
+
+
+
+
+/***********************************************************************
+ * Public entrypoints, mostly dispatch to the above:
+ */
+
+
+
+static unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf, 
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride )
+{
+   unsigned offset = 0;
+   unsigned i, j;
+
+   assert(nr < PIPE_ATTRIB_MAX);
+
+   for (j = 0, i = 0; i < nr; i++) {
+      const unsigned format = map[i].format;
+      if (format == DRAW_EMIT_PAD) {
+#if (DRAW_VF_DBG)
+	    debug_printf("%d: pad %d, offset %d\n", i,  
+			 map[i].offset, offset);  
+#endif
+
+	 offset += map[i].offset;
+
+      }
+      else {
+	 vf->attr[j].attrib = map[i].attrib;
+	 vf->attr[j].format = format;
+	 vf->attr[j].insert = draw_vf_format_info[format].insert;
+	 vf->attr[j].vertattrsize = draw_vf_format_info[format].attrsize;
+	 vf->attr[j].vertoffset = offset;
+	 vf->attr[j].isconst = draw_vf_format_info[format].isconst;
+	 if(vf->attr[j].isconst)
+	    memcpy(vf->attr[j].data, &map[i].data, vf->attr[j].vertattrsize);
+	 
+#if (DRAW_VF_DBG)
+	    debug_printf("%d: %s, offset %d\n", i,  
+			 draw_vf_format_info[format].name,
+			 vf->attr[j].vertoffset);   
+#endif
+
+	 offset += draw_vf_format_info[format].attrsize;
+	 j++;
+      }
+   }
+
+   vf->attr_count = j;
+   vf->vertex_stride = vertex_stride ? vertex_stride : offset;
+   vf->emit = choose_emit_func;
+
+   assert(vf->vertex_stride >= offset);
+   return vf->vertex_stride;
+}
+
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size )
+{
+   unsigned i, j, k;
+   struct draw_vf_attr *a = vf->attr;
+   struct draw_vf_attr_map attrs[PIPE_MAX_SHADER_INPUTS];
+   unsigned count = 0;  /* for debug/sanity */
+   unsigned nr_attrs = 0;
+   
+   for (i = 0; i < vinfo->num_attribs; i++) {
+      j = vinfo->src_index[i];
+      switch (vinfo->emit[i]) {
+      case EMIT_OMIT:
+         /* no-op */
+         break;
+      case EMIT_ALL: {
+         /* just copy the whole vertex as-is to the vbuf */
+	 unsigned s = vinfo->size;
+         assert(i == 0);
+         assert(j == 0);
+         /* copy the vertex header */
+         /* XXX: we actually don't copy the header, just pad it */
+	 attrs[nr_attrs].attrib = 0;
+	 attrs[nr_attrs].format = DRAW_EMIT_PAD;
+	 attrs[nr_attrs].offset = offsetof(struct vertex_header, data);
+	 s -= offsetof(struct vertex_header, data)/4;
+         count += offsetof(struct vertex_header, data)/4;
+	 nr_attrs++;
+	 /* copy the vertex data */
+         for(k = 0; k < (s & ~0x3); k += 4) {
+      	    attrs[nr_attrs].attrib = k/4;
+      	    attrs[nr_attrs].format = DRAW_EMIT_4F;
+      	    attrs[nr_attrs].offset = 0;
+      	    nr_attrs++;
+            count += 4;
+         }
+         /* tail */
+         /* XXX: actually, this shouldn't be needed */
+ 	 attrs[nr_attrs].attrib = k/4;
+  	 attrs[nr_attrs].offset = 0;
+         switch(s & 0x3) {
+         case 0:
+            break;
+         case 1:
+      	    attrs[nr_attrs].format = DRAW_EMIT_1F;
+      	    nr_attrs++;
+            count += 1;
+            break;
+         case 2:
+      	    attrs[nr_attrs].format = DRAW_EMIT_2F;
+      	    nr_attrs++;
+            count += 2;
+            break;
+         case 3:
+      	    attrs[nr_attrs].format = DRAW_EMIT_3F;
+      	    nr_attrs++;
+            count += 3;
+            break;
+         }
+         break;
+      }
+      case EMIT_1F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_1F_PSIZE:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_1F_CONST;
+	 attrs[nr_attrs].offset = 0;
+	 attrs[nr_attrs].data.f[0] = point_size;
+	 nr_attrs++;
+         count++;
+         break;
+      case EMIT_2F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_2F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 2;
+         break;
+      case EMIT_3F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_3F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 3;
+         break;
+      case EMIT_4F:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4F;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 4;
+         break;
+      case EMIT_4UB:
+	 attrs[nr_attrs].attrib = j;
+	 attrs[nr_attrs].format = DRAW_EMIT_4UB_4F_BGRA;
+	 attrs[nr_attrs].offset = 0;
+	 nr_attrs++;
+         count += 1;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   
+   assert(count == vinfo->size);  
+   
+   draw_vf_set_vertex_attributes(vf, 
+                                 attrs, 
+                                 nr_attrs, 
+                                 vinfo->size * sizeof(float) );
+
+   for (j = 0; j < vf->attr_count; j++) {
+      a[j].inputsize = 4;
+      a[j].do_insert = a[j].insert[4 - 1];
+      if(a[j].isconst) {
+	 a[j].inputptr = a[j].data;
+	 a[j].inputstride = 0;
+      }
+   }
+}
+
+
+#if 0
+/* Set attribute pointers, adjusted for start position:
+ */
+void draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const sources[],
+		     unsigned start )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      const GLvector4f *vptr = sources[a[j].attrib];
+      
+      if ((a[j].inputstride != vptr->stride) ||
+	  (a[j].inputsize != vptr->size))
+	 vf->emit = choose_emit_func;
+      
+      a[j].inputstride = vptr->stride;
+      a[j].inputsize = vptr->size;
+      a[j].do_insert = a[j].insert[vptr->size - 1]; 
+      a[j].inputptr = ((uint8_t *)vptr->data) + start * vptr->stride;
+   }
+}
+#endif
+
+
+/**
+ * Emit a vertex to dest.  
+ */
+void draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                          struct vertex_header *vertex,
+                          void *dest )
+{
+   struct draw_vf_attr *a = vf->attr;
+   unsigned j;
+   
+   for (j = 0; j < vf->attr_count; j++) {
+      if (!a[j].isconst) {
+	 a[j].inputptr = (uint8_t *)&vertex->data[a[j].attrib][0];
+	 a[j].inputstride = 0; /* XXX: one-vertex-max ATM */
+      }
+   }
+   
+   vf->emit( vf, 1, (uint8_t*) dest );
+}
+
+
+
+struct draw_vertex_fetch *draw_vf_create( void )
+{
+   struct draw_vertex_fetch *vf = CALLOC_STRUCT(draw_vertex_fetch);
+   unsigned i;
+
+   for (i = 0; i < PIPE_ATTRIB_MAX; i++)
+      vf->attr[i].vf = vf;
+
+   vf->identity[0] = 0.0;
+   vf->identity[1] = 0.0;
+   vf->identity[2] = 0.0;
+   vf->identity[3] = 1.0;
+
+   vf->codegen_emit = NULL;
+
+#ifdef USE_SSE_ASM
+   if (!GETENV("GALLIUM_NO_CODEGEN"))
+      vf->codegen_emit = draw_vf_generate_sse_emit;
+#endif
+
+   return vf;
+}
+
+
+void draw_vf_destroy( struct draw_vertex_fetch *vf )
+{
+   struct draw_vf_fastpath *fp, *tmp;
+
+   for (fp = vf->fastpath ; fp ; fp = tmp) {
+      tmp = fp->next;
+      FREE(fp->attr);
+
+      /* KW: At the moment, fp->func is constrained to be allocated by
+       * _mesa_exec_alloc(), as the hardwired fastpaths in
+       * t_vertex_generic.c are handled specially.  It would be nice
+       * to unify them, but this probably won't change until this
+       * module gets another overhaul.
+       */
+      //_mesa_exec_free((void *) fp->func);
+      FREE(fp);
+   }
+   
+   vf->fastpath = NULL;
+   FREE(vf);
+}
diff --git a/src/gallium/auxiliary/draw/draw_vf.h b/src/gallium/auxiliary/draw/draw_vf.h
new file mode 100644
index 00000000000..011c8f0ff1c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf.h
@@ -0,0 +1,236 @@
+/*
+ * Copyright 2008 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * Vertex fetch/store/convert code.  This functionality is used in two places:
+ * 1. Vertex fetch/convert - to grab vertex data from incoming vertex
+ *    arrays and convert to format needed by vertex shaders.
+ * 2. Vertex store/emit - to convert simple float[][4] vertex attributes
+ *    (which is the organization used throughout the draw/prim pipeline) to
+ *    hardware-specific formats and emit into hardware vertex buffers.
+ *
+ *
+ * Authors:
+ *    Keith Whitwell <[email protected]>
+ */
+
+#ifndef DRAW_VF_H
+#define DRAW_VF_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+
+#include "draw_vertex.h"
+#include "draw_private.h" /* for vertex_header */
+
+
+enum draw_vf_attr_format {
+   DRAW_EMIT_1F,
+   DRAW_EMIT_2F,
+   DRAW_EMIT_3F,
+   DRAW_EMIT_4F,
+   DRAW_EMIT_3F_XYW,			/**< for projective texture */
+   DRAW_EMIT_1UB_1F,			/**< for fog coordinate */
+   DRAW_EMIT_3UB_3F_RGB,		/**< for specular color */
+   DRAW_EMIT_3UB_3F_BGR,		/**< for specular color */
+   DRAW_EMIT_4UB_4F_RGBA,		/**< for color */
+   DRAW_EMIT_4UB_4F_BGRA,		/**< for color */
+   DRAW_EMIT_4UB_4F_ARGB,		/**< for color */
+   DRAW_EMIT_4UB_4F_ABGR,		/**< for color */
+   DRAW_EMIT_1F_CONST,
+   DRAW_EMIT_2F_CONST,
+   DRAW_EMIT_3F_CONST,
+   DRAW_EMIT_4F_CONST,
+   DRAW_EMIT_PAD,			/**< leave a hole of 'offset' bytes */
+   DRAW_EMIT_MAX
+};
+
+struct draw_vf_attr_map 
+{
+   /** Input attribute number */
+   unsigned attrib;
+   
+   enum draw_vf_attr_format format;
+   
+   unsigned offset;
+   
+   /** 
+    * Constant data for DRAW_EMIT_*_CONST 
+    */
+   union {
+      uint8_t ub[4];
+      float f[4];
+   } data;
+};
+
+struct draw_vertex_fetch;
+
+
+
+#if 0
+unsigned 
+draw_vf_set_vertex_attributes( struct draw_vertex_fetch *vf,
+                               const struct draw_vf_attr_map *map,
+                               unsigned nr, 
+                               unsigned vertex_stride );
+#endif
+
+void draw_vf_set_vertex_info( struct draw_vertex_fetch *vf, 
+                              const struct vertex_info *vinfo,
+                              float point_size );
+
+#if 0
+void 
+draw_vf_set_sources( struct draw_vertex_fetch *vf,
+		     GLvector4f * const attrib[],
+		     unsigned start );
+#endif
+
+void 
+draw_vf_emit_vertex( struct draw_vertex_fetch *vf,
+                     struct vertex_header *vertex,
+                     void *dest );
+
+struct draw_vertex_fetch *
+draw_vf_create( void );
+
+void 
+draw_vf_destroy( struct draw_vertex_fetch *vf );
+
+
+
+/***********************************************************************
+ * Internal functions and structs:
+ */
+
+struct draw_vf_attr;
+
+typedef void (*draw_vf_extract_func)( const struct draw_vf_attr *a, 
+				      float *out, 
+				      const uint8_t *v );
+
+typedef void (*draw_vf_insert_func)( const struct draw_vf_attr *a, 
+				     uint8_t *v, 
+				     const float *in );
+
+typedef void (*draw_vf_emit_func)( struct draw_vertex_fetch *vf,
+      				   unsigned count, 
+      				   uint8_t *dest );
+
+
+
+/**
+ * Describes how to convert/move a vertex attribute from a vertex
+ * array to a vertex structure.
+ */
+struct draw_vf_attr
+{
+   struct draw_vertex_fetch *vf;
+
+   unsigned format;
+   unsigned inputsize;
+   unsigned inputstride;
+   unsigned vertoffset;      /**< position of the attrib in the vertex struct */
+   
+   boolean isconst;              /**< read from const data below */
+   uint8_t data[16];
+
+   unsigned attrib;          /**< which vertex attrib (0=position, etc) */
+   unsigned vertattrsize;    /**< size of the attribute in bytes */
+
+   uint8_t *inputptr;
+   const draw_vf_insert_func *insert;
+   draw_vf_insert_func do_insert;
+   draw_vf_extract_func extract;
+};
+
+struct draw_vertex_fetch
+{
+   struct draw_vf_attr attr[PIPE_ATTRIB_MAX];
+   unsigned attr_count;
+   unsigned vertex_stride;
+
+   draw_vf_emit_func emit;
+
+   /* Parameters and constants for codegen:
+    */
+   float identity[4];
+
+   struct draw_vf_fastpath *fastpath;
+   
+   void (*codegen_emit)( struct draw_vertex_fetch *vf );
+};
+
+
+struct draw_vf_attr_type {
+   unsigned format;
+   unsigned size;
+   unsigned stride;
+   unsigned offset;
+};
+
+/** XXX this could be moved into draw_vf.c */
+struct draw_vf_fastpath {
+   unsigned vertex_stride;
+   unsigned attr_count;
+   boolean match_strides;
+
+   struct draw_vf_attr_type *attr;
+
+   draw_vf_emit_func func;
+   struct draw_vf_fastpath *next;
+};
+
+
+void 
+draw_vf_register_fastpath( struct draw_vertex_fetch *vtx,
+                           boolean match_strides );
+
+void 
+draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+                      unsigned count,
+                      uint8_t *v );
+
+void 
+draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf );
+
+void 
+draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf );
+
+
+/** XXX this type and function could probably be moved into draw_vf.c */
+struct draw_vf_format_info {
+   const char *name;
+   draw_vf_insert_func insert[4];
+   const unsigned attrsize;
+   const boolean isconst;
+};
+
+extern const struct draw_vf_format_info 
+draw_vf_format_info[DRAW_EMIT_MAX];
+
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_vf_generic.c b/src/gallium/auxiliary/draw/draw_vf_generic.c
new file mode 100644
index 00000000000..7a60a9db9c3
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf_generic.c
@@ -0,0 +1,585 @@
+
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <[email protected]>
+ */
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+
+#include "draw_vf.h"
+
+
+
+static INLINE void insert_4f_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = in[3];
+}
+
+static INLINE void insert_4f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_4f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+   out[3] = 1;
+}
+
+static INLINE void insert_3f_xyw_4( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[3];
+}
+
+static INLINE void insert_3f_xyw_err( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+   assert(0);
+}
+
+static INLINE void insert_3f_3( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = in[2];
+}
+
+static INLINE void insert_3f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+   out[2] = 0;
+}
+
+static INLINE void insert_3f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+   out[2] = 0;
+}
+
+
+static INLINE void insert_2f_2( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = in[1];
+}
+
+static INLINE void insert_2f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+   
+   out[0] = in[0];
+   out[1] = 0;
+}
+
+static INLINE void insert_1f_1( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   float *out = (float *)(v);
+   (void) a;
+
+   out[0] = in[0];
+}
+
+static INLINE void insert_null( const struct draw_vf_attr *a, uint8_t *v, const float *in )
+{
+   (void) a; (void) v; (void) in;
+}
+
+static INLINE void insert_4ub_4f_rgba_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_rgba_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_rgba_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[3]);
+}
+
+static INLINE void insert_4ub_4f_bgra_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_bgra_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+   v[3] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_argb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_argb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[0]);
+   v[2] = 0x00;
+   v[3] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_4( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[3]);
+}
+
+static INLINE void insert_4ub_4f_abgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[2]);
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[1]);
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_4ub_4f_abgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					 const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[3], in[0]);
+   v[2] = 0x00;
+   v[1] = 0x00;
+   v[0] = 0xff;
+}
+
+static INLINE void insert_3ub_3f_rgb_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[2]);
+}
+
+static INLINE void insert_3ub_3f_rgb_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_rgb_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+   v[1] = 0;
+   v[2] = 0;
+}
+
+static INLINE void insert_3ub_3f_bgr_3( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[2]);
+}
+
+static INLINE void insert_3ub_3f_bgr_2( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   UNCLAMPED_FLOAT_TO_UBYTE(v[1], in[1]);
+   v[0] = 0;
+}
+
+static INLINE void insert_3ub_3f_bgr_1( const struct draw_vf_attr *a, uint8_t *v, 
+					const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[2], in[0]);
+   v[1] = 0;
+   v[0] = 0;
+}
+
+
+static INLINE void insert_1ub_1f_1( const struct draw_vf_attr *a, uint8_t *v, 
+				    const float *in )
+{
+   (void) a;
+   UNCLAMPED_FLOAT_TO_UBYTE(v[0], in[0]);
+}
+
+
+const struct draw_vf_format_info draw_vf_format_info[DRAW_EMIT_MAX] = 
+{
+   { "1f",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), FALSE },
+
+   { "2f",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), FALSE },
+
+   { "3f",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), FALSE },
+
+   { "4f",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), FALSE },
+
+   { "3f_xyw",
+     { insert_3f_xyw_err, insert_3f_xyw_err, insert_3f_xyw_err, 
+       insert_3f_xyw_4 },
+     3 * sizeof(float), FALSE },
+
+   { "1ub_1f",
+     { insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1, insert_1ub_1f_1 },
+     sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_rgb",
+     { insert_3ub_3f_rgb_1, insert_3ub_3f_rgb_2, insert_3ub_3f_rgb_3,
+       insert_3ub_3f_rgb_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "3ub_3f_bgr",
+     { insert_3ub_3f_bgr_1, insert_3ub_3f_bgr_2, insert_3ub_3f_bgr_3,
+       insert_3ub_3f_bgr_3 },
+     3 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_rgba",
+     { insert_4ub_4f_rgba_1, insert_4ub_4f_rgba_2, insert_4ub_4f_rgba_3, 
+       insert_4ub_4f_rgba_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_bgra",
+     { insert_4ub_4f_bgra_1, insert_4ub_4f_bgra_2, insert_4ub_4f_bgra_3,
+       insert_4ub_4f_bgra_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_argb",
+     { insert_4ub_4f_argb_1, insert_4ub_4f_argb_2, insert_4ub_4f_argb_3,
+       insert_4ub_4f_argb_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "4ub_4f_abgr",
+     { insert_4ub_4f_abgr_1, insert_4ub_4f_abgr_2, insert_4ub_4f_abgr_3,
+       insert_4ub_4f_abgr_4 },
+     4 * sizeof(uint8_t), FALSE },
+
+   { "1f_const",
+     { insert_1f_1, insert_1f_1, insert_1f_1, insert_1f_1 },
+     sizeof(float), TRUE },
+   
+   { "2f_const",
+     { insert_2f_1, insert_2f_2, insert_2f_2, insert_2f_2 },
+     2 * sizeof(float), TRUE },
+   
+   { "3f_const",
+     { insert_3f_1, insert_3f_2, insert_3f_3, insert_3f_3 },
+     3 * sizeof(float), TRUE },
+   
+   { "4f_const",
+     { insert_4f_1, insert_4f_2, insert_4f_3, insert_4f_4 },
+     4 * sizeof(float), TRUE },
+
+   { "pad",
+     { NULL, NULL, NULL, NULL },
+     0, FALSE },
+
+};
+
+
+
+    
+/***********************************************************************
+ * Hardwired fastpaths for emitting whole vertices or groups of
+ * vertices
+ */
+#define EMIT5(NR, F0, F1, F2, F3, F4, NAME)				\
+static void NAME( struct draw_vertex_fetch *vf,				\
+		  unsigned count,						\
+		  uint8_t *v )						\
+{									\
+   struct draw_vf_attr *a = vf->attr;				\
+   unsigned i;								\
+									\
+   for (i = 0 ; i < count ; i++, v += vf->vertex_stride) {		\
+      if (NR > 0) {							\
+	 F0( &a[0], v + a[0].vertoffset, (float *)a[0].inputptr );	\
+	 a[0].inputptr += a[0].inputstride;				\
+      }									\
+      									\
+      if (NR > 1) {							\
+	 F1( &a[1], v + a[1].vertoffset, (float *)a[1].inputptr );	\
+	 a[1].inputptr += a[1].inputstride;				\
+      }									\
+      									\
+      if (NR > 2) {							\
+	 F2( &a[2], v + a[2].vertoffset, (float *)a[2].inputptr );	\
+	 a[2].inputptr += a[2].inputstride;				\
+      }									\
+      									\
+      if (NR > 3) {							\
+	 F3( &a[3], v + a[3].vertoffset, (float *)a[3].inputptr );	\
+	 a[3].inputptr += a[3].inputstride;				\
+      }									\
+									\
+      if (NR > 4) {							\
+	 F4( &a[4], v + a[4].vertoffset, (float *)a[4].inputptr );	\
+	 a[4].inputptr += a[4].inputstride;				\
+      }									\
+   }									\
+}
+
+   
+#define EMIT2(F0, F1, NAME) EMIT5(2, F0, F1, insert_null, \
+				  insert_null, insert_null, NAME)
+
+#define EMIT3(F0, F1, F2, NAME) EMIT5(3, F0, F1, F2, insert_null, \
+				      insert_null, NAME)
+   
+#define EMIT4(F0, F1, F2, F3, NAME) EMIT5(4, F0, F1, F2, F3, \
+				          insert_null, NAME)
+   
+
+EMIT2(insert_3f_3, insert_4ub_4f_rgba_4, emit_xyz3_rgba4)
+
+EMIT3(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, emit_xyzw4_rgba4_st2)
+
+EMIT4(insert_4f_4, insert_4ub_4f_rgba_4, insert_2f_2, insert_2f_2, emit_xyzw4_rgba4_st2_st2)
+
+
+/* Use the codegen paths to select one of a number of hardwired
+ * fastpaths.
+ */
+void draw_vf_generate_hardwired_emit( struct draw_vertex_fetch *vf )
+{
+   draw_vf_emit_func func = NULL;
+
+   /* Does it fit a hardwired fastpath?  Help! this is growing out of
+    * control!
+    */
+   switch (vf->attr_count) {
+   case 2:
+      if (vf->attr[0].do_insert == insert_3f_3 &&
+	  vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+ 	 func = emit_xyz3_rgba4; 
+      }
+      break;
+   case 3:
+      if (vf->attr[2].do_insert == insert_2f_2) {
+	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+	    if (vf->attr[0].do_insert == insert_4f_4) 
+	       func = emit_xyzw4_rgba4_st2;
+	 }
+      }
+      break;
+   case 4:
+      if (vf->attr[2].do_insert == insert_2f_2 &&
+	  vf->attr[3].do_insert == insert_2f_2) {
+	 if (vf->attr[1].do_insert == insert_4ub_4f_rgba_4) {
+	    if (vf->attr[0].do_insert == insert_4f_4) 
+	       func = emit_xyzw4_rgba4_st2_st2;
+	 }
+      }
+      break;
+   }
+
+   vf->emit = func;
+}
+
+/***********************************************************************
+ * Generic (non-codegen) functions for whole vertices or groups of
+ * vertices
+ */
+
+void draw_vf_generic_emit( struct draw_vertex_fetch *vf,
+		      unsigned count,
+		      uint8_t *v )
+{
+   struct draw_vf_attr *a = vf->attr;
+   const unsigned attr_count = vf->attr_count;
+   const unsigned stride = vf->vertex_stride;
+   unsigned i, j;
+
+   for (i = 0 ; i < count ; i++, v += stride) {
+      for (j = 0; j < attr_count; j++) {
+	 float *in = (float *)a[j].inputptr;
+	 a[j].inputptr += a[j].inputstride;
+	 a[j].do_insert( &a[j], v + a[j].vertoffset, in );
+      }
+   }
+}
+
+
diff --git a/src/gallium/auxiliary/draw/draw_vf_sse.c b/src/gallium/auxiliary/draw/draw_vf_sse.c
new file mode 100644
index 00000000000..1ad2ae756dd
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vf_sse.c
@@ -0,0 +1,614 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <[email protected]>
+ */
+
+
+#include "simple_list.h"
+
+#include "pipe/p_compiler.h"
+
+#include "draw_vf.h"
+
+
+#if defined(USE_SSE_ASM)
+
+#include "x86/rtasm/x86sse.h"
+#include "x86/common_x86_asm.h"
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+struct x86_program {
+   struct x86_function func;
+
+   struct draw_vertex_fetch *vf;
+   boolean inputs_safe;
+   boolean outputs_safe;
+   boolean have_sse2;
+   
+   struct x86_reg identity;
+   struct x86_reg chan0;
+};
+
+
+static struct x86_reg get_identity( struct x86_program *p )
+{
+   return p->identity;
+}
+
+static void emit_load4f_4( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_load4f_3( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Have to jump through some hoops:
+    *
+    * c 0 0 0
+    * c 0 0 1
+    * 0 0 c 1
+    * a b c 1
+    */
+   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Initialize from identity, then pull in low two words:
+    */
+   sse_movups(&p->func, dest, get_identity(p));
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load4f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Pull in low word, then swizzle in identity */
+   sse_movss(&p->func, dest, arg0);
+   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
+}
+
+
+
+static void emit_load3f_3( struct x86_program *p, 			   
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   /* Over-reads by 1 dword - potential SEGV if input is a vertex
+    * array.
+    */
+   if (p->inputs_safe) {
+      sse_movups(&p->func, dest, arg0);
+   } 
+   else {
+      /* c 0 0 0
+       * c c c c
+       * a b c c 
+       */
+      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
+      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
+      sse_movlps(&p->func, dest, arg0);
+   }
+}
+
+static void emit_load3f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_2(p, dest, arg0);
+}
+
+static void emit_load3f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+static void emit_load2f_2( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_load2f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   emit_load4f_1(p, dest, arg0);
+}
+
+static void emit_load1f_1( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+static void (*load[4][4])( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 ) = {
+   { emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1, 
+     emit_load1f_1 },
+
+   { emit_load2f_1, 
+     emit_load2f_2, 
+     emit_load2f_2, 
+     emit_load2f_2 },
+
+   { emit_load3f_1, 
+     emit_load3f_2, 
+     emit_load3f_3, 
+     emit_load3f_3 },
+
+   { emit_load4f_1, 
+     emit_load4f_2, 
+     emit_load4f_3, 
+     emit_load4f_4 } 
+};
+
+static void emit_load( struct x86_program *p,
+		       struct x86_reg dest,
+		       unsigned sz,
+		       struct x86_reg src,
+		       unsigned src_sz)
+{
+   load[sz-1][src_sz-1](p, dest, src);
+}
+
+static void emit_store4f( struct x86_program *p, 			   
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movups(&p->func, dest, arg0);
+}
+
+static void emit_store3f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   if (p->outputs_safe) {
+      /* Emit the extra dword anyway.  This may hurt writecombining,
+       * may cause other problems.
+       */
+      sse_movups(&p->func, dest, arg0);
+   }
+   else {
+      /* Alternate strategy - emit two, shuffle, emit one.
+       */
+      sse_movlps(&p->func, dest, arg0);
+      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
+   }
+}
+
+static void emit_store2f( struct x86_program *p, 
+			   struct x86_reg dest,
+			   struct x86_reg arg0 )
+{
+   sse_movlps(&p->func, dest, arg0);
+}
+
+static void emit_store1f( struct x86_program *p, 
+			  struct x86_reg dest,
+			  struct x86_reg arg0 )
+{
+   sse_movss(&p->func, dest, arg0);
+}
+
+
+static void (*store[4])( struct x86_program *p, 
+			 struct x86_reg dest,
+			 struct x86_reg arg0 ) = 
+{
+   emit_store1f, 
+   emit_store2f, 
+   emit_store3f, 
+   emit_store4f 
+};
+
+static void emit_store( struct x86_program *p,
+			struct x86_reg dest,
+			unsigned sz,
+			struct x86_reg temp )
+
+{
+   store[sz-1](p, dest, temp);
+}
+
+static void emit_pack_store_4ub( struct x86_program *p,
+				 struct x86_reg dest,
+				 struct x86_reg temp )
+{
+   /* Scale by 255.0
+    */
+   sse_mulps(&p->func, temp, p->chan0);
+
+   if (p->have_sse2) {
+      sse2_cvtps2dq(&p->func, temp, temp);
+      sse2_packssdw(&p->func, temp, temp);
+      sse2_packuswb(&p->func, temp, temp);
+      sse_movss(&p->func, dest, temp);
+   }
+   else {
+      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
+      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
+      sse_cvtps2pi(&p->func, mmx0, temp);
+      sse_movhlps(&p->func, temp, temp);
+      sse_cvtps2pi(&p->func, mmx1, temp);
+      mmx_packssdw(&p->func, mmx0, mmx1);
+      mmx_packuswb(&p->func, mmx0, mmx0);
+      mmx_movd(&p->func, dest, mmx0);
+   }
+}
+
+static int get_offset( const void *a, const void *b )
+{
+   return (const char *)b - (const char *)a;
+}
+
+/* Not much happens here.  Eventually use this function to try and
+ * avoid saving/reloading the source pointers each vertex (if some of
+ * them can fit in registers).
+ */
+static void get_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vfREG,
+			 struct draw_vf_attr *a )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+   /* Load current a[j].inputptr
+    */
+   x86_mov(&p->func, srcREG, ptr_to_src);
+}
+
+static void update_src_ptr( struct x86_program *p,
+			 struct x86_reg srcREG,
+			 struct x86_reg vfREG,
+			 struct draw_vf_attr *a )
+{
+   if (a->inputstride) {
+      struct draw_vertex_fetch *vf = p->vf;
+      struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
+
+      /* add a[j].inputstride (hardcoded value - could just as easily
+       * pull the stride value from memory each time).
+       */
+      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
+      
+      /* save new value of a[j].inputptr 
+       */
+      x86_mov(&p->func, ptr_to_src, srcREG);
+   }
+}
+
+
+/* Lots of hardcoding
+ *
+ * EAX -- pointer to current output vertex
+ * ECX -- pointer to current attribute 
+ * 
+ */
+static boolean build_vertex_emit( struct x86_program *p )
+{
+   struct draw_vertex_fetch *vf = p->vf;
+   unsigned j = 0;
+
+   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
+   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
+   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
+   struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
+   struct x86_reg temp = x86_make_reg(file_XMM, 0);
+   uint8_t *fixup, *label;
+
+   /* Push a few regs?
+    */
+   x86_push(&p->func, countEBP);
+   x86_push(&p->func, vfESI);
+
+
+   /* Get vertex count, compare to zero
+    */
+   x86_xor(&p->func, srcECX, srcECX);
+   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
+   x86_cmp(&p->func, countEBP, srcECX);
+   fixup = x86_jcc_forward(&p->func, cc_E);
+
+   /* Initialize destination register. 
+    */
+   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
+
+   /* Move argument 1 (vf) into a reg:
+    */
+   x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
+
+   
+   /* always load, needed or not:
+    */
+   sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
+
+   /* Note address for loop jump */
+   label = x86_get_label(&p->func);
+
+   /* Emit code for each of the attributes.  Currently routes
+    * everything through SSE registers, even when it might be more
+    * efficient to stick with regular old x86.  No optimization or
+    * other tricks - enough new ground to cover here just getting
+    * things working.
+    */
+   while (j < vf->attr_count) {
+      struct draw_vf_attr *a = &vf->attr[j];
+      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
+
+      /* Now, load an XMM reg from src, perhaps transform, then save.
+       * Could be shortcircuited in specific cases:
+       */
+      switch (a->format) {
+      case DRAW_EMIT_1F:
+      case DRAW_EMIT_1F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 1, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_2F:
+      case DRAW_EMIT_2F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 2, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_3F:
+      case DRAW_EMIT_3F_CONST:
+	 /* Potentially the worst case - hardcode 2+1 copying:
+	  */
+	 if (0) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 3, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
+	    emit_store(p, dest, 2, temp);
+	    if (a->inputsize > 2) {
+	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
+	       emit_store(p, x86_make_disp(dest,8), 1, temp);
+	    }
+	    else {
+	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
+	    }
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 break;
+      case DRAW_EMIT_4F:
+      case DRAW_EMIT_4F_CONST:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_store(p, dest, 4, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_3F_XYW:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
+	 emit_store(p, dest, 3, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+
+      case DRAW_EMIT_1UB_1F:	 
+	 /* Test for PAD3 + 1UB:
+	  */
+	 if (j > 0 &&
+	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
+	 {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
+	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
+	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 else {
+	    debug_printf("Can't emit 1ub %x %x %d\n", 
+	            a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
+	    return FALSE;
+	 }
+	 break;
+      case DRAW_EMIT_3UB_3F_RGB:
+      case DRAW_EMIT_3UB_3F_BGR:
+	 /* Test for 3UB + PAD1:
+	  */
+	 if (j == vf->attr_count - 1 ||
+	     a[1].vertoffset >= a->vertoffset + 4) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	    emit_pack_store_4ub(p, dest, temp);
+	    update_src_ptr(p, srcECX, vfESI, a);
+	 }
+	 /* Test for 3UB + 1UB:
+	  */
+	 else if (j < vf->attr_count - 1 &&
+		  a[1].format == DRAW_EMIT_1UB_1F &&
+		  a[1].vertoffset == a->vertoffset + 3) {
+	    get_src_ptr(p, srcECX, vfESI, a);
+	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
+	    update_src_ptr(p, srcECX, vfESI, a);
+
+	    /* Make room for incoming value:
+	     */
+	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+
+	    get_src_ptr(p, srcECX, vfESI, &a[1]);
+	    emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
+	    update_src_ptr(p, srcECX, vfESI, &a[1]);
+
+	    /* Rearrange and possibly do BGR conversion:
+	     */
+	    if (a->format == DRAW_EMIT_3UB_3F_BGR)
+	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	    else
+	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
+
+	    emit_pack_store_4ub(p, dest, temp);
+	    j++;		/* NOTE: two attrs consumed */
+	 }
+	 else {
+	    debug_printf("Can't emit 3ub\n");
+	 }
+	 return FALSE;	/* add this later */
+	 break;
+
+      case DRAW_EMIT_4UB_4F_RGBA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_BGRA:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_ARGB:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      case DRAW_EMIT_4UB_4F_ABGR:
+	 get_src_ptr(p, srcECX, vfESI, a);
+	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
+	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
+	 emit_pack_store_4ub(p, dest, temp);
+	 update_src_ptr(p, srcECX, vfESI, a);
+	 break;
+      default:
+	 debug_printf("unknown a[%d].format %d\n", j, a->format);
+	 return FALSE;	/* catch any new opcodes */
+      }
+      
+      /* Increment j by at least 1 - may have been incremented above also:
+       */
+      j++;
+   }
+
+   /* Next vertex:
+    */
+   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(&p->func, countEBP);
+   x86_test(&p->func, countEBP, countEBP); 
+   x86_jcc(&p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func.need_emms)
+      mmx_emms(&p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(&p->func, fixup);
+
+   /* Pop regs and return
+    */
+   x86_pop(&p->func, x86_get_base_reg(vfESI));
+   x86_pop(&p->func, countEBP);
+   x86_ret(&p->func);
+
+   vf->emit = (draw_vf_emit_func)x86_get_func(&p->func);
+   return TRUE;
+}
+
+
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   struct x86_program p;   
+
+   if (!cpu_has_xmm) {
+      vf->codegen_emit = NULL;
+      return;
+   }
+
+   memset(&p, 0, sizeof(p));
+
+   p.vf = vf;
+   p.inputs_safe = 0;		/* for now */
+   p.outputs_safe = 1;		/* for now */
+   p.have_sse2 = cpu_has_xmm2;
+   p.identity = x86_make_reg(file_XMM, 6);
+   p.chan0 = x86_make_reg(file_XMM, 7);
+
+   x86_init_func(&p.func);
+
+   if (build_vertex_emit(&p)) {
+      draw_vf_register_fastpath( vf, TRUE );
+   }
+   else {
+      /* Note the failure so that we don't keep trying to codegen an
+       * impossible state:
+       */
+      draw_vf_register_fastpath( vf, FALSE );
+      x86_release_func(&p.func);
+   }
+}
+
+#else
+
+void draw_vf_generate_sse_emit( struct draw_vertex_fetch *vf )
+{
+   /* Dummy version for when USE_SSE_ASM not defined */
+}
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
new file mode 100644
index 00000000000..4ee7e705e93
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -0,0 +1,50 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+#ifndef DRAW_VS_H
+#define DRAW_VS_H
+
+struct draw_vertex_shader;
+struct draw_context;
+struct pipe_shader_state;
+
+struct draw_vertex_shader *
+draw_create_vs_exec(struct draw_context *draw,
+		    const struct pipe_shader_state *templ);
+
+struct draw_vertex_shader *
+draw_create_vs_sse(struct draw_context *draw,
+		   const struct pipe_shader_state *templ);
+
+struct draw_vertex_shader *
+draw_create_vs_llvm(struct draw_context *draw,
+		    const struct pipe_shader_state *templ);
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
new file mode 100644
index 00000000000..8588879400a
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -0,0 +1,186 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  *   Brian Paul
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+#include "draw_vs.h"
+
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+static void
+vs_exec_prepare( struct draw_vertex_shader *shader,
+		 struct draw_context *draw )
+{
+   /* specify the vertex program to interpret/execute */
+   tgsi_exec_machine_bind_shader(&draw->machine,
+				 shader->state->tokens,
+				 PIPE_MAX_SAMPLERS,
+				 NULL /*samplers*/ );
+
+   draw_update_vertex_fetch( draw );
+}
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param vbuffer  the input vertex data
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+vs_exec_run( struct draw_vertex_shader *shader,
+	     struct draw_context *draw,
+	     const unsigned *elts, 
+	     unsigned count,
+	     struct vertex_header *vOut[] )
+{
+   struct tgsi_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+   assert(draw->vertex_shader->state->output_semantic_name[0]
+          == TGSI_SEMANTIC_POSITION);
+
+   machine->Consts = (float (*)[4]) draw->user.constants;
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
+
+   /* run interpreter */
+   tgsi_exec_machine_run( machine );
+
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+
+      /* Handle attr[0] (position) specially:
+       *
+       * XXX: Computing the clipmask should be done in the vertex
+       * program as a set of DP4 instructions appended to the
+       * user-provided code.
+       */
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+      vOut[j]->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      vOut[j]->data[0][0] = x * scale[0] + trans[0];
+      vOut[j]->data[0][1] = y * scale[1] + trans[1];
+      vOut[j]->data[0][2] = z * scale[2] + trans[2];
+      vOut[j]->data[0][3] = w;
+
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      }
+   } /* loop over vertices */
+}
+
+
+
+static void
+vs_exec_delete( struct draw_vertex_shader *dvs )
+{
+   FREE( dvs );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_exec(struct draw_context *draw,
+		    const struct pipe_shader_state *state)
+{
+   struct draw_vertex_shader *vs = CALLOC_STRUCT( draw_vertex_shader );
+
+   if (vs == NULL) 
+      return NULL;
+
+   vs->state = state;
+   vs->prepare = vs_exec_prepare;
+   vs->run = vs_exec_run;
+   vs->delete = vs_exec_delete;
+
+   return vs;
+}
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
new file mode 100644
index 00000000000..44022b6e077
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -0,0 +1,237 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin
+  *   Keith Whitwell <[email protected]>
+  *   Brian Paul
+  */
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+#include "draw_context.h"
+#include "draw_vs.h"
+
+#ifdef MESA_LLVM
+
+#include "llvm/gallivm.h"
+
+struct draw_llvm_vertex_shader {
+   struct draw_vertex_shader base;
+   struct gallivm_prog *llvm_prog;
+};
+
+
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+
+static void
+vs_llvm_prepare( struct draw_vertex_shader *base,
+		 struct draw_context *draw )
+{
+   draw_update_vertex_fetch( draw );
+}
+
+
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param vbuffer  the input vertex data
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+vs_llvm_run( struct draw_vertex_shader *base,
+	     struct draw_context *draw, 
+	     const unsigned *elts, 
+	     unsigned count,
+	     struct vertex_header *vOut[] )
+{
+   struct draw_llvm_vertex_shader *shader = 
+      (struct draw_llvm_vertex_shader *)base;
+
+   struct tgsi_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+
+   assert(count <= 4);
+   assert(draw->vertex_shader->state->output_semantic_name[0]
+          == TGSI_SEMANTIC_POSITION);
+
+   /* Consts does not require 16 byte alignment. */
+   machine->Consts = (float (*)[4]) draw->user.constants;
+
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
+
+   /* run shader */
+   gallivm_cpu_vs_exec(shader->llvm_prog,
+                       machine->Inputs,
+                       machine->Outputs,
+                       machine->Consts,
+                       machine->Temps);
+
+   /* store machine results */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+      vOut[j]->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      vOut[j]->data[0][0] = x * scale[0] + trans[0];
+      vOut[j]->data[0][1] = y * scale[1] + trans[1];
+      vOut[j]->data[0][2] = z * scale[2] + trans[2];
+      vOut[j]->data[0][3] = w;
+
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      }
+   } /* loop over vertices */
+}
+
+static void
+vs_llvm_delete( struct draw_vertex_shader *base )
+{
+   struct draw_llvm_vertex_shader *shader = 
+      (struct draw_llvm_vertex_shader *)base;
+
+   /* Do something to free compiled shader:
+    */
+
+   FREE( shader );
+}
+
+
+
+
+struct draw_vertex_shader *
+draw_create_vs_llvm(struct draw_context *draw,
+		    const struct pipe_shader_state *templ)
+{
+   struct draw_llvm_vertex_shader *vs;
+
+   vs = CALLOC_STRUCT( draw_llvm_vertex_shader );
+   if (vs == NULL) 
+      return NULL;
+
+   vs->base.state = templ;
+   vs->base.prepare = vs_llvm_prepare;
+   vs->base.run = vs_llvm_run;
+   vs->base.delete = vs_llvm_delete;
+
+   {
+      struct gallivm_ir *ir = gallivm_ir_new(GALLIVM_VS);
+      gallivm_ir_set_layout(ir, GALLIVM_SOA);
+      gallivm_ir_set_components(ir, 4);
+      gallivm_ir_fill_from_tgsi(ir, vs->base.state->tokens);
+      vs->llvm_prog = gallivm_ir_compile(ir);
+      gallivm_ir_delete(ir);
+   }
+
+   draw->engine = gallivm_global_cpu_engine();
+
+   /* XXX: Why are there two versions of this?  Shouldn't creating the
+    *      engine be a separate operation to compiling a shader?
+    */
+   if (!draw->engine) {
+      draw->engine = gallivm_cpu_engine_create(vs->llvm_prog);
+   }
+   else {
+      gallivm_cpu_jit_compile(draw->engine, vs->llvm_prog);
+   }
+
+   return &vs->base;
+}
+
+
+
+
+
+#else
+
+struct draw_vertex_shader *
+draw_create_vs_llvm(struct draw_context *draw,
+                          const struct pipe_shader_state *shader)
+{
+   return NULL;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
new file mode 100644
index 00000000000..27bc66812c3
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -0,0 +1,251 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  *   Brian Paul
+  */
+
+#include "draw_vs.h"
+
+#if defined(__i386__) || defined(__386__)
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "draw_private.h"
+#include "draw_context.h"
+
+#include "x86/rtasm/x86sse.h"
+#include "tgsi/exec/tgsi_sse2.h"
+
+
+typedef void (XSTDCALL *codegen_function) (
+   const struct tgsi_exec_vector *input,
+   struct tgsi_exec_vector *output,
+   float (*constant)[4],
+   struct tgsi_exec_vector *temporary );
+
+
+struct draw_sse_vertex_shader {
+   struct draw_vertex_shader base;
+   struct x86_function sse2_program;
+   codegen_function func;
+};
+
+
+/* Should be part of the generated shader:
+ */
+static INLINE unsigned
+compute_clipmask(const float *clip, /*const*/ float plane[][4], unsigned nr)
+{
+   unsigned mask = 0;
+   unsigned i;
+
+   /* Do the hardwired planes first:
+    */
+   if (-clip[0] + clip[3] < 0) mask |= CLIP_RIGHT_BIT;
+   if ( clip[0] + clip[3] < 0) mask |= CLIP_LEFT_BIT;
+   if (-clip[1] + clip[3] < 0) mask |= CLIP_TOP_BIT;
+   if ( clip[1] + clip[3] < 0) mask |= CLIP_BOTTOM_BIT;
+   if (-clip[2] + clip[3] < 0) mask |= CLIP_FAR_BIT;
+   if ( clip[2] + clip[3] < 0) mask |= CLIP_NEAR_BIT;
+
+   /* Followed by any remaining ones:
+    */
+   for (i = 6; i < nr; i++) {
+      if (dot4(clip, plane[i]) < 0) 
+         mask |= (1<<i);
+   }
+
+   return mask;
+}
+
+
+static void
+vs_sse_prepare( struct draw_vertex_shader *base,
+		struct draw_context *draw )
+{
+   draw_update_vertex_fetch( draw );
+}
+
+/**
+ * Transform vertices with the current vertex program/shader
+ * Up to four vertices can be shaded at a time.
+ * \param vbuffer  the input vertex data
+ * \param elts  indexes of four input vertices
+ * \param count  number of vertices to shade [1..4]
+ * \param vOut  array of pointers to four output vertices
+ */
+static void
+vs_sse_run( struct draw_vertex_shader *base,
+	    struct draw_context *draw, 
+	    const unsigned *elts, 
+	    unsigned count,
+	    struct vertex_header *vOut[] )
+{
+   struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
+   struct tgsi_exec_machine *machine = &draw->machine;
+   unsigned int j;
+
+   ALIGN16_DECL(struct tgsi_exec_vector, inputs, PIPE_ATTRIB_MAX);
+   ALIGN16_DECL(struct tgsi_exec_vector, outputs, PIPE_ATTRIB_MAX);
+   const float *scale = draw->viewport.scale;
+   const float *trans = draw->viewport.translate;
+
+   assert(count <= 4);
+   assert(draw->vertex_shader->state->output_semantic_name[0]
+          == TGSI_SEMANTIC_POSITION);
+
+   /* Consts does not require 16 byte alignment. */
+   machine->Consts = (float (*)[4]) draw->user.constants;
+   machine->Inputs = ALIGN16_ASSIGN(inputs);
+   machine->Outputs = ALIGN16_ASSIGN(outputs);
+
+
+   /* Fetch vertices.  This may at some point be integrated into the
+    * compiled shader -- that would require a reorganization where
+    * multiple versions of the compiled shader might exist,
+    * specialized for each fetch state.
+    */
+   draw->vertex_fetch.fetch_func( draw, machine, elts, count );
+
+
+   /* run compiled shader
+    */   
+   shader->func(
+      machine->Inputs,
+      machine->Outputs,
+      machine->Consts,
+      machine->Temps );
+
+
+   /* XXX: Computing the clipmask and emitting results should be done
+    *      in the vertex program as a set of instructions appended to
+    *      the user-provided code.
+    */
+   for (j = 0; j < count; j++) {
+      unsigned slot;
+      float x, y, z, w;
+
+      x = vOut[j]->clip[0] = machine->Outputs[0].xyzw[0].f[j];
+      y = vOut[j]->clip[1] = machine->Outputs[0].xyzw[1].f[j];
+      z = vOut[j]->clip[2] = machine->Outputs[0].xyzw[2].f[j];
+      w = vOut[j]->clip[3] = machine->Outputs[0].xyzw[3].f[j];
+
+      vOut[j]->clipmask = compute_clipmask(vOut[j]->clip, draw->plane, draw->nr_planes);
+      vOut[j]->edgeflag = 1;
+
+      /* divide by w */
+      w = 1.0f / w;
+      x *= w;
+      y *= w;
+      z *= w;
+
+      /* Viewport mapping */
+      vOut[j]->data[0][0] = x * scale[0] + trans[0];
+      vOut[j]->data[0][1] = y * scale[1] + trans[1];
+      vOut[j]->data[0][2] = z * scale[2] + trans[2];
+      vOut[j]->data[0][3] = w;
+
+      /* Remaining attributes are packed into sequential post-transform
+       * vertex attrib slots.
+       */
+      for (slot = 1; slot < draw->num_vs_outputs; slot++) {
+         vOut[j]->data[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
+         vOut[j]->data[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
+         vOut[j]->data[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
+         vOut[j]->data[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
+      }
+   } 
+}
+
+
+
+static void
+vs_sse_delete( struct draw_vertex_shader *base )
+{
+   struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
+   
+   x86_release_func( &shader->sse2_program );
+
+   FREE( shader );
+}
+
+
+struct draw_vertex_shader *
+draw_create_vs_sse(struct draw_context *draw,
+                          const struct pipe_shader_state *templ)
+{
+   struct draw_sse_vertex_shader *vs;
+
+   if (!draw->use_sse) 
+      return NULL;
+
+   vs = CALLOC_STRUCT( draw_sse_vertex_shader );
+   if (vs == NULL) 
+      return NULL;
+
+   vs->base.state = templ;
+   vs->base.prepare = vs_sse_prepare;
+   vs->base.run = vs_sse_run;
+   vs->base.delete = vs_sse_delete;
+   
+   x86_init_func( &vs->sse2_program );
+
+   if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state->tokens,
+			&vs->sse2_program )) 
+      goto fail;
+      
+   vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
+   
+   return &vs->base;
+
+fail:
+   fprintf(stderr, "tgsi_emit_sse2() failed, falling back to interpreter\n");
+
+   x86_release_func( &vs->sse2_program );
+   
+   FREE(vs);
+   return NULL;
+}
+
+
+
+#else
+
+struct draw_vertex_shader *
+draw_create_vs_sse( struct draw_context *draw,
+		    const struct pipe_shader_state *templ )
+{
+   return (void *) 0;
+}
+
+
+#endif
+
diff --git a/src/gallium/auxiliary/draw/draw_wide_prims.c b/src/gallium/auxiliary/draw/draw_wide_prims.c
new file mode 100644
index 00000000000..655774b155f
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_wide_prims.c
@@ -0,0 +1,432 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <[email protected]>
+ */
+
+#include "pipe/p_util.h"
+#include "pipe/p_defines.h"
+#include "pipe/p_shader_tokens.h"
+#include "draw_private.h"
+
+
+struct wide_stage {
+   struct draw_stage stage;
+
+   float half_line_width;
+   float half_point_size;
+
+   uint texcoord_slot[PIPE_MAX_SHADER_OUTPUTS];
+   uint texcoord_mode[PIPE_MAX_SHADER_OUTPUTS];
+   uint num_texcoords;
+
+   int psize_slot;
+};
+
+
+
+static INLINE struct wide_stage *wide_stage( struct draw_stage *stage )
+{
+   return (struct wide_stage *)stage;
+}
+
+
+static void passthrough_point( struct draw_stage *stage,
+                               struct prim_header *header )
+{
+   stage->next->point( stage->next, header );
+}
+
+static void passthrough_line( struct draw_stage *stage,
+                              struct prim_header *header )
+{
+   stage->next->line(stage->next, header);
+}
+
+static void passthrough_tri( struct draw_stage *stage,
+                             struct prim_header *header )
+{
+   stage->next->tri(stage->next, header);
+}
+
+
+/**
+ * Draw a wide line by drawing a quad (two triangles).
+ * XXX need to disable polygon stipple.
+ */
+static void wide_line( struct draw_stage *stage,
+		       struct prim_header *header )
+{
+   const struct wide_stage *wide = wide_stage(stage);
+   const float half_width = wide->half_line_width;
+
+   struct prim_header tri;
+
+   struct vertex_header *v0 = dup_vert(stage, header->v[0], 0);
+   struct vertex_header *v1 = dup_vert(stage, header->v[0], 1);
+   struct vertex_header *v2 = dup_vert(stage, header->v[1], 2);
+   struct vertex_header *v3 = dup_vert(stage, header->v[1], 3);
+
+   float *pos0 = v0->data[0];
+   float *pos1 = v1->data[0];
+   float *pos2 = v2->data[0];
+   float *pos3 = v3->data[0];
+
+   const float dx = FABSF(pos0[0] - pos2[0]);
+   const float dy = FABSF(pos0[1] - pos2[1]);
+   
+   /*
+    * Draw wide line as a quad (two tris) by "stretching" the line along
+    * X or Y.
+    * We need to tweak coords in several ways to be conformant here.
+    */
+
+   if (dx > dy) {
+      /* x-major line */
+      pos0[1] = pos0[1] - half_width - 0.25f;
+      pos1[1] = pos1[1] + half_width - 0.25f;
+      pos2[1] = pos2[1] - half_width - 0.25f;
+      pos3[1] = pos3[1] + half_width - 0.25f;
+      if (pos0[0] < pos2[0]) {
+         /* left to right line */
+         pos0[0] -= 0.5f;
+         pos1[0] -= 0.5f;
+         pos2[0] -= 0.5f;
+         pos3[0] -= 0.5f;
+      }
+      else {
+         /* right to left line */
+         pos0[0] += 0.5f;
+         pos1[0] += 0.5f;
+         pos2[0] += 0.5f;
+         pos3[0] += 0.5f;
+      }
+   }
+   else {
+      /* y-major line */
+      pos0[0] = pos0[0] - half_width + 0.25f;
+      pos1[0] = pos1[0] + half_width + 0.25f;
+      pos2[0] = pos2[0] - half_width + 0.25f;
+      pos3[0] = pos3[0] + half_width + 0.25f;
+      if (pos0[1] < pos2[1]) {
+         /* top to bottom line */
+         pos0[1] -= 0.5f;
+         pos1[1] -= 0.5f;
+         pos2[1] -= 0.5f;
+         pos3[1] -= 0.5f;
+      }
+      else {
+         /* bottom to top line */
+         pos0[1] += 0.5f;
+         pos1[1] += 0.5f;
+         pos2[1] += 0.5f;
+         pos3[1] += 0.5f;
+      }
+   }
+
+   tri.det = header->det;  /* only the sign matters */
+   tri.v[0] = v0;
+   tri.v[1] = v2;
+   tri.v[2] = v3;
+   stage->next->tri( stage->next, &tri );
+
+   tri.v[0] = v0;
+   tri.v[1] = v3;
+   tri.v[2] = v1;
+   stage->next->tri( stage->next, &tri );
+}
+
+
+/**
+ * Draw a wide line by drawing a quad, using geometry which will
+ * fullfill GL's antialiased line requirements.
+ */
+static void wide_line_aa(struct draw_stage *stage,
+                         struct prim_header *header)
+{
+   const struct wide_stage *wide = wide_stage(stage);
+   const float half_width = wide->half_line_width;
+   struct prim_header tri;
+   struct vertex_header *v[4];
+   float *pos;
+   float dx = header->v[1]->data[0][0] - header->v[0]->data[0][0];
+   float dy = header->v[1]->data[0][1] - header->v[0]->data[0][1];
+   const float len = (float) sqrt(dx * dx + dy * dy);
+   uint i;
+
+   dx = dx * half_width / len;
+   dy = dy * half_width / len;
+
+   /* allocate/dup new verts */
+   for (i = 0; i < 4; i++) {
+      v[i] = dup_vert(stage, header->v[i/2], i);
+   }
+
+   /*
+    * Quad for line from v0 to v1:
+    *
+    *  1                         3
+    *  +-------------------------+
+    *  |                         |
+    *  *v0                     v1*
+    *  |                         |
+    *  +-------------------------+
+    *  0                         2
+    */
+
+   pos = v[0]->data[0];
+   pos[0] += dy;
+   pos[1] -= dx;
+
+   pos = v[1]->data[0];
+   pos[0] -= dy;
+   pos[1] += dx;
+
+   pos = v[2]->data[0];
+   pos[0] += dy;
+   pos[1] -= dx;
+
+   pos = v[3]->data[0];
+   pos[0] -= dy;
+   pos[1] += dx;
+
+   tri.det = header->det;  /* only the sign matters */
+
+   tri.v[0] = v[2];  tri.v[1] = v[1];  tri.v[2] = v[0];
+   stage->next->tri( stage->next, &tri );
+
+   tri.v[0] = v[3];  tri.v[1] = v[1];  tri.v[2] = v[2];
+   stage->next->tri( stage->next, &tri );
+
+}
+
+
+/**
+ * Set the vertex texcoords for sprite mode.
+ * Coords may be left untouched or set to a right-side-up or upside-down
+ * orientation.
+ */
+static void set_texcoords(const struct wide_stage *wide,
+                          struct vertex_header *v, const float tc[4])
+{
+   uint i;
+   for (i = 0; i < wide->num_texcoords; i++) {
+      if (wide->texcoord_mode[i] != PIPE_SPRITE_COORD_NONE) {
+         uint j = wide->texcoord_slot[i];
+         v->data[j][0] = tc[0];
+         if (wide->texcoord_mode[i] == PIPE_SPRITE_COORD_LOWER_LEFT)
+            v->data[j][1] = 1.0f - tc[1];
+         else
+            v->data[j][1] = tc[1];
+         v->data[j][2] = tc[2];
+         v->data[j][3] = tc[3];
+      }
+   }
+}
+
+
+/* If there are lots of sprite points (and why wouldn't there be?) it
+ * would probably be more sensible to change hardware setup to
+ * optimize this rather than doing the whole thing in software like
+ * this.
+ */
+static void wide_point( struct draw_stage *stage,
+			struct prim_header *header )
+{
+   const struct wide_stage *wide = wide_stage(stage);
+   const boolean sprite = (boolean) stage->draw->rasterizer->point_sprite;
+   float half_size;
+   float left_adj, right_adj;
+
+   struct prim_header tri;
+
+   /* four dups of original vertex */
+   struct vertex_header *v0 = dup_vert(stage, header->v[0], 0);
+   struct vertex_header *v1 = dup_vert(stage, header->v[0], 1);
+   struct vertex_header *v2 = dup_vert(stage, header->v[0], 2);
+   struct vertex_header *v3 = dup_vert(stage, header->v[0], 3);
+
+   float *pos0 = v0->data[0];
+   float *pos1 = v1->data[0];
+   float *pos2 = v2->data[0];
+   float *pos3 = v3->data[0];
+
+   /* point size is either per-vertex or fixed size */
+   if (wide->psize_slot >= 0) {
+      half_size = 0.5f * header->v[0]->data[wide->psize_slot][0];
+   }
+   else {
+      half_size = wide->half_point_size;
+   }
+
+   left_adj = -half_size + 0.25f;
+   right_adj = half_size + 0.25f;
+
+   pos0[0] += left_adj;
+   pos0[1] -= half_size;
+
+   pos1[0] += left_adj;
+   pos1[1] += half_size;
+
+   pos2[0] += right_adj;
+   pos2[1] -= half_size;
+
+   pos3[0] += right_adj;
+   pos3[1] += half_size;
+
+   if (sprite) {
+      static const float tex00[4] = { 0, 0, 0, 1 };
+      static const float tex01[4] = { 0, 1, 0, 1 };
+      static const float tex11[4] = { 1, 1, 0, 1 };
+      static const float tex10[4] = { 1, 0, 0, 1 };
+      set_texcoords( wide, v0, tex00 );
+      set_texcoords( wide, v1, tex01 );
+      set_texcoords( wide, v2, tex10 );
+      set_texcoords( wide, v3, tex11 );
+   }
+
+   tri.det = header->det;  /* only the sign matters */
+   tri.v[0] = v0;
+   tri.v[1] = v2;
+   tri.v[2] = v3;
+   stage->next->tri( stage->next, &tri );
+
+   tri.v[0] = v0;
+   tri.v[1] = v3;
+   tri.v[2] = v1;
+   stage->next->tri( stage->next, &tri );
+}
+
+
+static void wide_first_point( struct draw_stage *stage, 
+			      struct prim_header *header )
+{
+   struct wide_stage *wide = wide_stage(stage);
+   struct draw_context *draw = stage->draw;
+
+   wide->half_point_size = 0.5f * draw->rasterizer->point_size;
+
+   if (draw->rasterizer->point_size != 1.0) {
+      stage->point = wide_point;
+   }
+   else {
+      stage->point = passthrough_point;
+   }
+
+   if (draw->rasterizer->point_sprite) {
+      /* find vertex shader texcoord outputs */
+      const struct draw_vertex_shader *vs = draw->vertex_shader;
+      uint i, j = 0;
+      for (i = 0; i < vs->state->num_outputs; i++) {
+         if (vs->state->output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) {
+            wide->texcoord_slot[j] = i;
+            wide->texcoord_mode[j] = draw->rasterizer->sprite_coord_mode[j];
+            j++;
+         }
+      }
+      wide->num_texcoords = j;
+   }
+
+   wide->psize_slot = -1;
+
+   if (draw->rasterizer->point_size_per_vertex) {
+      /* find PSIZ vertex output */
+      const struct draw_vertex_shader *vs = draw->vertex_shader;
+      uint i;
+      for (i = 0; i < vs->state->num_outputs; i++) {
+         if (vs->state->output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) {
+            wide->psize_slot = i;
+            break;
+         }
+      }
+   }
+   
+   stage->point( stage, header );
+}
+
+
+
+static void wide_first_line( struct draw_stage *stage,
+			     struct prim_header *header )
+{
+   struct wide_stage *wide = wide_stage(stage);
+   struct draw_context *draw = stage->draw;
+
+   wide->half_line_width = 0.5f * draw->rasterizer->line_width;
+
+   if (draw->rasterizer->line_width != 1.0) {
+      if (draw->rasterizer->line_smooth)
+         wide->stage.line = wide_line_aa;
+      else
+         wide->stage.line = wide_line;
+   }
+   else {
+      wide->stage.line = passthrough_line;
+   }
+   
+   stage->line( stage, header );
+}
+
+
+static void wide_flush( struct draw_stage *stage, unsigned flags )
+{
+   stage->line = wide_first_line;
+   stage->point = wide_first_point;
+   stage->next->flush( stage->next, flags );
+}
+
+
+static void wide_reset_stipple_counter( struct draw_stage *stage )
+{
+   stage->next->reset_stipple_counter( stage->next );
+}
+
+
+static void wide_destroy( struct draw_stage *stage )
+{
+   draw_free_temp_verts( stage );
+   FREE( stage );
+}
+
+
+struct draw_stage *draw_wide_stage( struct draw_context *draw )
+{
+   struct wide_stage *wide = CALLOC_STRUCT(wide_stage);
+
+   draw_alloc_temp_verts( &wide->stage, 4 );
+
+   wide->stage.draw = draw;
+   wide->stage.next = NULL;
+   wide->stage.point = wide_first_point;
+   wide->stage.line = wide_first_line;
+   wide->stage.tri = passthrough_tri;
+   wide->stage.flush = wide_flush;
+   wide->stage.reset_stipple_counter = wide_reset_stipple_counter;
+   wide->stage.destroy = wide_destroy;
+
+   return &wide->stage;
+}
diff --git a/src/gallium/auxiliary/llvm/Makefile b/src/gallium/auxiliary/llvm/Makefile
new file mode 100644
index 00000000000..e0abf860c17
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/Makefile
@@ -0,0 +1,85 @@
+# -*-makefile-*-
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = gallivm
+
+
+GALLIVM_SOURCES = \
+        gallivm.cpp  \
+        gallivm_cpu.cpp \
+        instructions.cpp  \
+        loweringpass.cpp \
+        tgsitollvm.cpp \
+        storage.cpp \
+        storagesoa.cpp \
+        instructionssoa.cpp
+
+INC_SOURCES = gallivm_builtins.cpp
+
+CPP_SOURCES = \
+	$(GALLIVM_SOURCES)
+
+C_SOURCES =
+ASM_SOURCES =
+
+OBJECTS = $(C_SOURCES:.c=.o) \
+          $(CPP_SOURCES:.cpp=.o) \
+	  $(ASM_SOURCES:.S=.o)
+
+### Include directories
+INCLUDES = \
+	-I. \
+	-I$(TOP)/src/gallium/drivers
+	-I$(TOP)/src/gallium/auxiliary \
+	-I$(TOP)/src/gallium/include \
+	-I$(TOP)/src/mesa \
+	-I$(TOP)/include
+
+
+##### RULES #####
+
+.c.o:
+	$(CC) -c $(INCLUDES) $(LLVM_CFLAGS) $(CFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+.cpp.o:
+	$(CXX) -c $(INCLUDES) $(LLVM_CXXFLAGS) $(CXXFLAGS) $(DRIVER_DEFINES) $< -o $@
+
+.S.o:
+	$(CC) -c $(INCLUDES) $(CFLAGS) $(DRIVER_DEFINES)  $< -o $@
+
+##### TARGETS #####
+
+default:: depend symlinks $(LIBNAME)
+
+
+$(LIBNAME): $(OBJECTS) Makefile
+	$(TOP)/bin/mklib -o $@ -static $(OBJECTS)
+
+
+depend: $(C_SOURCES) $(CPP_SOURCES) $(ASM_SOURCES) $(INC_SOURCES)
+	rm -f depend
+	touch depend
+	$(MKDEP) $(MKDEP_OPTIONS) $(DRIVER_DEFINES) $(INCLUDES) $(C_SOURCES) $(CPP_SOURCES) \
+		$(ASM_SOURCES) $(INC_SOURCES) 2> /dev/null
+
+
+gallivm_builtins.cpp: llvm_builtins.c
+	clang --emit-llvm $< |llvm-as|opt -std-compile-opts|llvm2cpp -gen-contents -o=$@ -f -for=shader -funcname=createGallivmBuiltins
+
+
+# Emacs tags
+tags:
+	etags `find . -name \*.[ch]` `find ../include`
+
+
+# Remove .o and backup files
+clean:
+	-rm -f *.o */*.o *~ *.so *~ server/*.o
+	-rm -f depend depend.bak
+	-rm -f gallivm_builtins.cpp
+
+symlinks:
+
+
+include depend
diff --git a/src/gallium/auxiliary/llvm/gallivm.cpp b/src/gallium/auxiliary/llvm/gallivm.cpp
new file mode 100644
index 00000000000..d14bb3b99a8
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/gallivm.cpp
@@ -0,0 +1,327 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+#ifdef MESA_LLVM
+
+#include "gallivm.h"
+#include "gallivm_p.h"
+
+#include "instructions.h"
+#include "loweringpass.h"
+#include "storage.h"
+#include "tgsitollvm.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_dump.h"
+
+#include <llvm/Module.h>
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/ModuleProvider.h>
+#include <llvm/Pass.h>
+#include <llvm/PassManager.h>
+#include <llvm/ParameterAttributes.h>
+#include <llvm/Support/PatternMatch.h>
+#include <llvm/ExecutionEngine/JIT.h>
+#include <llvm/ExecutionEngine/Interpreter.h>
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/LinkAllPasses.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Analysis/LoopPass.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+static int GLOBAL_ID = 0;
+
+using namespace llvm;
+
+static inline
+void AddStandardCompilePasses(PassManager &PM)
+{
+   PM.add(new LoweringPass());
+   PM.add(createVerifierPass());                  // Verify that input is correct
+
+   PM.add(createLowerSetJmpPass());          // Lower llvm.setjmp/.longjmp
+
+   //PM.add(createStripSymbolsPass(true));
+
+   PM.add(createRaiseAllocationsPass());     // call %malloc -> malloc inst
+   PM.add(createCFGSimplificationPass());    // Clean up disgusting code
+   PM.add(createPromoteMemoryToRegisterPass());// Kill useless allocas
+   PM.add(createGlobalOptimizerPass());      // Optimize out global vars
+   PM.add(createGlobalDCEPass());            // Remove unused fns and globs
+   PM.add(createIPConstantPropagationPass());// IP Constant Propagation
+   PM.add(createDeadArgEliminationPass());   // Dead argument elimination
+   PM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE
+   PM.add(createCFGSimplificationPass());    // Clean up after IPCP & DAE
+
+   PM.add(createPruneEHPass());              // Remove dead EH info
+
+   PM.add(createFunctionInliningPass());   // Inline small functions
+   PM.add(createArgumentPromotionPass());    // Scalarize uninlined fn args
+
+   PM.add(createTailDuplicationPass());      // Simplify cfg by copying code
+   PM.add(createInstructionCombiningPass()); // Cleanup for scalarrepl.
+   PM.add(createCFGSimplificationPass());    // Merge & remove BBs
+   PM.add(createScalarReplAggregatesPass()); // Break up aggregate allocas
+   PM.add(createInstructionCombiningPass()); // Combine silly seq's
+   PM.add(createCondPropagationPass());      // Propagate conditionals
+
+   PM.add(createTailCallEliminationPass());  // Eliminate tail calls
+   PM.add(createCFGSimplificationPass());    // Merge & remove BBs
+   PM.add(createReassociatePass());          // Reassociate expressions
+   PM.add(createLoopRotatePass());
+   PM.add(createLICMPass());                 // Hoist loop invariants
+   PM.add(createLoopUnswitchPass());         // Unswitch loops.
+   PM.add(createLoopIndexSplitPass());       // Index split loops.
+   PM.add(createInstructionCombiningPass()); // Clean up after LICM/reassoc
+   PM.add(createIndVarSimplifyPass());       // Canonicalize indvars
+   PM.add(createLoopUnrollPass());           // Unroll small loops
+   PM.add(createInstructionCombiningPass()); // Clean up after the unroller
+   PM.add(createGVNPass());                  // Remove redundancies
+   PM.add(createSCCPPass());                 // Constant prop with SCCP
+
+   // Run instcombine after redundancy elimination to exploit opportunities
+   // opened up by them.
+   PM.add(createInstructionCombiningPass());
+   PM.add(createCondPropagationPass());      // Propagate conditionals
+
+   PM.add(createDeadStoreEliminationPass()); // Delete dead stores
+   PM.add(createAggressiveDCEPass());        // SSA based 'Aggressive DCE'
+   PM.add(createCFGSimplificationPass());    // Merge & remove BBs
+   PM.add(createSimplifyLibCallsPass());     // Library Call Optimizations
+   PM.add(createDeadTypeEliminationPass());  // Eliminate dead types
+   PM.add(createConstantMergePass());        // Merge dup global constants
+}
+
+void gallivm_prog_delete(struct gallivm_prog *prog)
+{
+   delete prog->module;
+   prog->module = 0;
+   prog->function = 0;
+   free(prog);
+}
+
+static inline void
+constant_interpolation(float (*inputs)[16][4],
+                       const struct tgsi_interp_coef *coefs,
+                       unsigned attrib,
+                       unsigned chan)
+{
+   unsigned i;
+
+   for (i = 0; i < QUAD_SIZE; ++i) {
+      inputs[i][attrib][chan] = coefs[attrib].a0[chan];
+   }
+}
+
+static inline void
+linear_interpolation(float (*inputs)[16][4],
+                     const struct tgsi_interp_coef *coefs,
+                     unsigned attrib,
+                     unsigned chan)
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      const float x = inputs[i][0][0];
+      const float y = inputs[i][0][1];
+
+      inputs[i][attrib][chan] =
+         coefs[attrib].a0[chan] +
+         coefs[attrib].dadx[chan] * x +
+         coefs[attrib].dady[chan] * y;
+   }
+}
+
+static inline void
+perspective_interpolation(float (*inputs)[16][4],
+                          const struct tgsi_interp_coef *coefs,
+                          unsigned attrib,
+                          unsigned chan )
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      const float x = inputs[i][0][0];
+      const float y = inputs[i][0][1];
+      /* WPOS.w here is really 1/w */
+      const float w = 1.0f / inputs[i][0][3];
+      assert(inputs[i][0][3] != 0.0);
+
+      inputs[i][attrib][chan] =
+         (coefs[attrib].a0[chan] +
+          coefs[attrib].dadx[chan] * x +
+          coefs[attrib].dady[chan] * y) * w;
+   }
+}
+
+void gallivm_ir_dump(struct gallivm_ir *ir, const char *file_prefix)
+{
+   if (!ir || !ir->module)
+      return;
+
+   if (file_prefix) {
+      std::ostringstream stream;
+      stream << file_prefix;
+      stream << ir->id;
+      stream << ".ll";
+      std::string name = stream.str();
+      std::ofstream out(name.c_str());
+      if (!out) {
+         std::cerr<<"Can't open file : "<<stream.str()<<std::endl;;
+         return;
+      }
+      out << (*ir->module);
+      out.close();
+   } else {
+      const llvm::Module::FunctionListType &funcs = ir->module->getFunctionList();
+      llvm::Module::FunctionListType::const_iterator itr;
+      std::cout<<"; ---------- Start shader "<<ir->id<<std::endl;
+      for (itr = funcs.begin(); itr != funcs.end(); ++itr) {
+         const llvm::Function &func = (*itr);
+         std::string name = func.getName();
+         const llvm::Function *found = 0;
+         if (name.find("vs_shader") != std::string::npos ||
+             name.find("fs_shader") != std::string::npos ||
+             name.find("function") != std::string::npos)
+            found = &func;
+         if (found) {
+            std::cout<<*found<<std::endl;
+         }
+      }
+      std::cout<<"; ---------- End shader "<<ir->id<<std::endl;
+   }
+}
+
+
+void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog,
+                                     float (*inputs)[16][4],
+                                     const struct tgsi_interp_coef *coef)
+{
+   for (int i = 0; i < prog->num_interp; ++i) {
+      const gallivm_interpolate &interp = prog->interpolators[i];
+      switch (interp.type) {
+      case TGSI_INTERPOLATE_CONSTANT:
+         constant_interpolation(inputs, coef, interp.attrib, interp.chan);
+         break;
+
+      case TGSI_INTERPOLATE_LINEAR:
+         linear_interpolation(inputs, coef, interp.attrib, interp.chan);
+         break;
+
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+         perspective_interpolation(inputs, coef, interp.attrib, interp.chan);
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+}
+
+
+struct gallivm_ir * gallivm_ir_new(enum gallivm_shader_type type)
+{
+   struct gallivm_ir *ir =
+      (struct gallivm_ir *)calloc(1, sizeof(struct gallivm_ir));
+   ++GLOBAL_ID;
+   ir->id   = GLOBAL_ID;
+   ir->type = type;
+
+   return ir;
+}
+
+void gallivm_ir_set_layout(struct gallivm_ir *ir,
+                           enum gallivm_vector_layout layout)
+{
+   ir->layout = layout;
+}
+
+void gallivm_ir_set_components(struct gallivm_ir *ir, int num)
+{
+   ir->num_components = num;
+}
+
+void gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir,
+                               const struct tgsi_token *tokens)
+{
+   std::cout << "Creating llvm from: " <<std::endl;
+   tgsi_dump(tokens, 0);
+
+
+   llvm::Module *mod = tgsi_to_llvmir(ir, tokens);
+
+   //llvm::Module *mod = tgsi_to_llvm(ir, tokens);
+   ir->module = mod;
+   gallivm_ir_dump(ir, 0);
+}
+
+void gallivm_ir_delete(struct gallivm_ir *ir)
+{
+   delete ir->module;
+   free(ir);
+}
+
+struct gallivm_prog * gallivm_ir_compile(struct gallivm_ir *ir)
+{
+   struct gallivm_prog *prog =
+      (struct gallivm_prog *)calloc(1, sizeof(struct gallivm_prog));
+   llvm::Module *mod = llvm::CloneModule(ir->module);
+   prog->num_consts = ir->num_consts;
+   memcpy(prog->interpolators, ir->interpolators, sizeof(prog->interpolators));
+   prog->num_interp = ir->num_interp;
+
+   /* Run optimization passes over it */
+   PassManager passes;
+   passes.add(new TargetData(mod));
+   AddStandardCompilePasses(passes);
+   passes.run(*mod);
+   prog->module = mod;
+
+   std::cout << "After optimizations:"<<std::endl;
+   mod->dump();
+
+   return prog;
+}
+
+#endif /* MESA_LLVM */
diff --git a/src/gallium/auxiliary/llvm/gallivm.h b/src/gallium/auxiliary/llvm/gallivm.h
new file mode 100644
index 00000000000..92da4bca7f4
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/gallivm.h
@@ -0,0 +1,103 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+
+#ifndef GALLIVM_H
+#define GALLIVM_H
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+#include "pipe/p_state.h"
+
+#ifdef MESA_LLVM
+
+struct tgsi_token;
+
+struct gallivm_ir;
+struct gallivm_prog;
+struct gallivm_cpu_engine;
+struct tgsi_interp_coef;
+struct tgsi_sampler;
+struct tgsi_exec_vector;
+
+enum gallivm_shader_type {
+   GALLIVM_VS,
+   GALLIVM_FS
+};
+
+enum gallivm_vector_layout {
+   GALLIVM_AOS,
+   GALLIVM_SOA
+};
+
+struct gallivm_ir *gallivm_ir_new(enum gallivm_shader_type type);
+void               gallivm_ir_set_layout(struct gallivm_ir *ir,
+                                         enum gallivm_vector_layout layout);
+void               gallivm_ir_set_components(struct gallivm_ir *ir, int num);
+void               gallivm_ir_fill_from_tgsi(struct gallivm_ir *ir,
+                                             const struct tgsi_token *tokens);
+void               gallivm_ir_delete(struct gallivm_ir *ir);
+
+
+struct gallivm_prog *gallivm_ir_compile(struct gallivm_ir *ir);
+
+void gallivm_prog_inputs_interpolate(struct gallivm_prog *prog,
+                                     float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
+                                     const struct tgsi_interp_coef *coefs);
+void gallivm_prog_dump(struct gallivm_prog *prog, const char *file_prefix);
+
+
+struct gallivm_cpu_engine *gallivm_cpu_engine_create(struct gallivm_prog *prog);
+struct gallivm_cpu_engine *gallivm_global_cpu_engine();
+int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
+                        struct tgsi_exec_vector       *inputs,
+                        struct tgsi_exec_vector       *dests,
+                        float (*consts)[4],
+                        struct tgsi_exec_vector       *temps);
+int gallivm_cpu_fs_exec(struct gallivm_prog *prog,
+                        float x, float y,
+                        float (*dests)[PIPE_MAX_SHADER_INPUTS][4],
+                        float (*inputs)[PIPE_MAX_SHADER_INPUTS][4],
+                        float (*consts)[4],
+                        struct tgsi_sampler *samplers);
+void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *ee, struct gallivm_prog *prog);
+void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *ee);
+
+
+#endif /* MESA_LLVM */
+
+#if defined __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/gallivm_builtins.cpp b/src/gallium/auxiliary/llvm/gallivm_builtins.cpp
new file mode 100644
index 00000000000..1796f0a1772
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/gallivm_builtins.cpp
@@ -0,0 +1,567 @@
+// Generated by llvm2cpp - DO NOT MODIFY!
+
+
+Module* createGallivmBuiltins(Module *mod) {
+
+mod->setModuleIdentifier("shader");
+
+// Type Definitions
+ArrayType* ArrayTy_0 = ArrayType::get(IntegerType::get(8), 25);
+
+PointerType* PointerTy_1 = PointerType::get(ArrayTy_0, 0);
+
+std::vector<const Type*>FuncTy_2_args;
+FuncTy_2_args.push_back(Type::FloatTy);
+FuncTy_2_args.push_back(Type::FloatTy);
+FunctionType* FuncTy_2 = FunctionType::get(
+  /*Result=*/Type::FloatTy,
+  /*Params=*/FuncTy_2_args,
+  /*isVarArg=*/false);
+
+PointerType* PointerTy_3 = PointerType::get(FuncTy_2, 0);
+
+VectorType* VectorTy_4 = VectorType::get(Type::FloatTy, 4);
+
+std::vector<const Type*>FuncTy_5_args;
+FuncTy_5_args.push_back(VectorTy_4);
+FunctionType* FuncTy_5 = FunctionType::get(
+  /*Result=*/VectorTy_4,
+  /*Params=*/FuncTy_5_args,
+  /*isVarArg=*/false);
+
+std::vector<const Type*>FuncTy_6_args;
+FuncTy_6_args.push_back(VectorTy_4);
+FuncTy_6_args.push_back(VectorTy_4);
+FuncTy_6_args.push_back(VectorTy_4);
+FunctionType* FuncTy_6 = FunctionType::get(
+  /*Result=*/VectorTy_4,
+  /*Params=*/FuncTy_6_args,
+  /*isVarArg=*/false);
+
+VectorType* VectorTy_7 = VectorType::get(IntegerType::get(32), 4);
+
+std::vector<const Type*>FuncTy_9_args;
+FunctionType* FuncTy_9 = FunctionType::get(
+  /*Result=*/IntegerType::get(32),
+  /*Params=*/FuncTy_9_args,
+  /*isVarArg=*/true);
+
+PointerType* PointerTy_8 = PointerType::get(FuncTy_9, 0);
+
+PointerType* PointerTy_10 = PointerType::get(IntegerType::get(8), 0);
+
+std::vector<const Type*>FuncTy_12_args;
+FuncTy_12_args.push_back(Type::FloatTy);
+FunctionType* FuncTy_12 = FunctionType::get(
+  /*Result=*/Type::FloatTy,
+  /*Params=*/FuncTy_12_args,
+  /*isVarArg=*/false);
+
+PointerType* PointerTy_11 = PointerType::get(FuncTy_12, 0);
+
+std::vector<const Type*>FuncTy_13_args;
+FuncTy_13_args.push_back(VectorTy_4);
+FunctionType* FuncTy_13 = FunctionType::get(
+  /*Result=*/IntegerType::get(32),
+  /*Params=*/FuncTy_13_args,
+  /*isVarArg=*/false);
+
+
+// Function Declarations
+
+Function* func_approx = new Function(
+  /*Type=*/FuncTy_2,
+  /*Linkage=*/GlobalValue::WeakLinkage,
+  /*Name=*/"approx", mod); 
+func_approx->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_approx_PAL = 0;
+func_approx->setParamAttrs(func_approx_PAL);
+
+Function* func_powf = new Function(
+  /*Type=*/FuncTy_2,
+  /*Linkage=*/GlobalValue::ExternalLinkage,
+  /*Name=*/"powf", mod); // (external, no body)
+func_powf->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_powf_PAL = 0;
+func_powf->setParamAttrs(func_powf_PAL);
+
+Function* func_lit = new Function(
+  /*Type=*/FuncTy_5,
+  /*Linkage=*/GlobalValue::WeakLinkage,
+  /*Name=*/"lit", mod); 
+func_lit->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_lit_PAL = 0;
+func_lit->setParamAttrs(func_lit_PAL);
+
+Function* func_cmp = new Function(
+  /*Type=*/FuncTy_6,
+  /*Linkage=*/GlobalValue::WeakLinkage,
+  /*Name=*/"cmp", mod); 
+func_cmp->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_cmp_PAL = 0;
+{
+  ParamAttrsVector Attrs;
+  ParamAttrsWithIndex PAWI;
+  PAWI.index = 0; PAWI.attrs = 0  | ParamAttr::NoUnwind;
+  Attrs.push_back(PAWI);
+  func_cmp_PAL = ParamAttrsList::get(Attrs);
+  
+}
+func_cmp->setParamAttrs(func_cmp_PAL);
+
+Function* func_vcos = new Function(
+  /*Type=*/FuncTy_5,
+  /*Linkage=*/GlobalValue::WeakLinkage,
+  /*Name=*/"vcos", mod); 
+func_vcos->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_vcos_PAL = 0;
+func_vcos->setParamAttrs(func_vcos_PAL);
+
+Function* func_printf = new Function(
+  /*Type=*/FuncTy_9,
+  /*Linkage=*/GlobalValue::ExternalLinkage,
+  /*Name=*/"printf", mod); // (external, no body)
+func_printf->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_printf_PAL = 0;
+func_printf->setParamAttrs(func_printf_PAL);
+
+Function* func_cosf = new Function(
+  /*Type=*/FuncTy_12,
+  /*Linkage=*/GlobalValue::ExternalLinkage,
+  /*Name=*/"cosf", mod); // (external, no body)
+func_cosf->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_cosf_PAL = 0;
+func_cosf->setParamAttrs(func_cosf_PAL);
+
+Function* func_scs = new Function(
+  /*Type=*/FuncTy_5,
+  /*Linkage=*/GlobalValue::WeakLinkage,
+  /*Name=*/"scs", mod); 
+func_scs->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_scs_PAL = 0;
+func_scs->setParamAttrs(func_scs_PAL);
+
+Function* func_sinf = new Function(
+  /*Type=*/FuncTy_12,
+  /*Linkage=*/GlobalValue::ExternalLinkage,
+  /*Name=*/"sinf", mod); // (external, no body)
+func_sinf->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_sinf_PAL = 0;
+func_sinf->setParamAttrs(func_sinf_PAL);
+
+Function* func_vsin = new Function(
+  /*Type=*/FuncTy_5,
+  /*Linkage=*/GlobalValue::WeakLinkage,
+  /*Name=*/"vsin", mod); 
+func_vsin->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_vsin_PAL = 0;
+func_vsin->setParamAttrs(func_vsin_PAL);
+
+Function* func_kilp = new Function(
+  /*Type=*/FuncTy_13,
+  /*Linkage=*/GlobalValue::WeakLinkage,
+  /*Name=*/"kilp", mod); 
+func_kilp->setCallingConv(CallingConv::C);
+const ParamAttrsList *func_kilp_PAL = 0;
+{
+  ParamAttrsVector Attrs;
+  ParamAttrsWithIndex PAWI;
+  PAWI.index = 0; PAWI.attrs = 0  | ParamAttr::NoUnwind;
+  Attrs.push_back(PAWI);
+  func_kilp_PAL = ParamAttrsList::get(Attrs);
+  
+}
+func_kilp->setParamAttrs(func_kilp_PAL);
+
+// Global Variable Declarations
+
+
+GlobalVariable* gvar_array__str = new GlobalVariable(
+/*Type=*/ArrayTy_0,
+/*isConstant=*/true,
+/*Linkage=*/GlobalValue::InternalLinkage,
+/*Initializer=*/0, // has initializer, specified below
+/*Name=*/".str",
+mod);
+
+GlobalVariable* gvar_array__str1 = new GlobalVariable(
+/*Type=*/ArrayTy_0,
+/*isConstant=*/true,
+/*Linkage=*/GlobalValue::InternalLinkage,
+/*Initializer=*/0, // has initializer, specified below
+/*Name=*/".str1",
+mod);
+
+// Constant Definitions
+Constant* const_array_14 = ConstantArray::get("VEC IN   is %f %f %f %f\x0A", true);
+Constant* const_array_15 = ConstantArray::get("VEC OUT  is %f %f %f %f\x0A", true);
+ConstantFP* const_float_16 = ConstantFP::get(Type::FloatTy, APFloat(-1.280000e+02f));
+ConstantFP* const_float_17 = ConstantFP::get(Type::FloatTy, APFloat(1.280000e+02f));
+Constant* const_float_18 = Constant::getNullValue(Type::FloatTy);
+Constant* const_int32_19 = Constant::getNullValue(IntegerType::get(32));
+std::vector<Constant*> const_packed_20_elems;
+ConstantFP* const_float_21 = ConstantFP::get(Type::FloatTy, APFloat(1.000000e+00f));
+const_packed_20_elems.push_back(const_float_21);
+UndefValue* const_float_22 = UndefValue::get(Type::FloatTy);
+const_packed_20_elems.push_back(const_float_22);
+const_packed_20_elems.push_back(const_float_22);
+const_packed_20_elems.push_back(const_float_21);
+Constant* const_packed_20 = ConstantVector::get(VectorTy_4, const_packed_20_elems);
+ConstantInt* const_int32_23 = ConstantInt::get(APInt(32,  "1", 10));
+ConstantInt* const_int32_24 = ConstantInt::get(APInt(32,  "3", 10));
+ConstantInt* const_int32_25 = ConstantInt::get(APInt(32,  "2", 10));
+std::vector<Constant*> const_packed_26_elems;
+const_packed_26_elems.push_back(const_float_21);
+const_packed_26_elems.push_back(const_float_18);
+const_packed_26_elems.push_back(const_float_18);
+const_packed_26_elems.push_back(const_float_21);
+Constant* const_packed_26 = ConstantVector::get(VectorTy_4, const_packed_26_elems);
+Constant* const_double_27 = Constant::getNullValue(Type::DoubleTy);
+std::vector<Constant*> const_packed_28_elems;
+const_packed_28_elems.push_back(const_int32_19);
+ConstantInt* const_int32_29 = ConstantInt::get(APInt(32,  "5", 10));
+const_packed_28_elems.push_back(const_int32_29);
+const_packed_28_elems.push_back(const_int32_25);
+const_packed_28_elems.push_back(const_int32_24);
+Constant* const_packed_28 = ConstantVector::get(VectorTy_7, const_packed_28_elems);
+std::vector<Constant*> const_packed_30_elems;
+const_packed_30_elems.push_back(const_int32_19);
+const_packed_30_elems.push_back(const_int32_23);
+ConstantInt* const_int32_31 = ConstantInt::get(APInt(32,  "6", 10));
+const_packed_30_elems.push_back(const_int32_31);
+const_packed_30_elems.push_back(const_int32_24);
+Constant* const_packed_30 = ConstantVector::get(VectorTy_7, const_packed_30_elems);
+std::vector<Constant*> const_packed_32_elems;
+const_packed_32_elems.push_back(const_int32_19);
+const_packed_32_elems.push_back(const_int32_23);
+const_packed_32_elems.push_back(const_int32_25);
+ConstantInt* const_int32_33 = ConstantInt::get(APInt(32,  "7", 10));
+const_packed_32_elems.push_back(const_int32_33);
+Constant* const_packed_32 = ConstantVector::get(VectorTy_7, const_packed_32_elems);
+std::vector<Constant*> const_ptr_34_indices;
+const_ptr_34_indices.push_back(const_int32_19);
+const_ptr_34_indices.push_back(const_int32_19);
+Constant* const_ptr_34 = ConstantExpr::getGetElementPtr(gvar_array__str, &const_ptr_34_indices[0], const_ptr_34_indices.size() );
+UndefValue* const_packed_35 = UndefValue::get(VectorTy_4);
+std::vector<Constant*> const_ptr_36_indices;
+const_ptr_36_indices.push_back(const_int32_19);
+const_ptr_36_indices.push_back(const_int32_19);
+Constant* const_ptr_36 = ConstantExpr::getGetElementPtr(gvar_array__str1, &const_ptr_36_indices[0], const_ptr_36_indices.size() );
+
+// Global Variable Definitions
+gvar_array__str->setInitializer(const_array_14);
+gvar_array__str1->setInitializer(const_array_15);
+
+// Function Definitions
+
+// Function: approx (func_approx)
+{
+  Function::arg_iterator args = func_approx->arg_begin();
+  Value* float_a = args++;
+  float_a->setName("a");
+  Value* float_b = args++;
+  float_b->setName("b");
+  
+  BasicBlock* label_entry = new BasicBlock("entry",func_approx,0);
+  
+  // Block entry (label_entry)
+  FCmpInst* int1_cmp = new FCmpInst(FCmpInst::FCMP_OLT, float_b, const_float_16, "cmp", label_entry);
+  SelectInst* float_b_addr_0 = new SelectInst(int1_cmp, const_float_16, float_b, "b.addr.0", label_entry);
+  FCmpInst* int1_cmp3 = new FCmpInst(FCmpInst::FCMP_OGT, float_b_addr_0, const_float_17, "cmp3", label_entry);
+  SelectInst* float_b_addr_1 = new SelectInst(int1_cmp3, const_float_17, float_b_addr_0, "b.addr.1", label_entry);
+  FCmpInst* int1_cmp7 = new FCmpInst(FCmpInst::FCMP_OLT, float_a, const_float_18, "cmp7", label_entry);
+  SelectInst* float_a_addr_0 = new SelectInst(int1_cmp7, const_float_18, float_a, "a.addr.0", label_entry);
+  std::vector<Value*> float_call_params;
+  float_call_params.push_back(float_a_addr_0);
+  float_call_params.push_back(float_b_addr_1);
+  CallInst* float_call = new CallInst(func_powf, float_call_params.begin(), float_call_params.end(), "call", label_entry);
+  float_call->setCallingConv(CallingConv::C);
+  float_call->setTailCall(true);const ParamAttrsList *float_call_PAL = 0;
+  float_call->setParamAttrs(float_call_PAL);
+  
+  new ReturnInst(float_call, label_entry);
+  
+}
+
+// Function: lit (func_lit)
+{
+  Function::arg_iterator args = func_lit->arg_begin();
+  Value* packed_tmp = args++;
+  packed_tmp->setName("tmp");
+  
+  BasicBlock* label_entry_38 = new BasicBlock("entry",func_lit,0);
+  BasicBlock* label_ifthen = new BasicBlock("ifthen",func_lit,0);
+  BasicBlock* label_UnifiedReturnBlock = new BasicBlock("UnifiedReturnBlock",func_lit,0);
+  
+  // Block entry (label_entry_38)
+  ExtractElementInst* float_tmp6 = new ExtractElementInst(packed_tmp, const_int32_19, "tmp6", label_entry_38);
+  FCmpInst* int1_cmp_39 = new FCmpInst(FCmpInst::FCMP_OGT, float_tmp6, const_float_18, "cmp", label_entry_38);
+  new BranchInst(label_ifthen, label_UnifiedReturnBlock, int1_cmp_39, label_entry_38);
+  
+  // Block ifthen (label_ifthen)
+  InsertElementInst* packed_tmp10 = new InsertElementInst(const_packed_20, float_tmp6, const_int32_23, "tmp10", label_ifthen);
+  ExtractElementInst* float_tmp12 = new ExtractElementInst(packed_tmp, const_int32_23, "tmp12", label_ifthen);
+  ExtractElementInst* float_tmp14 = new ExtractElementInst(packed_tmp, const_int32_24, "tmp14", label_ifthen);
+  std::vector<Value*> float_call_41_params;
+  float_call_41_params.push_back(float_tmp12);
+  float_call_41_params.push_back(float_tmp14);
+  CallInst* float_call_41 = new CallInst(func_approx, float_call_41_params.begin(), float_call_41_params.end(), "call", label_ifthen);
+  float_call_41->setCallingConv(CallingConv::C);
+  float_call_41->setTailCall(true);const ParamAttrsList *float_call_41_PAL = 0;
+  float_call_41->setParamAttrs(float_call_41_PAL);
+  
+  InsertElementInst* packed_tmp16 = new InsertElementInst(packed_tmp10, float_call_41, const_int32_25, "tmp16", label_ifthen);
+  new ReturnInst(packed_tmp16, label_ifthen);
+  
+  // Block UnifiedReturnBlock (label_UnifiedReturnBlock)
+  new ReturnInst(const_packed_26, label_UnifiedReturnBlock);
+  
+}
+
+// Function: cmp (func_cmp)
+{
+  Function::arg_iterator args = func_cmp->arg_begin();
+  Value* packed_tmp0 = args++;
+  packed_tmp0->setName("tmp0");
+  Value* packed_tmp1 = args++;
+  packed_tmp1->setName("tmp1");
+  Value* packed_tmp2 = args++;
+  packed_tmp2->setName("tmp2");
+  
+  BasicBlock* label_entry_44 = new BasicBlock("entry",func_cmp,0);
+  BasicBlock* label_cond__14 = new BasicBlock("cond.?14",func_cmp,0);
+  BasicBlock* label_cond_cont20 = new BasicBlock("cond.cont20",func_cmp,0);
+  BasicBlock* label_cond__28 = new BasicBlock("cond.?28",func_cmp,0);
+  BasicBlock* label_cond_cont34 = new BasicBlock("cond.cont34",func_cmp,0);
+  BasicBlock* label_cond__42 = new BasicBlock("cond.?42",func_cmp,0);
+  BasicBlock* label_cond_cont48 = new BasicBlock("cond.cont48",func_cmp,0);
+  
+  // Block entry (label_entry_44)
+  ExtractElementInst* float_tmp3 = new ExtractElementInst(packed_tmp0, const_int32_19, "tmp3", label_entry_44);
+  CastInst* double_conv = new FPExtInst(float_tmp3, Type::DoubleTy, "conv", label_entry_44);
+  FCmpInst* int1_cmp_45 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv, const_double_27, "cmp", label_entry_44);
+  ExtractElementInst* float_tmp11 = new ExtractElementInst(packed_tmp0, const_int32_23, "tmp11", label_entry_44);
+  CastInst* double_conv12 = new FPExtInst(float_tmp11, Type::DoubleTy, "conv12", label_entry_44);
+  FCmpInst* int1_cmp13 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv12, const_double_27, "cmp13", label_entry_44);
+  SelectInst* packed_tmp1_tmp2 = new SelectInst(int1_cmp_45, packed_tmp1, packed_tmp2, "tmp1.tmp2", label_entry_44);
+  new BranchInst(label_cond__14, label_cond_cont20, int1_cmp13, label_entry_44);
+  
+  // Block cond.?14 (label_cond__14)
+  ShuffleVectorInst* packed_tmp233 = new ShuffleVectorInst(packed_tmp1_tmp2, packed_tmp1, const_packed_28, "tmp233", label_cond__14);
+  ExtractElementInst* float_tmp254 = new ExtractElementInst(packed_tmp0, const_int32_25, "tmp254", label_cond__14);
+  CastInst* double_conv265 = new FPExtInst(float_tmp254, Type::DoubleTy, "conv265", label_cond__14);
+  FCmpInst* int1_cmp276 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv265, const_double_27, "cmp276", label_cond__14);
+  new BranchInst(label_cond__28, label_cond_cont34, int1_cmp276, label_cond__14);
+  
+  // Block cond.cont20 (label_cond_cont20)
+  ShuffleVectorInst* packed_tmp23 = new ShuffleVectorInst(packed_tmp1_tmp2, packed_tmp2, const_packed_28, "tmp23", label_cond_cont20);
+  ExtractElementInst* float_tmp25 = new ExtractElementInst(packed_tmp0, const_int32_25, "tmp25", label_cond_cont20);
+  CastInst* double_conv26 = new FPExtInst(float_tmp25, Type::DoubleTy, "conv26", label_cond_cont20);
+  FCmpInst* int1_cmp27 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv26, const_double_27, "cmp27", label_cond_cont20);
+  new BranchInst(label_cond__28, label_cond_cont34, int1_cmp27, label_cond_cont20);
+  
+  // Block cond.?28 (label_cond__28)
+  PHINode* packed_tmp23_reg2mem_0 = new PHINode(VectorTy_4, "tmp23.reg2mem.0", label_cond__28);
+  packed_tmp23_reg2mem_0->reserveOperandSpace(2);
+  packed_tmp23_reg2mem_0->addIncoming(packed_tmp233, label_cond__14);
+  packed_tmp23_reg2mem_0->addIncoming(packed_tmp23, label_cond_cont20);
+  
+  ShuffleVectorInst* packed_tmp378 = new ShuffleVectorInst(packed_tmp23_reg2mem_0, packed_tmp1, const_packed_30, "tmp378", label_cond__28);
+  ExtractElementInst* float_tmp399 = new ExtractElementInst(packed_tmp0, const_int32_24, "tmp399", label_cond__28);
+  CastInst* double_conv4010 = new FPExtInst(float_tmp399, Type::DoubleTy, "conv4010", label_cond__28);
+  FCmpInst* int1_cmp4111 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv4010, const_double_27, "cmp4111", label_cond__28);
+  new BranchInst(label_cond__42, label_cond_cont48, int1_cmp4111, label_cond__28);
+  
+  // Block cond.cont34 (label_cond_cont34)
+  PHINode* packed_tmp23_reg2mem_1 = new PHINode(VectorTy_4, "tmp23.reg2mem.1", label_cond_cont34);
+  packed_tmp23_reg2mem_1->reserveOperandSpace(2);
+  packed_tmp23_reg2mem_1->addIncoming(packed_tmp233, label_cond__14);
+  packed_tmp23_reg2mem_1->addIncoming(packed_tmp23, label_cond_cont20);
+  
+  ShuffleVectorInst* packed_tmp37 = new ShuffleVectorInst(packed_tmp23_reg2mem_1, packed_tmp2, const_packed_30, "tmp37", label_cond_cont34);
+  ExtractElementInst* float_tmp39 = new ExtractElementInst(packed_tmp0, const_int32_24, "tmp39", label_cond_cont34);
+  CastInst* double_conv40 = new FPExtInst(float_tmp39, Type::DoubleTy, "conv40", label_cond_cont34);
+  FCmpInst* int1_cmp41 = new FCmpInst(FCmpInst::FCMP_OLT, double_conv40, const_double_27, "cmp41", label_cond_cont34);
+  new BranchInst(label_cond__42, label_cond_cont48, int1_cmp41, label_cond_cont34);
+  
+  // Block cond.?42 (label_cond__42)
+  PHINode* packed_tmp37_reg2mem_0 = new PHINode(VectorTy_4, "tmp37.reg2mem.0", label_cond__42);
+  packed_tmp37_reg2mem_0->reserveOperandSpace(2);
+  packed_tmp37_reg2mem_0->addIncoming(packed_tmp378, label_cond__28);
+  packed_tmp37_reg2mem_0->addIncoming(packed_tmp37, label_cond_cont34);
+  
+  ShuffleVectorInst* packed_tmp5113 = new ShuffleVectorInst(packed_tmp37_reg2mem_0, packed_tmp1, const_packed_32, "tmp5113", label_cond__42);
+  new ReturnInst(packed_tmp5113, label_cond__42);
+  
+  // Block cond.cont48 (label_cond_cont48)
+  PHINode* packed_tmp37_reg2mem_1 = new PHINode(VectorTy_4, "tmp37.reg2mem.1", label_cond_cont48);
+  packed_tmp37_reg2mem_1->reserveOperandSpace(2);
+  packed_tmp37_reg2mem_1->addIncoming(packed_tmp378, label_cond__28);
+  packed_tmp37_reg2mem_1->addIncoming(packed_tmp37, label_cond_cont34);
+  
+  ShuffleVectorInst* packed_tmp51 = new ShuffleVectorInst(packed_tmp37_reg2mem_1, packed_tmp2, const_packed_32, "tmp51", label_cond_cont48);
+  new ReturnInst(packed_tmp51, label_cond_cont48);
+  
+}
+
+// Function: vcos (func_vcos)
+{
+  Function::arg_iterator args = func_vcos->arg_begin();
+  Value* packed_val = args++;
+  packed_val->setName("val");
+  
+  BasicBlock* label_entry_53 = new BasicBlock("entry",func_vcos,0);
+  
+  // Block entry (label_entry_53)
+  ExtractElementInst* float_tmp1 = new ExtractElementInst(packed_val, const_int32_19, "tmp1", label_entry_53);
+  CastInst* double_conv_54 = new FPExtInst(float_tmp1, Type::DoubleTy, "conv", label_entry_53);
+  ExtractElementInst* float_tmp3_55 = new ExtractElementInst(packed_val, const_int32_23, "tmp3", label_entry_53);
+  CastInst* double_conv4 = new FPExtInst(float_tmp3_55, Type::DoubleTy, "conv4", label_entry_53);
+  ExtractElementInst* float_tmp6_56 = new ExtractElementInst(packed_val, const_int32_25, "tmp6", label_entry_53);
+  CastInst* double_conv7 = new FPExtInst(float_tmp6_56, Type::DoubleTy, "conv7", label_entry_53);
+  ExtractElementInst* float_tmp9 = new ExtractElementInst(packed_val, const_int32_24, "tmp9", label_entry_53);
+  CastInst* double_conv10 = new FPExtInst(float_tmp9, Type::DoubleTy, "conv10", label_entry_53);
+  std::vector<Value*> int32_call_params;
+  int32_call_params.push_back(const_ptr_34);
+  int32_call_params.push_back(double_conv_54);
+  int32_call_params.push_back(double_conv4);
+  int32_call_params.push_back(double_conv7);
+  int32_call_params.push_back(double_conv10);
+  CallInst* int32_call = new CallInst(func_printf, int32_call_params.begin(), int32_call_params.end(), "call", label_entry_53);
+  int32_call->setCallingConv(CallingConv::C);
+  int32_call->setTailCall(true);const ParamAttrsList *int32_call_PAL = 0;
+  int32_call->setParamAttrs(int32_call_PAL);
+  
+  CallInst* float_call13 = new CallInst(func_cosf, float_tmp1, "call13", label_entry_53);
+  float_call13->setCallingConv(CallingConv::C);
+  float_call13->setTailCall(true);const ParamAttrsList *float_call13_PAL = 0;
+  float_call13->setParamAttrs(float_call13_PAL);
+  
+  InsertElementInst* packed_tmp15 = new InsertElementInst(const_packed_35, float_call13, const_int32_19, "tmp15", label_entry_53);
+  CallInst* float_call18 = new CallInst(func_cosf, float_tmp1, "call18", label_entry_53);
+  float_call18->setCallingConv(CallingConv::C);
+  float_call18->setTailCall(true);const ParamAttrsList *float_call18_PAL = 0;
+  float_call18->setParamAttrs(float_call18_PAL);
+  
+  InsertElementInst* packed_tmp20 = new InsertElementInst(packed_tmp15, float_call18, const_int32_23, "tmp20", label_entry_53);
+  CallInst* float_call23 = new CallInst(func_cosf, float_tmp1, "call23", label_entry_53);
+  float_call23->setCallingConv(CallingConv::C);
+  float_call23->setTailCall(true);const ParamAttrsList *float_call23_PAL = 0;
+  float_call23->setParamAttrs(float_call23_PAL);
+  
+  InsertElementInst* packed_tmp25 = new InsertElementInst(packed_tmp20, float_call23, const_int32_25, "tmp25", label_entry_53);
+  CallInst* float_call28 = new CallInst(func_cosf, float_tmp1, "call28", label_entry_53);
+  float_call28->setCallingConv(CallingConv::C);
+  float_call28->setTailCall(true);const ParamAttrsList *float_call28_PAL = 0;
+  float_call28->setParamAttrs(float_call28_PAL);
+  
+  InsertElementInst* packed_tmp30 = new InsertElementInst(packed_tmp25, float_call28, const_int32_24, "tmp30", label_entry_53);
+  CastInst* double_conv33 = new FPExtInst(float_call13, Type::DoubleTy, "conv33", label_entry_53);
+  CastInst* double_conv36 = new FPExtInst(float_call18, Type::DoubleTy, "conv36", label_entry_53);
+  CastInst* double_conv39 = new FPExtInst(float_call23, Type::DoubleTy, "conv39", label_entry_53);
+  CastInst* double_conv42 = new FPExtInst(float_call28, Type::DoubleTy, "conv42", label_entry_53);
+  std::vector<Value*> int32_call43_params;
+  int32_call43_params.push_back(const_ptr_36);
+  int32_call43_params.push_back(double_conv33);
+  int32_call43_params.push_back(double_conv36);
+  int32_call43_params.push_back(double_conv39);
+  int32_call43_params.push_back(double_conv42);
+  CallInst* int32_call43 = new CallInst(func_printf, int32_call43_params.begin(), int32_call43_params.end(), "call43", label_entry_53);
+  int32_call43->setCallingConv(CallingConv::C);
+  int32_call43->setTailCall(true);const ParamAttrsList *int32_call43_PAL = 0;
+  int32_call43->setParamAttrs(int32_call43_PAL);
+  
+  new ReturnInst(packed_tmp30, label_entry_53);
+  
+}
+
+// Function: scs (func_scs)
+{
+  Function::arg_iterator args = func_scs->arg_begin();
+  Value* packed_val_58 = args++;
+  packed_val_58->setName("val");
+  
+  BasicBlock* label_entry_59 = new BasicBlock("entry",func_scs,0);
+  
+  // Block entry (label_entry_59)
+  ExtractElementInst* float_tmp2 = new ExtractElementInst(packed_val_58, const_int32_19, "tmp2", label_entry_59);
+  CallInst* float_call_60 = new CallInst(func_cosf, float_tmp2, "call", label_entry_59);
+  float_call_60->setCallingConv(CallingConv::C);
+  float_call_60->setTailCall(true);const ParamAttrsList *float_call_60_PAL = 0;
+  float_call_60->setParamAttrs(float_call_60_PAL);
+  
+  InsertElementInst* packed_tmp5 = new InsertElementInst(const_packed_35, float_call_60, const_int32_19, "tmp5", label_entry_59);
+  CallInst* float_call7 = new CallInst(func_sinf, float_tmp2, "call7", label_entry_59);
+  float_call7->setCallingConv(CallingConv::C);
+  float_call7->setTailCall(true);const ParamAttrsList *float_call7_PAL = 0;
+  float_call7->setParamAttrs(float_call7_PAL);
+  
+  InsertElementInst* packed_tmp9 = new InsertElementInst(packed_tmp5, float_call7, const_int32_23, "tmp9", label_entry_59);
+  new ReturnInst(packed_tmp9, label_entry_59);
+  
+}
+
+// Function: vsin (func_vsin)
+{
+  Function::arg_iterator args = func_vsin->arg_begin();
+  Value* packed_val_62 = args++;
+  packed_val_62->setName("val");
+  
+  BasicBlock* label_entry_63 = new BasicBlock("entry",func_vsin,0);
+  
+  // Block entry (label_entry_63)
+  ExtractElementInst* float_tmp2_64 = new ExtractElementInst(packed_val_62, const_int32_19, "tmp2", label_entry_63);
+  CallInst* float_call_65 = new CallInst(func_sinf, float_tmp2_64, "call", label_entry_63);
+  float_call_65->setCallingConv(CallingConv::C);
+  float_call_65->setTailCall(true);const ParamAttrsList *float_call_65_PAL = 0;
+  float_call_65->setParamAttrs(float_call_65_PAL);
+  
+  InsertElementInst* packed_tmp6 = new InsertElementInst(const_packed_35, float_call_65, const_int32_19, "tmp6", label_entry_63);
+  InsertElementInst* packed_tmp9_66 = new InsertElementInst(packed_tmp6, float_call_65, const_int32_23, "tmp9", label_entry_63);
+  InsertElementInst* packed_tmp12 = new InsertElementInst(packed_tmp9_66, float_call_65, const_int32_25, "tmp12", label_entry_63);
+  InsertElementInst* packed_tmp15_67 = new InsertElementInst(packed_tmp12, float_call_65, const_int32_24, "tmp15", label_entry_63);
+  new ReturnInst(packed_tmp15_67, label_entry_63);
+  
+}
+
+// Function: kilp (func_kilp)
+{
+  Function::arg_iterator args = func_kilp->arg_begin();
+  Value* packed_val_69 = args++;
+  packed_val_69->setName("val");
+  
+  BasicBlock* label_entry_70 = new BasicBlock("entry",func_kilp,0);
+  BasicBlock* label_lor_rhs = new BasicBlock("lor_rhs",func_kilp,0);
+  BasicBlock* label_lor_rhs5 = new BasicBlock("lor_rhs5",func_kilp,0);
+  BasicBlock* label_lor_rhs11 = new BasicBlock("lor_rhs11",func_kilp,0);
+  BasicBlock* label_UnifiedReturnBlock_71 = new BasicBlock("UnifiedReturnBlock",func_kilp,0);
+  
+  // Block entry (label_entry_70)
+  ExtractElementInst* float_tmp1_72 = new ExtractElementInst(packed_val_69, const_int32_19, "tmp1", label_entry_70);
+  FCmpInst* int1_cmp_73 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp1_72, const_float_18, "cmp", label_entry_70);
+  new BranchInst(label_UnifiedReturnBlock_71, label_lor_rhs, int1_cmp_73, label_entry_70);
+  
+  // Block lor_rhs (label_lor_rhs)
+  ExtractElementInst* float_tmp3_75 = new ExtractElementInst(packed_val_69, const_int32_23, "tmp3", label_lor_rhs);
+  FCmpInst* int1_cmp4 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp3_75, const_float_18, "cmp4", label_lor_rhs);
+  new BranchInst(label_UnifiedReturnBlock_71, label_lor_rhs5, int1_cmp4, label_lor_rhs);
+  
+  // Block lor_rhs5 (label_lor_rhs5)
+  ExtractElementInst* float_tmp7 = new ExtractElementInst(packed_val_69, const_int32_25, "tmp7", label_lor_rhs5);
+  FCmpInst* int1_cmp8 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp7, const_float_18, "cmp8", label_lor_rhs5);
+  new BranchInst(label_UnifiedReturnBlock_71, label_lor_rhs11, int1_cmp8, label_lor_rhs5);
+  
+  // Block lor_rhs11 (label_lor_rhs11)
+  ExtractElementInst* float_tmp13 = new ExtractElementInst(packed_val_69, const_int32_24, "tmp13", label_lor_rhs11);
+  FCmpInst* int1_cmp14 = new FCmpInst(FCmpInst::FCMP_OLT, float_tmp13, const_float_18, "cmp14", label_lor_rhs11);
+  CastInst* int32_retval = new ZExtInst(int1_cmp14, IntegerType::get(32), "retval", label_lor_rhs11);
+  new ReturnInst(int32_retval, label_lor_rhs11);
+  
+  // Block UnifiedReturnBlock (label_UnifiedReturnBlock_71)
+  new ReturnInst(const_int32_23, label_UnifiedReturnBlock_71);
+  
+}
+
+return mod;
+
+}
diff --git a/src/gallium/auxiliary/llvm/gallivm_cpu.cpp b/src/gallium/auxiliary/llvm/gallivm_cpu.cpp
new file mode 100644
index 00000000000..8f9830d0b1e
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/gallivm_cpu.cpp
@@ -0,0 +1,202 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+#ifdef MESA_LLVM
+
+#include "gallivm.h"
+#include "gallivm_p.h"
+
+#include "instructions.h"
+#include "loweringpass.h"
+#include "storage.h"
+#include "tgsitollvm.h"
+
+#include "pipe/p_context.h"
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_dump.h"
+
+#include <llvm/Module.h>
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/ModuleProvider.h>
+#include <llvm/Pass.h>
+#include <llvm/PassManager.h>
+#include <llvm/ParameterAttributes.h>
+#include <llvm/Support/PatternMatch.h>
+#include <llvm/ExecutionEngine/JIT.h>
+#include <llvm/ExecutionEngine/Interpreter.h>
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/LinkAllPasses.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Analysis/LoopPass.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+struct gallivm_cpu_engine {
+   llvm::ExecutionEngine *engine;
+};
+
+static struct gallivm_cpu_engine *CPU = 0;
+
+typedef int (*fragment_shader_runner)(float x, float y,
+                                      float (*dests)[16][4],
+                                      float (*inputs)[16][4],
+                                      int num_attribs,
+                                      float (*consts)[4], int num_consts,
+                                      struct tgsi_sampler *samplers);
+
+int gallivm_cpu_fs_exec(struct gallivm_prog *prog,
+                        float fx, float fy,
+                        float (*dests)[16][4],
+                        float (*inputs)[16][4],
+                        float (*consts)[4],
+                        struct tgsi_sampler *samplers)
+{
+   fragment_shader_runner runner = reinterpret_cast<fragment_shader_runner>(prog->function);
+   assert(runner);
+
+   return runner(fx, fy, dests, inputs, prog->num_interp,
+                 consts, prog->num_consts,
+                 samplers);
+}
+
+static inline llvm::Function *func_for_shader(struct gallivm_prog *prog)
+{
+   llvm::Module *mod = prog->module;
+   llvm::Function *func = 0;
+
+   switch (prog->type) {
+   case GALLIVM_VS:
+      func = mod->getFunction("vs_shader");
+      break;
+   case GALLIVM_FS:
+      func = mod->getFunction("fs_shader");
+      break;
+   default:
+      assert(!"Unknown shader type!");
+      break;
+   }
+   return func;
+}
+
+/*!
+  This function creates a CPU based execution engine for the given gallivm_prog.
+  gallivm_cpu_engine should be used as a singleton throughout the library. Before
+  executing gallivm_prog_exec one needs to call gallivm_cpu_jit_compile.
+  The gallivm_prog instance which is being passed to the constructor is being
+  automatically JIT compiled so one shouldn't call gallivm_cpu_jit_compile
+  with it again.
+ */
+struct gallivm_cpu_engine * gallivm_cpu_engine_create(struct gallivm_prog *prog)
+{
+   struct gallivm_cpu_engine *cpu = (struct gallivm_cpu_engine *)
+                                    calloc(1, sizeof(struct gallivm_cpu_engine));
+   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
+   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
+   llvm::ExecutionEngine *ee = llvm::ExecutionEngine::create(mp, false);
+   ee->DisableLazyCompilation();
+   cpu->engine = ee;
+
+   llvm::Function *func = func_for_shader(prog);
+
+   prog->function = ee->getPointerToFunction(func);
+   CPU = cpu;
+   return cpu;
+}
+
+
+/*!
+  This function JIT compiles the given gallivm_prog with the given cpu based execution engine.
+  The reference to the generated machine code entry point will be stored
+  in the gallivm_prog program. After executing this function one can call gallivm_prog_exec
+  in order to execute the gallivm_prog on the CPU.
+ */
+void gallivm_cpu_jit_compile(struct gallivm_cpu_engine *cpu, struct gallivm_prog *prog)
+{
+   llvm::Module *mod = static_cast<llvm::Module*>(prog->module);
+   llvm::ExistingModuleProvider *mp = new llvm::ExistingModuleProvider(mod);
+   llvm::ExecutionEngine *ee = cpu->engine;
+   assert(ee);
+   /*FIXME : remove */
+   ee->DisableLazyCompilation();
+   ee->addModuleProvider(mp);
+
+   llvm::Function *func = func_for_shader(prog);
+   prog->function = ee->getPointerToFunction(func);
+}
+
+void gallivm_cpu_engine_delete(struct gallivm_cpu_engine *cpu)
+{
+   free(cpu);
+}
+
+struct gallivm_cpu_engine * gallivm_global_cpu_engine()
+{
+   return CPU;
+}
+
+
+typedef void (*vertex_shader_runner)(void *ainputs,
+                                     void *dests,
+                                     float (*aconsts)[4],
+                                     void *temps);
+
+
+/*!
+  This function is used to execute the gallivm_prog in software. Before calling
+  this function the gallivm_prog has to be JIT compiled with the gallivm_cpu_jit_compile
+  function.
+ */
+int gallivm_cpu_vs_exec(struct gallivm_prog *prog,
+                        struct tgsi_exec_vector       *inputs,
+                        struct tgsi_exec_vector       *dests,
+                        float (*consts)[4],
+                        struct tgsi_exec_vector       *temps)
+{
+   vertex_shader_runner runner = reinterpret_cast<vertex_shader_runner>(prog->function);
+   assert(runner);
+   /*FIXME*/
+   runner(inputs, dests, consts, temps);
+
+   return 0;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/gallivm_p.h b/src/gallium/auxiliary/llvm/gallivm_p.h
new file mode 100644
index 00000000000..cfe7b1901b3
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/gallivm_p.h
@@ -0,0 +1,110 @@
+#ifndef GALLIVM_P_H
+#define GALLIVM_P_H
+
+#ifdef MESA_LLVM
+
+#include "gallivm.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_compiler.h"
+
+namespace llvm {
+   class Module;
+}
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+enum gallivm_shader_type;
+enum gallivm_vector_layout;
+
+struct gallivm_interpolate {
+   int attrib;
+   int chan;
+   int type;
+};
+
+struct gallivm_ir {
+   llvm::Module *module;
+   int id;
+   enum gallivm_shader_type type;
+   enum gallivm_vector_layout layout;
+   int num_components;
+   int   num_consts;
+
+   //FIXME: this might not be enough for some shaders
+   struct gallivm_interpolate interpolators[32*4];
+   int   num_interp;
+};
+
+struct gallivm_prog {
+   llvm::Module *module;
+   void *function;
+
+   int   id;
+   enum gallivm_shader_type type;
+
+   int   num_consts;
+
+   //FIXME: this might not be enough for some shaders
+   struct gallivm_interpolate interpolators[32*4];
+   int   num_interp;
+};
+
+static INLINE void gallivm_swizzle_components(int swizzle,
+                                              int *xc, int *yc,
+                                              int *zc, int *wc)
+{
+   int x = swizzle / 1000; swizzle -= x * 1000;
+   int y = swizzle / 100;  swizzle -= y * 100;
+   int z = swizzle / 10;   swizzle -= z * 10;
+   int w = swizzle;
+
+   if (xc) *xc = x;
+   if (yc) *yc = y;
+   if (zc) *zc = z;
+   if (wc) *wc = w;
+}
+
+static INLINE boolean gallivm_is_swizzle(int swizzle)
+{
+   const int NO_SWIZZLE = TGSI_SWIZZLE_X * 1000 + TGSI_SWIZZLE_Y * 100 +
+                          TGSI_SWIZZLE_Z * 10 + TGSI_SWIZZLE_W;
+   return swizzle != NO_SWIZZLE;
+}
+
+static INLINE int gallivm_x_swizzle(int swizzle)
+{
+   int x;
+   gallivm_swizzle_components(swizzle, &x, 0, 0, 0);
+   return x;
+}
+
+static INLINE int gallivm_y_swizzle(int swizzle)
+{
+   int y;
+   gallivm_swizzle_components(swizzle, 0, &y, 0, 0);
+   return y;
+}
+
+static INLINE int gallivm_z_swizzle(int swizzle)
+{
+   int z;
+   gallivm_swizzle_components(swizzle, 0, 0, &z, 0);
+   return z;
+}
+
+static INLINE int gallivm_w_swizzle(int swizzle)
+{
+   int w;
+   gallivm_swizzle_components(swizzle, 0, 0, 0, &w);
+   return w;
+}
+
+#endif /* MESA_LLVM */
+
+#if defined __cplusplus
+} // extern "C"
+#endif
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/instructions.cpp b/src/gallium/auxiliary/llvm/instructions.cpp
new file mode 100644
index 00000000000..55d39fa5f12
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/instructions.cpp
@@ -0,0 +1,889 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+#ifdef MESA_LLVM
+
+#include "instructions.h"
+
+#include "storage.h"
+
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Function.h>
+#include <llvm/InstrTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/ParameterAttributes.h>
+
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+using namespace llvm;
+
+#include "gallivm_builtins.cpp"
+
+static inline std::string createFuncName(int label)
+{
+   std::ostringstream stream;
+   stream << "function";
+   stream << label;
+   return stream.str();
+}
+
+Instructions::Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicBlock *block,
+                           Storage *storage)
+   :  m_mod(mod), m_func(func), m_builder(block), m_idx(0),
+      m_storage(storage)
+{
+   m_floatVecType = VectorType::get(Type::FloatTy, 4);
+
+   m_llvmFSqrt = 0;
+   m_llvmFAbs  = 0;
+   m_llvmPow   = 0;
+   m_llvmFloor = 0;
+   m_llvmFlog  = 0;
+   m_llvmLit  = 0;
+   m_fmtPtr = 0;
+
+   createGallivmBuiltins(m_mod);
+}
+
+llvm::Value * Instructions::add(llvm::Value *in1, llvm::Value *in2)
+{
+   return m_builder.CreateAdd(in1, in2, name("add"));
+}
+
+llvm::Value * Instructions::madd(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   Value *mulRes = mul(in1, in2);
+   return add(mulRes, in3);
+}
+ 
+llvm::Value * Instructions::mul(llvm::Value *in1, llvm::Value *in2)
+{
+   return m_builder.CreateMul(in1, in2, name("mul"));
+}
+
+const char * Instructions::name(const char *prefix)
+{
+   ++m_idx;
+   snprintf(m_name, 32, "%s%d", prefix, m_idx);
+   return m_name;
+}
+
+llvm::Value * Instructions::dp3(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   Value *x = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(0),
+                                                          name("extractx"));
+   Value *y = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(1),
+                                                          name("extracty"));
+   Value *z = m_builder.CreateExtractElement(mulRes,
+                                                          m_storage->constantInt(2),
+                                                          name("extractz"));
+   Value *xy = m_builder.CreateAdd(x, y,name("xy"));
+   Value *dot3 = m_builder.CreateAdd(xy, z, name("dot3"));
+   return vectorFromVals(dot3, dot3, dot3, dot3);
+}
+
+llvm::Value *Instructions::callFSqrt(llvm::Value *val)
+{
+   if (!m_llvmFSqrt) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fsqrtArgs;
+      fsqrtArgs.push_back(Type::FloatTy);
+      ParamAttrsList *fsqrtPal = 0;
+      FunctionType* fsqrtType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fsqrtArgs,
+         /*isVarArg=*/false);
+      m_llvmFSqrt = new Function(
+         /*Type=*/fsqrtType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.sqrt.f32", m_mod);
+      m_llvmFSqrt->setCallingConv(CallingConv::C);
+      m_llvmFSqrt->setParamAttrs(fsqrtPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFSqrt, val,
+                                         name("sqrt"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::rsq(llvm::Value *in1)
+{
+   Value *x = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *abs  = callFAbs(x);
+   Value *sqrt = callFSqrt(abs);
+
+   Value *rsqrt = m_builder.CreateFDiv(ConstantFP::get(Type::FloatTy,
+                                                                APFloat(1.f)),
+                                                sqrt,
+                                                name("rsqrt"));
+   return vectorFromVals(rsqrt, rsqrt, rsqrt, rsqrt);
+}
+
+llvm::Value * Instructions::vectorFromVals(llvm::Value *x, llvm::Value *y,
+                                           llvm::Value *z, llvm::Value *w)
+{
+   Constant *const_vec = Constant::getNullValue(m_floatVecType);
+   Value *res = m_builder.CreateInsertElement(const_vec, x,
+                                              m_storage->constantInt(0),
+                                              name("vecx"));
+   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
+                               name("vecxy"));
+   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
+                               name("vecxyz"));
+   if (w)
+      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
+                                          name("vecxyzw"));
+   return res;
+}
+
+llvm::Value *Instructions::callFAbs(llvm::Value *val)
+{
+   if (!m_llvmFAbs) {
+      // predeclare the intrinsic
+      std::vector<const Type*> fabsArgs;
+      fabsArgs.push_back(Type::FloatTy);
+      ParamAttrsList *fabsPal = 0;
+      FunctionType* fabsType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/fabsArgs,
+         /*isVarArg=*/false);
+      m_llvmFAbs = new Function(
+         /*Type=*/fabsType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"fabs", m_mod);
+      m_llvmFAbs->setCallingConv(CallingConv::C);
+      m_llvmFAbs->setParamAttrs(fabsPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFAbs, val,
+                                         name("fabs"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::lit(llvm::Value *in)
+{
+   if (!m_llvmLit) {
+      m_llvmLit = m_mod->getFunction("lit");
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmLit, in, name("litres"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::sub(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *res = m_builder.CreateSub(in1, in2, name("sub"));
+   return res;
+}
+
+llvm::Value * Instructions::callPow(llvm::Value *val1, llvm::Value *val2)
+{
+   if (!m_llvmPow) {
+      // predeclare the intrinsic
+      std::vector<const Type*> powArgs;
+      powArgs.push_back(Type::FloatTy);
+      powArgs.push_back(Type::FloatTy);
+      ParamAttrsList *powPal = 0;
+      FunctionType* powType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/powArgs,
+         /*isVarArg=*/false);
+      m_llvmPow = new Function(
+         /*Type=*/powType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"llvm.pow.f32", m_mod);
+      m_llvmPow->setCallingConv(CallingConv::C);
+      m_llvmPow->setParamAttrs(powPal);
+   }
+   std::vector<Value*> params;
+   params.push_back(val1);
+   params.push_back(val2);
+   CallInst *call = m_builder.CreateCall(m_llvmPow, params.begin(), params.end(),
+                                         name("pow"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::pow(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *x2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(0),
+                                              name("x2"));
+   llvm::Value *val = callPow(x1, x2);
+   return vectorFromVals(val, val, val, val);
+}
+
+llvm::Value * Instructions::rcp(llvm::Value *in1)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *res = m_builder.CreateFDiv(ConstantFP::get(Type::FloatTy,
+                                                              APFloat(1.f)),
+                                              x1, name("rcp"));
+   return vectorFromVals(res, res, res, res);
+}
+
+llvm::Value * Instructions::dp4(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   std::vector<llvm::Value*> vec = extractVector(mulRes);
+   Value *xy = m_builder.CreateAdd(vec[0], vec[1], name("xy"));
+   Value *xyz = m_builder.CreateAdd(xy, vec[2], name("xyz"));
+   Value *dot4 = m_builder.CreateAdd(xyz, vec[3], name("dot4"));
+   return vectorFromVals(dot4, dot4, dot4, dot4);
+}
+
+llvm::Value * Instructions::dph(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *mulRes = mul(in1, in2);
+   std::vector<llvm::Value*> vec1 = extractVector(mulRes);
+   Value *xy = m_builder.CreateAdd(vec1[0], vec1[1], name("xy"));
+   Value *xyz = m_builder.CreateAdd(xy, vec1[2], name("xyz"));
+   Value *dph = m_builder.CreateAdd(xyz, vec1[3], name("dph"));
+   return vectorFromVals(dph, dph, dph, dph);
+}
+
+llvm::Value * Instructions::dst(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z = m_builder.CreateExtractElement(in1,
+                                             m_storage->constantInt(2),
+                                             name("z"));
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *w = m_builder.CreateExtractElement(in2,
+                                             m_storage->constantInt(3),
+                                             name("w"));
+   Value *ry = m_builder.CreateMul(y1, y2, name("tyuy"));
+   return vectorFromVals(ConstantFP::get(Type::FloatTy, APFloat(1.f)),
+                         ry, z, w);
+}
+
+llvm::Value * Instructions::ex2(llvm::Value *in)
+{
+   llvm::Value *val = callPow(ConstantFP::get(Type::FloatTy, APFloat(2.f)),
+                              m_builder.CreateExtractElement(
+                                 in, m_storage->constantInt(0),
+                                 name("x1")));
+   return vectorFromVals(val, val, val, val);
+}
+
+llvm::Value * Instructions::callFloor(llvm::Value *val)
+{
+   if (!m_llvmFloor) {
+      // predeclare the intrinsic
+      std::vector<const Type*> floorArgs;
+      floorArgs.push_back(Type::FloatTy);
+      ParamAttrsList *floorPal = 0;
+      FunctionType* floorType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/floorArgs,
+         /*isVarArg=*/false);
+      m_llvmFloor = new Function(
+         /*Type=*/floorType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"floorf", m_mod);
+      m_llvmFloor->setCallingConv(CallingConv::C);
+      m_llvmFloor->setParamAttrs(floorPal);
+   }
+   CallInst *call =  m_builder.CreateCall(m_llvmFloor, val,
+                                          name("floorf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::floor(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   return vectorFromVals(callFloor(vec[0]), callFloor(vec[1]),
+                         callFloor(vec[2]), callFloor(vec[3]));
+}
+
+llvm::Value * Instructions::arl(llvm::Value *in)
+{
+   return floor(in);
+}
+
+llvm::Value * Instructions::frc(llvm::Value *in)
+{
+   llvm::Value *flr = floor(in);
+   return sub(in, flr);
+}
+
+llvm::Value * Instructions::callFLog(llvm::Value *val)
+{
+   if (!m_llvmFlog) {
+      // predeclare the intrinsic
+      std::vector<const Type*> flogArgs;
+      flogArgs.push_back(Type::FloatTy);
+      ParamAttrsList *flogPal = 0;
+      FunctionType* flogType = FunctionType::get(
+         /*Result=*/Type::FloatTy,
+         /*Params=*/flogArgs,
+         /*isVarArg=*/false);
+      m_llvmFlog = new Function(
+         /*Type=*/flogType,
+         /*Linkage=*/GlobalValue::ExternalLinkage,
+         /*Name=*/"logf", m_mod);
+      m_llvmFlog->setCallingConv(CallingConv::C);
+      m_llvmFlog->setParamAttrs(flogPal);
+   }
+   CallInst *call = m_builder.CreateCall(m_llvmFlog, val,
+                                         name("logf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::lg2(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   llvm::Value *const_vec = constVector(1.442695f, 1.442695f,
+                                        1.442695f, 1.442695f);
+   return mul(vectorFromVals(callFLog(vec[0]), callFLog(vec[1]),
+                             callFLog(vec[2]), callFLog(vec[3])), const_vec);
+}
+
+llvm::Value * Instructions::min(llvm::Value *in1, llvm::Value *in2)
+{
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp  = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
+}
+
+llvm::Value * Instructions::max(llvm::Value *in1, llvm::Value *in2)
+{
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp  = m_builder.CreateFCmpOGT(vec1[0], vec2[0],
+                                          name("xcmp"));
+   Value *selx = m_builder.CreateSelect(xcmp, vec1[0], vec2[0],
+                                        name("selx"));
+
+   Value *ycmp  = m_builder.CreateFCmpOGT(vec1[1], vec2[1],
+                                          name("ycmp"));
+   Value *sely = m_builder.CreateSelect(ycmp, vec1[1], vec2[1],
+                                        name("sely"));
+
+   Value *zcmp  = m_builder.CreateFCmpOGT(vec1[2], vec2[2],
+                                          name("zcmp"));
+   Value *selz = m_builder.CreateSelect(zcmp, vec1[2], vec2[2],
+                                        name("selz"));
+
+   Value *wcmp  = m_builder.CreateFCmpOGT(vec1[3], vec2[3],
+                                          name("wcmp"));
+   Value *selw = m_builder.CreateSelect(wcmp, vec1[3], vec2[3],
+                                        name("selw"));
+
+   return vectorFromVals(selx, sely, selz, selw);
+}
+
+void Instructions::printVector(llvm::Value *val)
+{
+   static const char *frmt = "Vector is [%f, %f, %f, %f]\x0A";
+
+   if (!m_fmtPtr) {
+      Constant *format = ConstantArray::get(frmt, true);
+      ArrayType *arrayTy = ArrayType::get(IntegerType::get(8), strlen(frmt) + 1);
+      GlobalVariable* globalFormat = new GlobalVariable(
+         /*Type=*/arrayTy,
+         /*isConstant=*/true,
+         /*Linkage=*/GlobalValue::InternalLinkage,
+         /*Initializer=*/0, // has initializer, specified below
+         /*Name=*/name(".str"),
+         m_mod);
+      globalFormat->setInitializer(format);
+
+      Constant* const_int0 = Constant::getNullValue(IntegerType::get(32));
+      std::vector<Constant*> const_ptr_21_indices;
+      const_ptr_21_indices.push_back(const_int0);
+      const_ptr_21_indices.push_back(const_int0);
+      m_fmtPtr = ConstantExpr::getGetElementPtr(globalFormat,
+                                                &const_ptr_21_indices[0], const_ptr_21_indices.size());
+   }
+
+   Function *func_printf = m_mod->getFunction("printf");
+   if (!func_printf)
+      func_printf = declarePrintf();
+   assert(func_printf);
+   std::vector<llvm::Value*> vec = extractVector(val);
+   Value *dx = m_builder.CreateFPExt(vec[0], Type::DoubleTy, name("dx"));
+   Value *dy = m_builder.CreateFPExt(vec[1], Type::DoubleTy, name("dy"));
+   Value *dz = m_builder.CreateFPExt(vec[2], Type::DoubleTy, name("dz"));
+   Value *dw = m_builder.CreateFPExt(vec[3], Type::DoubleTy, name("dw"));
+   std::vector<Value*> params;
+   params.push_back(m_fmtPtr);
+   params.push_back(dx);
+   params.push_back(dy);
+   params.push_back(dz);
+   params.push_back(dw);
+   CallInst *call = m_builder.CreateCall(func_printf, params.begin(), params.end(),
+                                         name("printf"));
+   call->setCallingConv(CallingConv::C);
+   call->setTailCall(true);
+}
+
+llvm::Function * Instructions::declarePrintf()
+{
+   std::vector<const Type*> args;
+   ParamAttrsList *params = 0;
+   FunctionType* funcTy = FunctionType::get(
+      /*Result=*/IntegerType::get(32),
+      /*Params=*/args,
+      /*isVarArg=*/true);
+   Function* func_printf = new Function(
+      /*Type=*/funcTy,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Name=*/"printf", m_mod);
+   func_printf->setCallingConv(CallingConv::C);
+   func_printf->setParamAttrs(params);
+   return func_printf;
+}
+
+
+llvm::Value * Instructions::sgt(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(Type::FloatTy, APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+   Value *xcmp = m_builder.CreateFCmpOGT(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+
+   Value *ycmp = m_builder.CreateFCmpOGT(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpOGT(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOGT(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
+}
+llvm::Value * Instructions::sge(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(Type::FloatTy, APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp = m_builder.CreateFCmpOGE(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+
+   Value *ycmp = m_builder.CreateFCmpOGE(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpOGE(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOGE(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
+}
+
+
+llvm::Value * Instructions::slt(llvm::Value *in1, llvm::Value *in2)
+{
+   Constant *const1f = ConstantFP::get(Type::FloatTy, APFloat(1.000000e+00f));
+   Constant *const0f = Constant::getNullValue(Type::FloatTy);
+
+   std::vector<llvm::Value*> vec1 = extractVector(in1);
+   std::vector<llvm::Value*> vec2 = extractVector(in2);
+
+   Value *xcmp = m_builder.CreateFCmpOLT(vec1[0], vec2[0], name("xcmp"));
+   Value *x = m_builder.CreateSelect(xcmp, const1f, const0f, name("xsel"));
+
+   Value *ycmp = m_builder.CreateFCmpOLT(vec1[1], vec2[1], name("ycmp"));
+   Value *y = m_builder.CreateSelect(ycmp, const1f, const0f, name("ysel"));
+
+   Value *zcmp = m_builder.CreateFCmpOLT(vec1[2], vec2[2], name("zcmp"));
+   Value *z = m_builder.CreateSelect(zcmp, const1f, const0f, name("zsel"));
+
+   Value *wcmp = m_builder.CreateFCmpOLT(vec1[3], vec2[3], name("wcmp"));
+   Value *w = m_builder.CreateSelect(wcmp, const1f, const0f, name("wsel"));
+
+   return vectorFromVals(x, y, z, w);
+}
+
+llvm::Value * Instructions::cross(llvm::Value *in1, llvm::Value *in2)
+{
+   Value *x1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(0),
+                                              name("x1"));
+   Value *y1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(1),
+                                              name("y1"));
+   Value *z1 = m_builder.CreateExtractElement(in1,
+                                              m_storage->constantInt(2),
+                                              name("z1"));
+
+   Value *x2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(0),
+                                              name("x2"));
+   Value *y2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(1),
+                                              name("y2"));
+   Value *z2 = m_builder.CreateExtractElement(in2,
+                                              m_storage->constantInt(2),
+                                              name("z2"));
+   Value *y1z2 = mul(y1, z2);
+   Value *z1y2 = mul(z1, y2);
+
+   Value *z1x2 = mul(z1, x2);
+   Value *x1z2 = mul(x1, z2);
+
+   Value *x1y2 = mul(x1, y2);
+   Value *y1x2 = mul(y1, x2);
+
+   return vectorFromVals(sub(y1z2, z1y2), sub(z1x2, x1z2), sub(x1y2, y1x2));
+}
+
+
+llvm::Value * Instructions::abs(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   Value *xabs  = callFAbs(vec[0]);
+   Value *yabs  = callFAbs(vec[1]);
+   Value *zabs  = callFAbs(vec[2]);
+   Value *wabs  = callFAbs(vec[3]);
+   return vectorFromVals(xabs, yabs, zabs, wabs);
+}
+
+void Instructions::ifop(llvm::Value *in)
+{
+   BasicBlock *ifthen = new BasicBlock(name("ifthen"), m_func,0);
+   BasicBlock *ifend = new BasicBlock(name("ifthenend"), m_func,0);
+
+   //BasicBlock *yblock = new BasicBlock(name("yblock"), m_func,0);
+   //BasicBlock *zblock = new BasicBlock(name("zblock"), m_func,0);
+   //BasicBlock *wblock = new BasicBlock(name("wblock"), m_func,0);
+
+   Constant *float0 = Constant::getNullValue(Type::FloatTy);
+
+   Value *x = m_builder.CreateExtractElement(in, m_storage->constantInt(0),
+                                             name("extractx"));
+   Value *xcmp = m_builder.CreateFCmpUNE(x, float0, name("xcmp"));
+   m_builder.CreateCondBr(xcmp, ifthen, ifend);
+   //m_builder.SetInsertPoint(yblock);
+
+   m_builder.SetInsertPoint(ifthen);
+   m_ifStack.push(ifend);
+}
+
+llvm::BasicBlock * Instructions::currentBlock() const
+{
+   return m_builder.GetInsertBlock();
+}
+
+void Instructions::elseop()
+{
+   assert(!m_ifStack.empty());
+   BasicBlock *ifend = new BasicBlock(name("ifend"), m_func,0);
+   m_builder.CreateBr(ifend);
+   m_builder.SetInsertPoint(m_ifStack.top());
+   currentBlock()->setName(name("ifelse"));
+   m_ifStack.pop();
+   m_ifStack.push(ifend);
+}
+
+void Instructions::endif()
+{
+   assert(!m_ifStack.empty());
+   m_builder.CreateBr(m_ifStack.top());
+   m_builder.SetInsertPoint(m_ifStack.top());
+   m_ifStack.pop();
+}
+
+llvm::Value * Instructions::lerp(llvm::Value *in1, llvm::Value *in2,
+                                 llvm::Value *in3)
+{
+   llvm::Value *m = mul(in1, in2);
+   llvm::Value *vec1 = constVector(1.f, 1.f, 1.f, 1.f);
+   llvm::Value *s = sub(vec1, in1);
+   return add(m, mul(s, in3));
+}
+
+void Instructions::beginLoop()
+{
+   BasicBlock *begin = new BasicBlock(name("loop"), m_func,0);
+   BasicBlock *end = new BasicBlock(name("endloop"), m_func,0);
+
+   m_builder.CreateBr(begin);
+   Loop loop;
+   loop.begin = begin;
+   loop.end   = end;
+   m_builder.SetInsertPoint(begin);
+   m_loopStack.push(loop);
+}
+
+void Instructions::endLoop()
+{
+   assert(!m_loopStack.empty());
+   Loop loop = m_loopStack.top();
+   m_builder.CreateBr(loop.begin);
+   loop.end->moveAfter(currentBlock());
+   m_builder.SetInsertPoint(loop.end);
+   m_loopStack.pop();
+}
+
+void Instructions::brk()
+{
+   assert(!m_loopStack.empty());
+   BasicBlock *unr = new BasicBlock(name("unreachable"), m_func,0);
+   m_builder.CreateBr(m_loopStack.top().end);
+   m_builder.SetInsertPoint(unr);
+}
+
+llvm::Value * Instructions::trunc(llvm::Value *in)
+{
+   std::vector<llvm::Value*> vec = extractVector(in);
+   Value *icastx = m_builder.CreateFPToSI(vec[0], IntegerType::get(32),
+                                          name("ftoix"));
+   Value *icasty = m_builder.CreateFPToSI(vec[1], IntegerType::get(32),
+                                          name("ftoiy"));
+   Value *icastz = m_builder.CreateFPToSI(vec[2], IntegerType::get(32),
+                                          name("ftoiz"));
+   Value *icastw = m_builder.CreateFPToSI(vec[3], IntegerType::get(32),
+                                          name("ftoiw"));
+   Value *fx = m_builder.CreateSIToFP(icastx, Type::FloatTy,
+                                      name("fx"));
+   Value *fy = m_builder.CreateSIToFP(icasty, Type::FloatTy,
+                                      name("fy"));
+   Value *fz = m_builder.CreateSIToFP(icastz, Type::FloatTy,
+                                      name("fz"));
+   Value *fw = m_builder.CreateSIToFP(icastw, Type::FloatTy,
+                                      name("fw"));
+   return vectorFromVals(fx, fy, fz, fw);
+}
+
+void Instructions::end()
+{
+   m_builder.CreateRetVoid();
+}
+
+void Instructions::cal(int label, llvm::Value *input)
+{
+   std::vector<Value*> params;
+   params.push_back(input);
+   llvm::Function *func = findFunction(label);
+
+   m_builder.CreateCall(func, params.begin(), params.end());
+}
+
+llvm::Function * Instructions::declareFunc(int label)
+{
+   PointerType *vecPtr = PointerType::getUnqual(m_floatVecType);
+   std::vector<const Type*> args;
+   args.push_back(vecPtr);
+   args.push_back(vecPtr);
+   args.push_back(vecPtr);
+   args.push_back(vecPtr);
+   ParamAttrsList *params = 0;
+   FunctionType *funcType = FunctionType::get(
+      /*Result=*/Type::VoidTy,
+      /*Params=*/args,
+      /*isVarArg=*/false);
+   std::string name = createFuncName(label);
+   Function *func = new Function(
+      /*Type=*/funcType,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Name=*/name.c_str(), m_mod);
+   func->setCallingConv(CallingConv::C);
+   func->setParamAttrs(params);
+   return func;
+}
+
+void Instructions::bgnSub(unsigned label)
+{
+   llvm::Function *func = findFunction(label);
+
+   Function::arg_iterator args = func->arg_begin();
+   Value *ptr_INPUT = args++;
+   ptr_INPUT->setName("INPUT");
+   m_storage->pushArguments(ptr_INPUT);
+
+   llvm::BasicBlock *entry = new BasicBlock("entry", func, 0);
+
+   m_func = func;
+   m_builder.SetInsertPoint(entry);
+}
+
+void Instructions::endSub()
+{
+   m_func = 0;
+   m_builder.SetInsertPoint(0);
+}
+
+llvm::Function * Instructions::findFunction(int label)
+{
+   llvm::Function *func = m_functions[label];
+   if (!func) {
+      func = declareFunc(label);
+      m_functions[label] = func;
+   }
+   return func;
+}
+
+llvm::Value * Instructions::constVector(float x, float y, float z, float w)
+{
+   std::vector<Constant*> vec(4);
+   vec[0] = ConstantFP::get(Type::FloatTy, APFloat(x));
+   vec[1] = ConstantFP::get(Type::FloatTy, APFloat(y));
+   vec[2] = ConstantFP::get(Type::FloatTy, APFloat(z));
+   vec[3] = ConstantFP::get(Type::FloatTy, APFloat(w));
+   return ConstantVector::get(m_floatVecType, vec);
+}
+
+
+std::vector<llvm::Value*> Instructions::extractVector(llvm::Value *vec)
+{
+   std::vector<llvm::Value*> elems(4);
+   elems[0] = m_builder.CreateExtractElement(vec, m_storage->constantInt(0),
+                                             name("x"));
+   elems[1] = m_builder.CreateExtractElement(vec, m_storage->constantInt(1),
+                                             name("y"));
+   elems[2] = m_builder.CreateExtractElement(vec, m_storage->constantInt(2),
+                                             name("z"));
+   elems[3] = m_builder.CreateExtractElement(vec, m_storage->constantInt(3),
+                                             name("w"));
+   return elems;
+}
+
+llvm::Value * Instructions::cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3)
+{
+   llvm::Function *func = m_mod->getFunction("cmp");
+   assert(func);
+
+   std::vector<Value*> params;
+   params.push_back(in1);
+   params.push_back(in2);
+   params.push_back(in3);
+   CallInst *call = m_builder.CreateCall(func, params.begin(), params.end(), name("cmpres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::cos(llvm::Value *in)
+{
+#if 0
+   llvm::Function *func = m_mod->getFunction("vcos");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("cosres"));
+   call->setTailCall(false);
+   return call;
+#else
+   std::vector<llvm::Value*> elems = extractVector(in);
+   Function *func = m_mod->getFunction("cosf");
+   assert(func);
+   CallInst *cos = m_builder.CreateCall(func, elems[0], name("cosres"));
+   cos->setCallingConv(CallingConv::C);
+   cos->setTailCall(true);
+   return vectorFromVals(cos, cos, cos, cos);
+#endif
+}
+
+llvm::Value * Instructions::scs(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("scs");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("scsres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::kilp(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("kilp");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("kilpres"));
+   call->setTailCall(false);
+   return call;
+}
+
+llvm::Value * Instructions::sin(llvm::Value *in)
+{
+   llvm::Function *func = m_mod->getFunction("vsin");
+   assert(func);
+
+   CallInst *call = m_builder.CreateCall(func, in, name("sinres"));
+   call->setTailCall(false);
+   return call;
+}
+#endif //MESA_LLVM
+
+
diff --git a/src/gallium/auxiliary/llvm/instructions.h b/src/gallium/auxiliary/llvm/instructions.h
new file mode 100644
index 00000000000..9ebc17dd8ec
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/instructions.h
@@ -0,0 +1,152 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+
+#ifndef INSTRUCTIONS_H
+#define INSTRUCTIONS_H
+
+#include <llvm/BasicBlock.h>
+#include <llvm/Module.h>
+#include <llvm/Value.h>
+#include <llvm/Support/LLVMBuilder.h>
+
+#include <map>
+#include <stack>
+
+namespace llvm {
+   class VectorType;
+   class Function;
+}
+
+class Storage;
+
+class Instructions
+{
+public:
+   Instructions(llvm::Module *mod, llvm::Function *func, llvm::BasicBlock *block,
+                Storage *storage);
+
+   llvm::BasicBlock *currentBlock() const;
+
+   llvm::Value *abs(llvm::Value *in1);
+   llvm::Value *arl(llvm::Value *in1);
+   llvm::Value *add(llvm::Value *in1, llvm::Value *in2);
+   void         beginLoop();
+   void         bgnSub(unsigned);
+   void         brk();
+   void         cal(int label, llvm::Value *input);
+   llvm::Value *cmp(llvm::Value *in1, llvm::Value *in2, llvm::Value *in3);
+   llvm::Value *cos(llvm::Value *in);
+   llvm::Value *cross(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dp3(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dp4(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dph(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *dst(llvm::Value *in1, llvm::Value *in2);
+   void         elseop();
+   void         endif();
+   void         endLoop();
+   void         end();
+   void         endSub();
+   llvm::Value *ex2(llvm::Value *in);
+   llvm::Value *floor(llvm::Value *in);
+   llvm::Value *frc(llvm::Value *in);
+   void         ifop(llvm::Value *in);
+   llvm::Value *kilp(llvm::Value *in);
+   llvm::Value *lerp(llvm::Value *in1, llvm::Value *in2,
+                     llvm::Value *in3);
+   llvm::Value *lit(llvm::Value *in);
+   llvm::Value *lg2(llvm::Value *in);
+   llvm::Value *madd(llvm::Value *in1, llvm::Value *in2,
+                     llvm::Value *in2);
+   llvm::Value *min(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *max(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *mul(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *pow(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *rcp(llvm::Value *in);
+   llvm::Value *rsq(llvm::Value *in);
+   llvm::Value *scs(llvm::Value *in);
+   llvm::Value *sge(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sgt(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sin(llvm::Value *in);
+   llvm::Value *slt(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *sub(llvm::Value *in1, llvm::Value *in2);
+   llvm::Value *trunc(llvm::Value *in);
+
+   void printVector(llvm::Value *val);
+private:
+   const char *name(const char *prefix);
+
+   llvm::Value *callFAbs(llvm::Value *val);
+   llvm::Value *callFloor(llvm::Value *val);
+   llvm::Value *callFSqrt(llvm::Value *val);
+   llvm::Value *callFLog(llvm::Value *val);
+   llvm::Value *callPow(llvm::Value *val1, llvm::Value *val2);
+
+   llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
+                               llvm::Value *z, llvm::Value *w=0);
+
+   llvm::Value *constVector(float x, float y, float z, float w);
+
+   llvm::Function *declarePrintf();
+   llvm::Function *declareFunc(int label);
+
+   llvm::Function *findFunction(int label);
+
+   std::vector<llvm::Value*> extractVector(llvm::Value *vec);
+private:
+   llvm::Module             *m_mod;
+   llvm::Function           *m_func;
+   char                      m_name[32];
+   llvm::LLVMFoldingBuilder  m_builder;
+   int                       m_idx;
+
+   llvm::VectorType *m_floatVecType;
+
+   llvm::Function   *m_llvmFSqrt;
+   llvm::Function   *m_llvmFAbs;
+   llvm::Function   *m_llvmPow;
+   llvm::Function   *m_llvmFloor;
+   llvm::Function   *m_llvmFlog;
+   llvm::Function   *m_llvmLit;
+
+   llvm::Constant   *m_fmtPtr;
+
+   std::stack<llvm::BasicBlock*> m_ifStack;
+   struct Loop {
+      llvm::BasicBlock *begin;
+      llvm::BasicBlock *end;
+   };
+   std::stack<Loop> m_loopStack;
+   std::map<int, llvm::Function*> m_functions;
+   Storage *m_storage;
+};
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/instructionssoa.cpp b/src/gallium/auxiliary/llvm/instructionssoa.cpp
new file mode 100644
index 00000000000..a4d50466373
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/instructionssoa.cpp
@@ -0,0 +1,121 @@
+#include "instructionssoa.h"
+
+#include "storagesoa.h"
+
+#include <llvm/Constants.h>
+
+using namespace llvm;
+
+InstructionsSoa::InstructionsSoa(llvm::Module *mod, llvm::Function *func,
+                                 llvm::BasicBlock *block, StorageSoa *storage)
+   : m_builder(block),
+     m_storage(storage),
+     m_idx(0)
+{
+}
+
+const char * InstructionsSoa::name(const char *prefix) const
+{
+   ++m_idx;
+   snprintf(m_name, 32, "%s%d", prefix, m_idx);
+   return m_name;
+}
+
+llvm::Value * InstructionsSoa::vectorFromVals(llvm::Value *x, llvm::Value *y,
+                                              llvm::Value *z, llvm::Value *w)
+{
+   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
+   Constant *constVector = Constant::getNullValue(vectorType);
+   Value *res = m_builder.CreateInsertElement(constVector, x,
+                                              m_storage->constantInt(0),
+                                              name("vecx"));
+   res = m_builder.CreateInsertElement(res, y, m_storage->constantInt(1),
+                               name("vecxy"));
+   res = m_builder.CreateInsertElement(res, z, m_storage->constantInt(2),
+                               name("vecxyz"));
+   if (w)
+      res = m_builder.CreateInsertElement(res, w, m_storage->constantInt(3),
+                                          name("vecxyzw"));
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::arl(const std::vector<llvm::Value*> in)
+{
+   std::vector<llvm::Value*> res(4);
+
+   //Extract x's
+   llvm::Value *x1 = m_builder.CreateExtractElement(in[0],
+                                                    m_storage->constantInt(0),
+                                                    name("extractX"));
+   //cast it to an unsigned int
+   x1 = m_builder.CreateFPToUI(x1, IntegerType::get(32), name("x1IntCast"));
+
+   res[0] = x1;//vectorFromVals(x1, x2, x3, x4);
+   //only x is valid. the others shouldn't be necessary
+   /*
+   res[1] = Constant::getNullValue(m_floatVecType);
+   res[2] = Constant::getNullValue(m_floatVecType);
+   res[3] = Constant::getNullValue(m_floatVecType);
+   */
+
+   return res;
+}
+
+
+std::vector<llvm::Value*> InstructionsSoa::add(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateAdd(in1[0], in2[0], name("addx"));
+   res[1] = m_builder.CreateAdd(in1[1], in2[1], name("addy"));
+   res[2] = m_builder.CreateAdd(in1[2], in2[2], name("addz"));
+   res[3] = m_builder.CreateAdd(in1[3], in2[3], name("addw"));
+
+   return res;
+}
+
+std::vector<llvm::Value*> InstructionsSoa::mul(const std::vector<llvm::Value*> in1,
+                                               const std::vector<llvm::Value*> in2)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = m_builder.CreateMul(in1[0], in2[0], name("mulx"));
+   res[1] = m_builder.CreateMul(in1[1], in2[1], name("muly"));
+   res[2] = m_builder.CreateMul(in1[2], in2[2], name("mulz"));
+   res[3] = m_builder.CreateMul(in1[3], in2[3], name("mulw"));
+
+   return res;
+}
+
+void InstructionsSoa::end()
+{
+   m_builder.CreateRetVoid();
+}
+
+std::vector<llvm::Value*> InstructionsSoa::madd(const std::vector<llvm::Value*> in1,
+                                                const std::vector<llvm::Value*> in2,
+                                                const std::vector<llvm::Value*> in3)
+{
+   std::vector<llvm::Value*> res = mul(in1, in2);
+   return add(res, in3);
+}
+
+std::vector<llvm::Value*> InstructionsSoa::extractVector(llvm::Value *vector)
+{
+   std::vector<llvm::Value*> res(4);
+   res[0] = m_builder.CreateExtractElement(vector,
+                                           m_storage->constantInt(0),
+                                           name("extract1X"));
+   res[1] = m_builder.CreateExtractElement(vector,
+                                           m_storage->constantInt(1),
+                                           name("extract2X"));
+   res[2] = m_builder.CreateExtractElement(vector,
+                                           m_storage->constantInt(2),
+                                           name("extract3X"));
+   res[3] = m_builder.CreateExtractElement(vector,
+                                           m_storage->constantInt(3),
+                                           name("extract4X"));
+
+   return res;
+}
diff --git a/src/gallium/auxiliary/llvm/instructionssoa.h b/src/gallium/auxiliary/llvm/instructionssoa.h
new file mode 100644
index 00000000000..4169dcbb2eb
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/instructionssoa.h
@@ -0,0 +1,74 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef INSTRUCTIONSSOA_H
+#define INSTRUCTIONSSOA_H
+
+#include <llvm/Support/LLVMBuilder.h>
+
+#include <vector>
+
+namespace llvm {
+   class Module;
+   class Function;
+   class BasicBlock;
+   class Value;
+}
+class StorageSoa;
+
+class InstructionsSoa
+{
+public:
+   InstructionsSoa(llvm::Module *mod, llvm::Function *func,
+                   llvm::BasicBlock *block, StorageSoa *storage);
+
+   std::vector<llvm::Value*> arl(const std::vector<llvm::Value*> in);
+
+   std::vector<llvm::Value*> add(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
+   std::vector<llvm::Value*> madd(const std::vector<llvm::Value*> in1,
+                                  const std::vector<llvm::Value*> in2,
+                                  const std::vector<llvm::Value*> in3);
+   std::vector<llvm::Value*> mul(const std::vector<llvm::Value*> in1,
+                                 const std::vector<llvm::Value*> in2);
+   void         end();
+
+   std::vector<llvm::Value*> extractVector(llvm::Value *vector);
+private:
+   const char * name(const char *prefix) const;
+   llvm::Value *vectorFromVals(llvm::Value *x, llvm::Value *y,
+                               llvm::Value *z, llvm::Value *w);
+private:
+   llvm::LLVMFoldingBuilder  m_builder;
+   StorageSoa *m_storage;
+private:
+   mutable int  m_idx;
+   mutable char m_name[32];
+};
+
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/llvm_builtins.c b/src/gallium/auxiliary/llvm/llvm_builtins.c
new file mode 100644
index 00000000000..4f98d754baa
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/llvm_builtins.c
@@ -0,0 +1,115 @@
+/*clang --emit-llvm llvm_builtins.c |llvm-as|opt -std-compile-opts|llvm2cpp -gen-contents -o=gallivm_builtins.cpp -f -for=shader -funcname=createGallivmBuiltins*/
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+typedef __attribute__(( ocu_vector_type(4) )) float float4;
+
+extern float powf(float a, float b);
+
+inline float approx(float a, float b)
+{
+    if (b < -128.0f) b = -128.0f;
+    if (b > 128.0f)   b = 128.0f;
+    if (a < 0) a = 0;
+    return powf(a, b);
+}
+
+inline float4 lit(float4 tmp)
+{
+    float4 result;
+    result.x = 1.0;
+    result.w = 1.0;
+    if (tmp.x > 0) {
+        result.y = tmp.x;
+        result.z = approx(tmp.y, tmp.w);
+    } else {
+        result.y = 0;
+        result.z = 0;
+    }
+    return result;
+}
+
+inline float4 cmp(float4 tmp0, float4 tmp1, float4 tmp2)
+{
+   float4 result;
+
+   result.x = (tmp0.x < 0.0) ? tmp1.x : tmp2.x;
+   result.y = (tmp0.y < 0.0) ? tmp1.y : tmp2.y;
+   result.z = (tmp0.z < 0.0) ? tmp1.z : tmp2.z;
+   result.w = (tmp0.w < 0.0) ? tmp1.w : tmp2.w;
+
+   return result;
+}
+
+extern float cosf(float  val);
+extern float sinf(float  val);
+
+inline float4 vcos(float4 val)
+{
+   float4 result;
+   printf("VEC IN   is %f %f %f %f\n", val.x, val.y, val.z, val.w);
+   result.x = cosf(val.x);
+   result.y = cosf(val.x);
+   result.z = cosf(val.x);
+   result.w = cosf(val.x);
+   printf("VEC OUT  is %f %f %f %f\n", result.x, result.y, result.z, result.w);
+   return result;
+}
+
+inline float4 scs(float4 val)
+{
+   float4 result;
+   float tmp = val.x;
+   result.x = cosf(tmp);
+   result.y = sinf(tmp);
+   return result;
+}
+
+
+inline float4 vsin(float4 val)
+{
+   float4 result;
+   float tmp = val.x;
+   float res = sinf(tmp);
+   result.x = res;
+   result.y = res;
+   result.z = res;
+   result.w = res;
+   return result;
+}
+
+inline int kilp(float4 val)
+{
+   if (val.x < 0 || val.y < 0 || val.z < 0 || val.w < 0)
+      return 1;
+   else
+      return 0;
+}
diff --git a/src/gallium/auxiliary/llvm/loweringpass.cpp b/src/gallium/auxiliary/llvm/loweringpass.cpp
new file mode 100644
index 00000000000..556dbec3661
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/loweringpass.cpp
@@ -0,0 +1,17 @@
+#include "loweringpass.h"
+
+using namespace llvm;
+
+char LoweringPass::ID = 0;
+RegisterPass<LoweringPass> X("lowering", "Lowering Pass");
+
+LoweringPass::LoweringPass()
+   :  ModulePass((intptr_t)&ID)
+{
+}
+
+bool LoweringPass::runOnModule(Module &m)
+{
+   llvm::cerr << "Hello: " << m.getModuleIdentifier() << "\n";
+   return false;
+}
diff --git a/src/gallium/auxiliary/llvm/loweringpass.h b/src/gallium/auxiliary/llvm/loweringpass.h
new file mode 100644
index 00000000000..f62dcf6ba73
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/loweringpass.h
@@ -0,0 +1,15 @@
+#ifndef LOWERINGPASS_H
+#define LOWERINGPASS_H
+
+#include "llvm/Pass.h"
+#include "llvm/Module.h"
+
+struct LoweringPass : public llvm::ModulePass
+{
+   static char ID;
+   LoweringPass();
+
+   virtual bool runOnModule(llvm::Module &m);
+};
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/storage.cpp b/src/gallium/auxiliary/llvm/storage.cpp
new file mode 100644
index 00000000000..c4326de8c53
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/storage.cpp
@@ -0,0 +1,364 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+#ifdef MESA_LLVM
+
+#include "storage.h"
+
+#include "gallivm_p.h"
+
+#include "pipe/p_shader_tokens.h"
+#include <llvm/BasicBlock.h>
+#include <llvm/Module.h>
+#include <llvm/Value.h>
+
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/InstrTypes.h>
+#include <llvm/Instructions.h>
+
+using namespace llvm;
+
+Storage::Storage(llvm::BasicBlock *block, llvm::Value *input)
+   : m_block(block),
+     m_INPUT(input),
+     m_addrs(32),
+     m_idx(0)
+{
+   m_floatVecType = VectorType::get(Type::FloatTy, 4);
+   m_intVecType   = VectorType::get(IntegerType::get(32), 4);
+
+   m_undefFloatVec = UndefValue::get(m_floatVecType);
+   m_undefIntVec   = UndefValue::get(m_intVecType);
+   m_extSwizzleVec = 0;
+
+   m_numConsts = 0;
+}
+
+//can only build vectors with all members in the [0, 9] range
+llvm::Constant *Storage::shuffleMask(int vec)
+{
+   if (!m_extSwizzleVec) {
+      std::vector<Constant*> elems;
+      elems.push_back(ConstantFP::get(Type::FloatTy, APFloat(0.f)));
+      elems.push_back(ConstantFP::get(Type::FloatTy, APFloat(1.f)));
+      elems.push_back(ConstantFP::get(Type::FloatTy, APFloat(0.f)));
+      elems.push_back(ConstantFP::get(Type::FloatTy, APFloat(1.f)));
+      m_extSwizzleVec = ConstantVector::get(m_floatVecType, elems);
+   }
+
+   if (m_intVecs.find(vec) != m_intVecs.end()) {
+      return m_intVecs[vec];
+   }
+   int origVec = vec;
+   Constant* const_vec = 0;
+   if (origVec == 0) {
+      const_vec = Constant::getNullValue(m_intVecType);
+   } else {
+      int x = gallivm_x_swizzle(vec);
+      int y = gallivm_y_swizzle(vec);
+      int z = gallivm_z_swizzle(vec);
+      int w = gallivm_w_swizzle(vec);
+      std::vector<Constant*> elems;
+      elems.push_back(constantInt(x));
+      elems.push_back(constantInt(y));
+      elems.push_back(constantInt(z));
+      elems.push_back(constantInt(w));
+      const_vec = ConstantVector::get(m_intVecType, elems);
+   }
+
+   m_intVecs[origVec] = const_vec;
+   return const_vec;
+}
+
+llvm::ConstantInt *Storage::constantInt(int idx)
+{
+   if (m_constInts.find(idx) != m_constInts.end()) {
+      return m_constInts[idx];
+   }
+   ConstantInt *const_int = ConstantInt::get(APInt(32,  idx));
+   m_constInts[idx] = const_int;
+   return const_int;
+}
+
+llvm::Value *Storage::inputElement(int idx, llvm::Value *indIdx)
+{
+   Value *val = element(InputsArg, idx, indIdx);
+   LoadInst *load = new LoadInst(val, name("input"), false, m_block);
+   load->setAlignment(8);
+
+   return load;
+}
+
+llvm::Value *Storage::constElement(int idx, llvm::Value *indIdx)
+{
+   m_numConsts = ((idx + 1) > m_numConsts) ? (idx + 1) : m_numConsts;
+
+   Value *elem = element(ConstsArg, idx, indIdx);
+   LoadInst *load = new LoadInst(elem, name("const"), false, m_block);
+   load->setAlignment(8);
+   return load;
+}
+
+llvm::Value *Storage::shuffleVector(llvm::Value *vec, int shuffle)
+{
+   Constant *mask = shuffleMask(shuffle);
+   ShuffleVectorInst *res =
+      new ShuffleVectorInst(vec, m_extSwizzleVec, mask,
+                            name("shuffle"), m_block);
+   return res;
+}
+
+
+llvm::Value *Storage::tempElement(int idx, llvm::Value *indIdx)
+{
+   Value *elem = element(TempsArg, idx, indIdx);
+
+   LoadInst *load = new LoadInst(elem, name("temp"), false, m_block);
+   load->setAlignment(8);
+
+   return load;
+}
+
+void Storage::setTempElement(int idx, llvm::Value *val, int mask)
+{
+   if (mask != TGSI_WRITEMASK_XYZW) {
+      llvm::Value *templ = 0;
+      if (m_tempWriteMap[idx])
+         templ = tempElement(idx);
+      val = maskWrite(val, mask, templ);
+   }
+   Value *elem = element(TempsArg, idx);
+   StoreInst *st = new StoreInst(val, elem, false, m_block);
+   st->setAlignment(8);
+   m_tempWriteMap[idx] = true;
+}
+
+void Storage::setOutputElement(int dstIdx, llvm::Value *val, int mask)
+{
+   if (mask != TGSI_WRITEMASK_XYZW) {
+      llvm::Value *templ = 0;
+      if (m_destWriteMap[dstIdx])
+         templ = outputElement(dstIdx);
+      val = maskWrite(val, mask, templ);
+   }
+
+   Value *elem = element(DestsArg, dstIdx);
+   StoreInst *st = new StoreInst(val, elem, false, m_block);
+   st->setAlignment(8);
+   m_destWriteMap[dstIdx] = true;
+}
+
+llvm::Value *Storage::maskWrite(llvm::Value *src, int mask, llvm::Value *templ)
+{
+   llvm::Value *dst = templ;
+   if (!dst)
+      dst = Constant::getNullValue(m_floatVecType);
+   if ((mask & TGSI_WRITEMASK_X)) {
+      llvm::Value *x = new ExtractElementInst(src, unsigned(0),
+                                              name("x"), m_block);
+      dst = new InsertElementInst(dst, x, unsigned(0),
+                                  name("dstx"), m_block);
+   }
+   if ((mask & TGSI_WRITEMASK_Y)) {
+      llvm::Value *y = new ExtractElementInst(src, unsigned(1),
+                                              name("y"), m_block);
+      dst = new InsertElementInst(dst, y, unsigned(1),
+                                  name("dsty"), m_block);
+   }
+   if ((mask & TGSI_WRITEMASK_Z)) {
+      llvm::Value *z = new ExtractElementInst(src, unsigned(2),
+                                              name("z"), m_block);
+      dst = new InsertElementInst(dst, z, unsigned(2),
+                                  name("dstz"), m_block);
+   }
+   if ((mask & TGSI_WRITEMASK_W)) {
+      llvm::Value *w = new ExtractElementInst(src, unsigned(3),
+                                              name("w"), m_block);
+      dst = new InsertElementInst(dst, w, unsigned(3),
+                                  name("dstw"), m_block);
+   }
+   return dst;
+}
+
+const char * Storage::name(const char *prefix)
+{
+   ++m_idx;
+   snprintf(m_name, 32, "%s%d", prefix, m_idx);
+   return m_name;
+}
+
+int Storage::numConsts() const
+{
+   return m_numConsts;
+}
+
+llvm::Value * Storage::addrElement(int idx) const
+{
+   Value *ret = m_addrs[idx];
+   if (!ret)
+      return m_undefFloatVec;
+   return ret;
+}
+
+void Storage::setAddrElement(int idx, llvm::Value *val, int mask)
+{
+   if (mask != TGSI_WRITEMASK_XYZW) {
+      llvm::Value *templ = m_addrs[idx];
+      val = maskWrite(val, mask, templ);
+   }
+   m_addrs[idx] = val;
+}
+
+llvm::Value * Storage::extractIndex(llvm::Value *vec)
+{
+   llvm::Value *x = new ExtractElementInst(vec, unsigned(0),
+                                           name("x"), m_block);
+   return new FPToSIInst(x, IntegerType::get(32), name("intidx"), m_block);
+}
+
+void Storage::setCurrentBlock(llvm::BasicBlock *block)
+{
+   m_block = block;
+}
+
+llvm::Value * Storage::outputElement(int idx, llvm::Value *indIdx)
+{
+   Value *elem = element(DestsArg, idx, indIdx);
+   LoadInst *load = new LoadInst(elem, name("output"), false, m_block);
+   load->setAlignment(8);
+
+   return load;
+}
+
+llvm::Value * Storage::inputPtr() const
+{
+   return m_INPUT;
+}
+
+void Storage::pushArguments(llvm::Value *input)
+{
+   m_argStack.push(m_INPUT);
+
+   m_INPUT = input;
+}
+
+void Storage::popArguments()
+{
+   m_INPUT = m_argStack.top();
+   m_argStack.pop();
+}
+
+void Storage::pushTemps()
+{
+   m_extSwizzleVec = 0;
+}
+
+void Storage::popTemps()
+{
+}
+
+llvm::Value * Storage::immediateElement(int idx)
+{
+   return m_immediates[idx];
+}
+
+void Storage::addImmediate(float *val)
+{
+   std::vector<Constant*> vec(4);
+   vec[0] = ConstantFP::get(Type::FloatTy, APFloat(val[0]));
+   vec[1] = ConstantFP::get(Type::FloatTy, APFloat(val[1]));
+   vec[2] = ConstantFP::get(Type::FloatTy, APFloat(val[2]));
+   vec[3] = ConstantFP::get(Type::FloatTy, APFloat(val[3]));
+   m_immediates.push_back(ConstantVector::get(m_floatVecType, vec));
+}
+
+
+llvm::Value * Storage::elemPtr(Args arg)
+{
+   std::vector<Value*> indices;
+   indices.push_back(constantInt(0));
+   indices.push_back(constantInt(static_cast<int>(arg)));
+   GetElementPtrInst *getElem = new GetElementPtrInst(m_INPUT,
+                                                      indices.begin(),
+                                                      indices.end(),
+                                                      name("input_ptr"),
+                                                      m_block);
+   return new LoadInst(getElem, name("input_field"), false, m_block);
+}
+
+llvm::Value * Storage::elemIdx(llvm::Value *ptr, int idx,
+                               llvm::Value *indIdx )
+{
+   GetElementPtrInst *getElem = 0;
+
+   if (indIdx) {
+      getElem = new GetElementPtrInst(ptr,
+                                      BinaryOperator::create(Instruction::Add,
+                                                             indIdx,
+                                                             constantInt(idx),
+                                                             name("add"),
+                                                             m_block),
+                                      name("field"),
+                                      m_block);
+   } else {
+      getElem = new GetElementPtrInst(ptr,
+                                      constantInt(idx),
+                                      name("field"),
+                                      m_block);
+   }
+   return getElem;
+}
+
+llvm::Value * Storage::element(Args arg, int idx, llvm::Value *indIdx )
+{
+   Value *val = elemPtr(arg);
+   return elemIdx(val, idx, indIdx);
+}
+
+void Storage::setKilElement(llvm::Value *val)
+{
+   std::vector<Value*> indices;
+   indices.push_back(constantInt(0));
+   indices.push_back(constantInt(static_cast<int>(KilArg)));
+   GetElementPtrInst *elem = new GetElementPtrInst(m_INPUT,
+                                                   indices.begin(),
+                                                   indices.end(),
+                                                   name("kil_ptr"),
+                                                   m_block);
+   StoreInst *st = new StoreInst(val, elem, false, m_block);
+   st->setAlignment(8);
+}
+
+#endif //MESA_LLVM
+
+
diff --git a/src/gallium/auxiliary/llvm/storage.h b/src/gallium/auxiliary/llvm/storage.h
new file mode 100644
index 00000000000..8574f7554e3
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/storage.h
@@ -0,0 +1,133 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Zack Rusin [email protected]
+  */
+
+#ifndef STORAGE_H
+#define STORAGE_H
+
+#include <map>
+#include <set>
+#include <stack>
+#include <vector>
+
+namespace llvm {
+   class BasicBlock;
+   class Constant;
+   class ConstantInt;
+   class LoadInst;
+   class Value;
+   class VectorType;
+}
+
+class Storage
+{
+public:
+   Storage(llvm::BasicBlock *block,
+           llvm::Value *input);
+
+   llvm::Value *inputPtr() const;
+
+   void setCurrentBlock(llvm::BasicBlock *block);
+
+   llvm::ConstantInt *constantInt(int);
+   llvm::Constant *shuffleMask(int vec);
+   llvm::Value *inputElement(int idx, llvm::Value *indIdx =0);
+   llvm::Value *constElement(int idx, llvm::Value *indIdx =0);
+   llvm::Value *outputElement(int idx, llvm::Value *indIdx =0);
+   llvm::Value *tempElement(int idx, llvm::Value *indIdx =0);
+   llvm::Value *immediateElement(int idx);
+
+   void setOutputElement(int dstIdx, llvm::Value *val, int mask);
+   void setTempElement(int idx, llvm::Value *val, int mask);
+
+   llvm::Value *addrElement(int idx) const;
+   void setAddrElement(int idx, llvm::Value *val, int mask);
+
+   void setKilElement(llvm::Value *val);
+
+   llvm::Value *shuffleVector(llvm::Value *vec, int shuffle);
+
+   llvm::Value *extractIndex(llvm::Value *vec);
+
+   int numConsts() const;
+
+   void pushArguments(llvm::Value *input);
+   void popArguments();
+   void pushTemps();
+   void popTemps();
+
+   void addImmediate(float *val);
+
+private:
+   llvm::Value *maskWrite(llvm::Value *src, int mask, llvm::Value *templ);
+   const char *name(const char *prefix);
+
+   enum Args {
+      DestsArg   = 0,
+      InputsArg  = 1,
+      TempsArg   = 2,
+      ConstsArg  = 3,
+      KilArg     = 4
+   };
+   llvm::Value *elemPtr(Args arg);
+   llvm::Value *elemIdx(llvm::Value *ptr, int idx,
+                        llvm::Value *indIdx = 0);
+   llvm::Value *element(Args arg, int idx, llvm::Value *indIdx = 0);
+
+private:
+   llvm::BasicBlock *m_block;
+   llvm::Value *m_INPUT;
+
+   std::map<int, llvm::ConstantInt*> m_constInts;
+   std::map<int, llvm::Constant*>    m_intVecs;
+   std::vector<llvm::Value*>         m_addrs;
+   std::vector<llvm::Constant*>      m_immediates;
+
+   llvm::VectorType *m_floatVecType;
+   llvm::VectorType *m_intVecType;
+
+   char        m_name[32];
+   int         m_idx;
+
+   int         m_numConsts;
+
+   std::map<int, bool > m_destWriteMap;
+   std::map<int, bool > m_tempWriteMap;
+
+   llvm::Value      *m_undefFloatVec;
+   llvm::Value      *m_undefIntVec;
+   llvm::Value      *m_extSwizzleVec;
+
+   std::stack<llvm::Value*> m_argStack;
+   std::stack<std::vector<llvm::Value*> > m_tempStack;
+};
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/storagesoa.cpp b/src/gallium/auxiliary/llvm/storagesoa.cpp
new file mode 100644
index 00000000000..ed0674a96f0
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/storagesoa.cpp
@@ -0,0 +1,389 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "storagesoa.h"
+
+#include "gallivm_p.h"
+
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_debug.h"
+
+#include <llvm/BasicBlock.h>
+#include <llvm/Module.h>
+#include <llvm/Value.h>
+
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/InstrTypes.h>
+#include <llvm/Instructions.h>
+
+using namespace llvm;
+
+
+StorageSoa::StorageSoa(llvm::BasicBlock *block,
+                       llvm::Value *input,
+                       llvm::Value *output,
+                       llvm::Value *consts,
+                       llvm::Value *temps)
+   : m_block(block),
+     m_input(input),
+     m_output(output),
+     m_consts(consts),
+     m_temps(temps),
+     m_immediates(0),
+     m_idx(0)
+{
+}
+
+void StorageSoa::addImmediate(float *vec)
+{
+   std::vector<float> vals(4);
+   vals[0] = vec[0];
+   vals[1] = vec[1];
+   vals[2] = vec[2];
+   vals[3] = vec[3];
+   m_immediatesToFlush.push_back(vals);
+}
+
+void StorageSoa::declareImmediates()
+{
+   if (m_immediatesToFlush.empty())
+      return;
+
+   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
+   ArrayType  *vectorChannels = ArrayType::get(vectorType, 4);
+   ArrayType  *arrayType = ArrayType::get(vectorChannels, m_immediatesToFlush.size());
+
+   m_immediates = new GlobalVariable(
+      /*Type=*/arrayType,
+      /*isConstant=*/false,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Initializer=*/0, // has initializer, specified below
+      /*Name=*/name("immediates"),
+      currentModule());
+
+   std::vector<Constant*> arrayVals;
+   for (unsigned int i = 0; i < m_immediatesToFlush.size(); ++i) {
+      std::vector<float> vec = m_immediatesToFlush[i];
+      std::vector<float> vals(4);
+      std::vector<Constant*> channelArray;
+
+      vals[0] = vec[0]; vals[1] = vec[0]; vals[2] = vec[0]; vals[3] = vec[0];
+      llvm::Constant *xChannel = createConstGlobalVector(vals);
+
+      vals[0] = vec[1]; vals[1] = vec[1]; vals[2] = vec[1]; vals[3] = vec[1];
+      llvm::Constant *yChannel = createConstGlobalVector(vals);
+
+      vals[0] = vec[2]; vals[1] = vec[2]; vals[2] = vec[2]; vals[3] = vec[2];
+      llvm::Constant *zChannel = createConstGlobalVector(vals);
+
+      vals[0] = vec[3]; vals[1] = vec[3]; vals[2] = vec[3]; vals[3] = vec[3];
+      llvm::Constant *wChannel = createConstGlobalVector(vals);
+      channelArray.push_back(xChannel);
+      channelArray.push_back(yChannel);
+      channelArray.push_back(zChannel);
+      channelArray.push_back(wChannel);
+      Constant *constChannels = ConstantArray::get(vectorChannels,
+                                                   channelArray);
+      arrayVals.push_back(constChannels);
+   }
+   Constant *constArray = ConstantArray::get(arrayType, arrayVals);
+   m_immediates->setInitializer(constArray);
+
+   m_immediatesToFlush.clear();
+}
+
+llvm::Value *StorageSoa::addrElement(int idx) const
+{
+   std::map<int, llvm::Value*>::const_iterator itr = m_addresses.find(idx);
+   if (itr == m_addresses.end()) {
+      debug_printf("Trying to access invalid shader 'address'\n");
+      return 0;
+   }
+   llvm::Value * res = (*itr).second;
+
+   res = new LoadInst(res, name("addr"), false, m_block);
+
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::inputElement(llvm::Value *idx)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = element(m_input, idx, 0);
+   res[1] = element(m_input, idx, 1);
+   res[2] = element(m_input, idx, 2);
+   res[3] = element(m_input, idx, 3);
+
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::constElement(llvm::Value *idx)
+{
+   std::vector<llvm::Value*> res(4);
+   llvm::Value *xChannel, *yChannel, *zChannel, *wChannel;
+
+   xChannel = elementPointer(m_consts, idx, 0);
+   yChannel = elementPointer(m_consts, idx, 1);
+   zChannel = elementPointer(m_consts, idx, 2);
+   wChannel = elementPointer(m_consts, idx, 3);
+
+   res[0] = alignedArrayLoad(xChannel);
+   res[1] = alignedArrayLoad(yChannel);
+   res[2] = alignedArrayLoad(zChannel);
+   res[3] = alignedArrayLoad(wChannel);
+
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::outputElement(llvm::Value *idx)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = element(m_output, idx, 0);
+   res[1] = element(m_output, idx, 1);
+   res[2] = element(m_output, idx, 2);
+   res[3] = element(m_output, idx, 3);
+
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::tempElement(llvm::Value *idx)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = element(m_temps, idx, 0);
+   res[1] = element(m_temps, idx, 1);
+   res[2] = element(m_temps, idx, 2);
+   res[3] = element(m_temps, idx, 3);
+
+   return res;
+}
+
+std::vector<llvm::Value*> StorageSoa::immediateElement(llvm::Value *idx)
+{
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = element(m_immediates, idx, 0);
+   res[1] = element(m_immediates, idx, 1);
+   res[2] = element(m_immediates, idx, 2);
+   res[3] = element(m_immediates, idx, 3);
+
+   return res;
+}
+
+llvm::Value * StorageSoa::elementPointer(llvm::Value *ptr, llvm::Value *index,
+                                         int channel) const
+{
+   std::vector<Value*> indices;
+   if (m_immediates == ptr)
+      indices.push_back(constantInt(0));
+   indices.push_back(index);
+   indices.push_back(constantInt(channel));
+
+   GetElementPtrInst *getElem = new GetElementPtrInst(ptr,
+                                                      indices.begin(),
+                                                      indices.end(),
+                                                      name("ptr"),
+                                                      m_block);
+   return getElem;
+}
+
+llvm::Value * StorageSoa::element(llvm::Value *ptr, llvm::Value *index,
+                                  int channel) const
+{
+   llvm::Value *res = elementPointer(ptr, index, channel);
+   LoadInst *load = new LoadInst(res, name("element"), false, m_block);
+   //load->setAlignment(8);
+   return load;
+}
+
+const char * StorageSoa::name(const char *prefix) const
+{
+   ++m_idx;
+   snprintf(m_name, 32, "%s%d", prefix, m_idx);
+   return m_name;
+}
+
+llvm::ConstantInt * StorageSoa::constantInt(int idx) const
+{
+   if (m_constInts.find(idx) != m_constInts.end()) {
+      return m_constInts[idx];
+   }
+   ConstantInt *constInt = ConstantInt::get(APInt(32,  idx));
+   m_constInts[idx] = constInt;
+   return constInt;
+}
+
+llvm::Value *StorageSoa::alignedArrayLoad(llvm::Value *val)
+{
+   VectorType  *vectorType = VectorType::get(Type::FloatTy, 4);
+   PointerType *vectorPtr  = PointerType::get(vectorType, 0);
+
+   CastInst *cast = new BitCastInst(val, vectorPtr, name("toVector"), m_block);
+   LoadInst *load = new LoadInst(cast, name("alignLoad"), false, m_block);
+   load->setAlignment(8);
+   return load;
+}
+
+llvm::Module * StorageSoa::currentModule() const
+{
+    if (!m_block || !m_block->getParent())
+       return 0;
+
+    return m_block->getParent()->getParent();
+}
+
+llvm::Constant * StorageSoa::createConstGlobalVector(const std::vector<float> &vec)
+{
+   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
+   std::vector<Constant*> immValues;
+   ConstantFP *constx = ConstantFP::get(Type::FloatTy, APFloat(vec[0]));
+   ConstantFP *consty = ConstantFP::get(Type::FloatTy, APFloat(vec[1]));
+   ConstantFP *constz = ConstantFP::get(Type::FloatTy, APFloat(vec[2]));
+   ConstantFP *constw = ConstantFP::get(Type::FloatTy, APFloat(vec[3]));
+   immValues.push_back(constx);
+   immValues.push_back(consty);
+   immValues.push_back(constz);
+   immValues.push_back(constw);
+   Constant  *constVector = ConstantVector::get(vectorType, immValues);
+
+   return constVector;
+}
+
+std::vector<llvm::Value*> StorageSoa::load(Argument type, int idx, int swizzle,
+                                           llvm::Value *indIdx)
+{
+   std::vector<llvm::Value*> val(4);
+
+   //if we have an indirect index, always use that
+   //   if not use the integer offset to create one
+   llvm::Value *realIndex = 0;
+   if (indIdx)
+      realIndex = indIdx;
+   else
+      realIndex = constantInt(idx);
+   debug_printf("XXXXXXXXX realIdx = %p, indIdx = %p\n", realIndex, indIdx);
+
+   switch(type) {
+   case Input:
+      val = inputElement(realIndex);
+      break;
+   case Output:
+      val = outputElement(realIndex);
+      break;
+   case Temp:
+      val = tempElement(realIndex);
+      break;
+   case Const:
+      val = constElement(realIndex);
+      break;
+   case Immediate:
+      val = immediateElement(realIndex);
+      break;
+   case Address:
+      debug_printf("Address not handled in the load phase!\n");
+      assert(0);
+      break;
+   }
+   if (!gallivm_is_swizzle(swizzle))
+      return val;
+
+   std::vector<llvm::Value*> res(4);
+
+   res[0] = val[gallivm_x_swizzle(swizzle)];
+   res[1] = val[gallivm_y_swizzle(swizzle)];
+   res[2] = val[gallivm_z_swizzle(swizzle)];
+   res[3] = val[gallivm_w_swizzle(swizzle)];
+   return res;
+}
+
+void StorageSoa::store(Argument type, int idx, const std::vector<llvm::Value*> &val,
+                       int mask)
+{
+   llvm::Value *out = 0;
+   switch(type) {
+   case Output:
+      out = m_output;
+      break;
+   case Temp:
+      out = m_temps;
+      break;
+   case Input:
+      out = m_input;
+      break;
+   case Address: {
+      llvm::Value *addr = m_addresses[idx];
+      if (!addr) {
+         addAddress(idx);
+         addr = m_addresses[idx];
+         assert(addr);
+      }
+      new StoreInst(val[0], addr, false, m_block);
+      return;
+      break;
+   }
+   default:
+      debug_printf("Can't save output of this type: %d !\n", type);
+      assert(0);
+      break;
+   }
+   llvm::Value *realIndex = constantInt(idx);
+   if ((mask & TGSI_WRITEMASK_X)) {
+      llvm::Value *xChannel = elementPointer(out, realIndex, 0);
+      new StoreInst(val[0], xChannel, false, m_block);
+   }
+   if ((mask & TGSI_WRITEMASK_Y)) {
+      llvm::Value *yChannel = elementPointer(out, realIndex, 1);
+      new StoreInst(val[1], yChannel, false, m_block);
+   }
+   if ((mask & TGSI_WRITEMASK_Z)) {
+      llvm::Value *zChannel = elementPointer(out, realIndex, 2);
+      new StoreInst(val[2], zChannel, false, m_block);
+   }
+   if ((mask & TGSI_WRITEMASK_W)) {
+      llvm::Value *wChannel = elementPointer(out, realIndex, 3);
+      new StoreInst(val[3], wChannel, false, m_block);
+   }
+}
+
+void StorageSoa::addAddress(int idx)
+{
+   GlobalVariable *val = new GlobalVariable(
+      /*Type=*/IntegerType::get(32),
+      /*isConstant=*/false,
+      /*Linkage=*/GlobalValue::ExternalLinkage,
+      /*Initializer=*/0, // has initializer, specified below
+      /*Name=*/name("address"),
+      currentModule());
+   val->setInitializer(Constant::getNullValue(IntegerType::get(32)));
+
+   debug_printf("adding to %d\n", idx);
+   m_addresses[idx] = val;
+}
diff --git a/src/gallium/auxiliary/llvm/storagesoa.h b/src/gallium/auxiliary/llvm/storagesoa.h
new file mode 100644
index 00000000000..6443351f270
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/storagesoa.h
@@ -0,0 +1,111 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#ifndef STORAGESOA_H
+#define STORAGESOA_H
+
+#include <vector>
+#include <list>
+#include <map>
+
+namespace llvm {
+   class BasicBlock;
+   class Constant;
+   class ConstantInt;
+   class GlobalVariable;
+   class LoadInst;
+   class Value;
+   class VectorType;
+   class Module;
+}
+
+class StorageSoa
+{
+public:
+   enum Argument {
+      Input,
+      Output,
+      Temp,
+      Const,
+      Immediate,
+      Address
+   };
+public:
+   StorageSoa(llvm::BasicBlock *block,
+              llvm::Value *input,
+              llvm::Value *output,
+              llvm::Value *consts,
+              llvm::Value *temps);
+
+
+   std::vector<llvm::Value*> load(Argument type, int idx, int swizzle, 
+                                  llvm::Value *indIdx =0);
+   void store(Argument type, int idx, const std::vector<llvm::Value*> &val,
+              int mask);
+
+   void addImmediate(float *vec);
+   void declareImmediates();
+
+   void addAddress(int idx);
+
+   llvm::Value  * addrElement(int idx) const;
+
+   llvm::ConstantInt *constantInt(int) const;
+private:
+   llvm::Value *elementPointer(llvm::Value *ptr, llvm::Value *indIdx,
+                               int channel) const;
+   llvm::Value *element(llvm::Value *ptr, llvm::Value *idx,
+                        int channel) const;
+   const char *name(const char *prefix) const;
+   llvm::Value  *alignedArrayLoad(llvm::Value *val);
+   llvm::Module *currentModule() const;
+   llvm::Constant  *createConstGlobalVector(const std::vector<float> &vec);
+
+   std::vector<llvm::Value*> inputElement(llvm::Value *indIdx);
+   std::vector<llvm::Value*> constElement(llvm::Value *indIdx);
+   std::vector<llvm::Value*> outputElement(llvm::Value *indIdx);
+   std::vector<llvm::Value*> tempElement(llvm::Value *indIdx);
+   std::vector<llvm::Value*> immediateElement(llvm::Value *indIdx);
+private:
+   llvm::BasicBlock *m_block;
+
+   llvm::Value *m_input;
+   llvm::Value *m_output;
+   llvm::Value *m_consts;
+   llvm::Value *m_temps;
+   llvm::GlobalVariable *m_immediates;
+
+   std::map<int, llvm::Value*> m_addresses;
+
+   std::vector<std::vector<float> > m_immediatesToFlush;
+
+   mutable std::map<int, llvm::ConstantInt*> m_constInts;
+   mutable char        m_name[32];
+   mutable int         m_idx;
+};
+
+#endif
diff --git a/src/gallium/auxiliary/llvm/tgsitollvm.cpp b/src/gallium/auxiliary/llvm/tgsitollvm.cpp
new file mode 100644
index 00000000000..2cb4acce32f
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/tgsitollvm.cpp
@@ -0,0 +1,1221 @@
+#include "tgsitollvm.h"
+
+#include "gallivm.h"
+#include "gallivm_p.h"
+
+#include "storage.h"
+#include "instructions.h"
+#include "storagesoa.h"
+#include "instructionssoa.h"
+
+#include "pipe/p_shader_tokens.h"
+
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_util.h"
+#include "tgsi/util/tgsi_build.h"
+#include "tgsi/util/tgsi_dump.h"
+
+
+#include <llvm/Module.h>
+#include <llvm/CallingConv.h>
+#include <llvm/Constants.h>
+#include <llvm/DerivedTypes.h>
+#include <llvm/Instructions.h>
+#include <llvm/ModuleProvider.h>
+#include <llvm/Pass.h>
+#include <llvm/PassManager.h>
+#include <llvm/ParameterAttributes.h>
+#include <llvm/Support/PatternMatch.h>
+#include <llvm/ExecutionEngine/JIT.h>
+#include <llvm/ExecutionEngine/Interpreter.h>
+#include <llvm/ExecutionEngine/GenericValue.h>
+#include <llvm/Support/MemoryBuffer.h>
+#include <llvm/LinkAllPasses.h>
+#include <llvm/Analysis/Verifier.h>
+#include <llvm/Analysis/LoopPass.h>
+#include <llvm/Target/TargetData.h>
+#include <llvm/Bitcode/ReaderWriter.h>
+#include <llvm/Transforms/Utils/Cloning.h>
+
+
+#include <sstream>
+#include <fstream>
+#include <iostream>
+
+using namespace llvm;
+
+static inline FunctionType *vertexShaderFunctionType()
+{
+   //Function takes three arguments,
+   // the calling code has to make sure the types it will
+   // pass are castable to the following:
+   // [4 x <4 x float>] inputs,
+   // [4 x <4 x float>] output,
+   // [4 x [4 x float]] consts,
+   // [4 x <4 x float>] temps
+
+   std::vector<const Type*> funcArgs;
+   VectorType *vectorType = VectorType::get(Type::FloatTy, 4);
+   ArrayType *vectorArray = ArrayType::get(vectorType, 4);
+   PointerType *vectorArrayPtr = PointerType::get(vectorArray, 0);
+
+   ArrayType   *floatArray     = ArrayType::get(Type::FloatTy, 4);
+   ArrayType   *constsArray    = ArrayType::get(floatArray, 4);
+   PointerType *constsArrayPtr = PointerType::get(constsArray, 0);
+
+   funcArgs.push_back(vectorArrayPtr);//inputs
+   funcArgs.push_back(vectorArrayPtr);//output
+   funcArgs.push_back(constsArrayPtr);//consts
+   funcArgs.push_back(vectorArrayPtr);//temps
+
+   FunctionType *functionType = FunctionType::get(
+      /*Result=*/Type::VoidTy,
+      /*Params=*/funcArgs,
+      /*isVarArg=*/false);
+
+   return functionType;
+}
+
+static inline void
+add_interpolator(struct gallivm_ir *ir,
+                 struct gallivm_interpolate *interp)
+{
+   ir->interpolators[ir->num_interp] = *interp;
+   ++ir->num_interp;
+}
+
+static void
+translate_declaration(struct gallivm_ir *prog,
+                      llvm::Module *module,
+                      Storage *storage,
+                      struct tgsi_full_declaration *decl,
+                      struct tgsi_full_declaration *fd)
+{
+   if (decl->Declaration.File == TGSI_FILE_INPUT) {
+      unsigned first, last, mask;
+      uint interp_method;
+
+      assert(decl->Declaration.Declare == TGSI_DECLARE_RANGE);
+
+      first = decl->u.DeclarationRange.First;
+      last = decl->u.DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      /* Do not touch WPOS.xy */
+      if (first == 0) {
+         mask &= ~TGSI_WRITEMASK_XY;
+         if (mask == TGSI_WRITEMASK_NONE) {
+            first++;
+            if (first > last) {
+               return;
+            }
+         }
+      }
+
+      interp_method = decl->Interpolation.Interpolate;
+
+      if (mask == TGSI_WRITEMASK_XYZW) {
+         unsigned i, j;
+
+         for (i = first; i <= last; i++) {
+            for (j = 0; j < NUM_CHANNELS; j++) {
+               //interp( mach, i, j );
+               struct gallivm_interpolate interp;
+               interp.type = interp_method;
+               interp.attrib = i;
+               interp.chan = j;
+               add_interpolator(prog, &interp);
+            }
+         }
+      } else {
+         unsigned i, j;
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {
+               for( i = first; i <= last; i++ ) {
+                  struct gallivm_interpolate interp;
+                  interp.type = interp_method;
+                  interp.attrib = i;
+                  interp.chan = j;
+                  add_interpolator(prog, &interp);
+               }
+            }
+         }
+      }
+   }
+}
+
+static void
+translate_declarationir(struct gallivm_ir *,
+                      llvm::Module *,
+                      StorageSoa *storage,
+                      struct tgsi_full_declaration *decl,
+                      struct tgsi_full_declaration *)
+{
+   if (decl->Declaration.File == TGSI_FILE_ADDRESS) {
+      int idx = decl->u.DeclarationRange.First;
+      storage->addAddress(idx);
+   }
+}
+
+static void
+translate_immediate(Storage *storage,
+                    struct tgsi_full_immediate *imm)
+{
+   float vec[4];
+   int i;
+   for (i = 0; i < imm->Immediate.Size - 1; ++i) {
+      switch (imm->Immediate.DataType) {
+      case TGSI_IMM_FLOAT32:
+         vec[i] = imm->u.ImmediateFloat32[i].Float;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   storage->addImmediate(vec);
+}
+
+
+static void
+translate_immediateir(StorageSoa *storage,
+                      struct tgsi_full_immediate *imm)
+{
+   float vec[4];
+   int i;
+   for (i = 0; i < imm->Immediate.Size - 1; ++i) {
+      switch (imm->Immediate.DataType) {
+      case TGSI_IMM_FLOAT32:
+         vec[i] = imm->u.ImmediateFloat32[i].Float;
+         break;
+      default:
+         assert(0);
+      }
+   }
+   storage->addImmediate(vec);
+}
+
+static inline int
+swizzleInt(struct tgsi_full_src_register *src)
+{
+   int swizzle = 0;
+   int start = 1000;
+
+   for (int k = 0; k < 4; ++k) {
+      swizzle += tgsi_util_get_full_src_register_extswizzle(src, k) * start;
+      start /= 10;
+   }
+   return swizzle;
+}
+
+static inline llvm::Value *
+swizzleVector(llvm::Value *val, struct tgsi_full_src_register *src,
+              Storage *storage)
+{
+   int swizzle = swizzleInt(src);
+
+   if (gallivm_is_swizzle(swizzle)) {
+      /*fprintf(stderr, "XXXXXXXX swizzle = %d\n", swizzle);*/
+      val = storage->shuffleVector(val, swizzle);
+   }
+   return val;
+}
+
+static void
+translate_instruction(llvm::Module *module,
+                      Storage *storage,
+                      Instructions *instr,
+                      struct tgsi_full_instruction *inst,
+                      struct tgsi_full_instruction *fi,
+                      unsigned instno)
+{
+   llvm::Value *inputs[4];
+   inputs[0] = 0;
+   inputs[1] = 0;
+   inputs[2] = 0;
+   inputs[3] = 0;
+
+   for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+      llvm::Value *val = 0;
+      llvm::Value *indIdx = 0;
+
+      if (src->SrcRegister.Indirect) {
+         indIdx = storage->addrElement(src->SrcRegisterInd.Index);
+         indIdx = storage->extractIndex(indIdx);
+      }
+      if (src->SrcRegister.File == TGSI_FILE_CONSTANT) {
+         val = storage->constElement(src->SrcRegister.Index, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
+         val = storage->inputElement(src->SrcRegister.Index, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+         val = storage->tempElement(src->SrcRegister.Index);
+      } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
+         val = storage->outputElement(src->SrcRegister.Index, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+         val = storage->immediateElement(src->SrcRegister.Index);
+      } else {
+         fprintf(stderr, "ERROR: not supported llvm source %d\n", src->SrcRegister.File);
+         return;
+      }
+
+      inputs[i] = swizzleVector(val, src, storage);
+   }
+
+   /*if (inputs[0])
+     instr->printVector(inputs[0]);
+     if (inputs[1])
+     instr->printVector(inputs[1]);*/
+   llvm::Value *out = 0;
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL: {
+      out = instr->arl(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_MOV: {
+      out = inputs[0];
+   }
+      break;
+   case TGSI_OPCODE_LIT: {
+      out = instr->lit(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_RCP: {
+      out = instr->rcp(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_RSQ: {
+      out = instr->rsq(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_EXP:
+      break;
+   case TGSI_OPCODE_LOG:
+      break;
+   case TGSI_OPCODE_MUL: {
+      out = instr->mul(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_ADD: {
+      out = instr->add(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DP3: {
+      out = instr->dp3(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DP4: {
+      out = instr->dp4(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DST: {
+      out = instr->dst(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MIN: {
+      out = instr->min(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MAX: {
+      out = instr->max(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_SLT: {
+      out = instr->slt(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_SGE: {
+      out = instr->sge(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MAD: {
+      out = instr->madd(inputs[0], inputs[1], inputs[2]);
+   }
+      break;
+   case TGSI_OPCODE_SUB: {
+      out = instr->sub(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_LERP: {
+      out = instr->lerp(inputs[0], inputs[1], inputs[2]);
+   }
+      break;
+   case TGSI_OPCODE_CND:
+      break;
+   case TGSI_OPCODE_CND0:
+      break;
+   case TGSI_OPCODE_DOT2ADD:
+      break;
+   case TGSI_OPCODE_INDEX:
+      break;
+   case TGSI_OPCODE_NEGATE:
+      break;
+   case TGSI_OPCODE_FRAC: {
+      out = instr->frc(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_CLAMP:
+      break;
+   case TGSI_OPCODE_FLOOR: {
+      out = instr->floor(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_ROUND:
+      break;
+   case TGSI_OPCODE_EXPBASE2: {
+      out = instr->ex2(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_LOGBASE2: {
+      out = instr->lg2(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_POWER: {
+      out = instr->pow(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_CROSSPRODUCT: {
+      out = instr->cross(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      break;
+   case TGSI_OPCODE_ABS: {
+      out = instr->abs(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_RCC:
+      break;
+   case TGSI_OPCODE_DPH: {
+      out = instr->dph(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_COS: {
+      out = instr->cos(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_DDX:
+      break;
+   case TGSI_OPCODE_DDY:
+      break;
+   case TGSI_OPCODE_KILP: {
+      out = instr->kilp(inputs[0]);
+      storage->setKilElement(out);
+      return;
+   }
+      break;
+   case TGSI_OPCODE_PK2H:
+      break;
+   case TGSI_OPCODE_PK2US:
+      break;
+   case TGSI_OPCODE_PK4B:
+      break;
+   case TGSI_OPCODE_PK4UB:
+      break;
+   case TGSI_OPCODE_RFL:
+      break;
+   case TGSI_OPCODE_SEQ:
+      break;
+   case TGSI_OPCODE_SFL:
+      break;
+   case TGSI_OPCODE_SGT: {
+      out = instr->sgt(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_SIN: {
+      out = instr->sin(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_SLE:
+      break;
+   case TGSI_OPCODE_SNE:
+      break;
+   case TGSI_OPCODE_STR:
+      break;
+   case TGSI_OPCODE_TEX:
+      break;
+   case TGSI_OPCODE_TXD:
+      break;
+   case TGSI_OPCODE_UP2H:
+      break;
+   case TGSI_OPCODE_UP2US:
+      break;
+   case TGSI_OPCODE_UP4B:
+      break;
+   case TGSI_OPCODE_UP4UB:
+      break;
+   case TGSI_OPCODE_X2D:
+      break;
+   case TGSI_OPCODE_ARA:
+      break;
+   case TGSI_OPCODE_ARR:
+      break;
+   case TGSI_OPCODE_BRA:
+      break;
+   case TGSI_OPCODE_CAL: {
+      instr->cal(inst->InstructionExtLabel.Label, storage->inputPtr());
+      return;
+   }
+      break;
+   case TGSI_OPCODE_RET: {
+      instr->end();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_SSG:
+      break;
+   case TGSI_OPCODE_CMP: {
+      out = instr->cmp(inputs[0], inputs[1], inputs[2]);
+   }
+      break;
+   case TGSI_OPCODE_SCS: {
+      out = instr->scs(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_TXB:
+      break;
+   case TGSI_OPCODE_NRM:
+      break;
+   case TGSI_OPCODE_DIV:
+      break;
+   case TGSI_OPCODE_DP2:
+      break;
+   case TGSI_OPCODE_TXL:
+      break;
+   case TGSI_OPCODE_BRK: {
+      instr->brk();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_IF: {
+      instr->ifop(inputs[0]);
+      storage->setCurrentBlock(instr->currentBlock());
+      return;  //just update the state
+   }
+      break;
+   case TGSI_OPCODE_LOOP:
+      break;
+   case TGSI_OPCODE_REP:
+      break;
+   case TGSI_OPCODE_ELSE: {
+      instr->elseop();
+      storage->setCurrentBlock(instr->currentBlock());
+      return; //only state update
+   }
+      break;
+   case TGSI_OPCODE_ENDIF: {
+      instr->endif();
+      storage->setCurrentBlock(instr->currentBlock());
+      return; //just update the state
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      break;
+   case TGSI_OPCODE_ENDREP:
+      break;
+   case TGSI_OPCODE_PUSHA:
+      break;
+   case TGSI_OPCODE_POPA:
+      break;
+   case TGSI_OPCODE_CEIL:
+      break;
+   case TGSI_OPCODE_I2F:
+      break;
+   case TGSI_OPCODE_NOT:
+      break;
+   case TGSI_OPCODE_TRUNC: {
+      out = instr->trunc(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_SHL:
+      break;
+   case TGSI_OPCODE_SHR:
+      break;
+   case TGSI_OPCODE_AND:
+      break;
+   case TGSI_OPCODE_OR:
+      break;
+   case TGSI_OPCODE_MOD:
+      break;
+   case TGSI_OPCODE_XOR:
+      break;
+   case TGSI_OPCODE_SAD:
+      break;
+   case TGSI_OPCODE_TXF:
+      break;
+   case TGSI_OPCODE_TXQ:
+      break;
+   case TGSI_OPCODE_CONT:
+      break;
+   case TGSI_OPCODE_EMIT:
+      break;
+   case TGSI_OPCODE_ENDPRIM:
+      break;
+   case TGSI_OPCODE_BGNLOOP2: {
+      instr->beginLoop();
+      storage->setCurrentBlock(instr->currentBlock());
+      return;
+   }
+      break;
+   case TGSI_OPCODE_BGNSUB: {
+      instr->bgnSub(instno);
+      storage->setCurrentBlock(instr->currentBlock());
+      storage->pushTemps();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP2: {
+      instr->endLoop();
+      storage->setCurrentBlock(instr->currentBlock());
+      return;
+   }
+      break;
+   case TGSI_OPCODE_ENDSUB: {
+      instr->endSub();
+      storage->setCurrentBlock(instr->currentBlock());
+      storage->popArguments();
+      storage->popTemps();
+      return;
+   }
+      break;
+   case TGSI_OPCODE_NOISE1:
+      break;
+   case TGSI_OPCODE_NOISE2:
+      break;
+   case TGSI_OPCODE_NOISE3:
+      break;
+   case TGSI_OPCODE_NOISE4:
+      break;
+   case TGSI_OPCODE_NOP:
+      break;
+   case TGSI_OPCODE_TEXBEM:
+      break;
+   case TGSI_OPCODE_TEXBEML:
+      break;
+   case TGSI_OPCODE_TEXREG2AR:
+      break;
+   case TGSI_OPCODE_TEXM3X2PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X2TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X3TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3SPEC:
+      break;
+   case TGSI_OPCODE_TEXM3X3VSPEC:
+      break;
+   case TGSI_OPCODE_TEXREG2GB:
+      break;
+   case TGSI_OPCODE_TEXREG2RGB:
+      break;
+   case TGSI_OPCODE_TEXDP3TEX:
+      break;
+   case TGSI_OPCODE_TEXDP3:
+      break;
+   case TGSI_OPCODE_TEXM3X3:
+      break;
+   case TGSI_OPCODE_TEXM3X2DEPTH:
+      break;
+   case TGSI_OPCODE_TEXDEPTH:
+      break;
+   case TGSI_OPCODE_BEM:
+      break;
+   case TGSI_OPCODE_M4X3:
+      break;
+   case TGSI_OPCODE_M3X4:
+      break;
+   case TGSI_OPCODE_M3X3:
+      break;
+   case TGSI_OPCODE_M3X2:
+      break;
+   case TGSI_OPCODE_NRM4:
+      break;
+   case TGSI_OPCODE_CALLNZ:
+      break;
+   case TGSI_OPCODE_IFC:
+      break;
+   case TGSI_OPCODE_BREAKC:
+      break;
+   case TGSI_OPCODE_KIL:
+      break;
+   case TGSI_OPCODE_END:
+      instr->end();
+      return;
+      break;
+   default:
+      fprintf(stderr, "ERROR: Unknown opcode %d\n",
+              inst->Instruction.Opcode);
+      assert(0);
+      break;
+   }
+
+   if (!out) {
+      fprintf(stderr, "ERROR: unsupported opcode %d\n",
+              inst->Instruction.Opcode);
+      assert(!"Unsupported opcode");
+   }
+
+   /* # not sure if we need this */
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      /*TXT( "_SAT" );*/
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      /*TXT( "_SAT[-1,1]" );*/
+      break;
+   default:
+      assert( 0 );
+   }
+
+   /* store results  */
+   for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
+      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+
+      if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+         storage->setOutputElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
+         storage->setTempElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
+         storage->setAddrElement(dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else {
+         fprintf(stderr, "ERROR: unsupported LLVM destination!");
+         assert(!"wrong destination");
+      }
+   }
+}
+
+
+static void
+translate_instructionir(llvm::Module *module,
+                        StorageSoa *storage,
+                        InstructionsSoa *instr,
+                        struct tgsi_full_instruction *inst,
+                        struct tgsi_full_instruction *fi,
+                        unsigned instno)
+{
+   std::vector< std::vector<llvm::Value*> > inputs(inst->Instruction.NumSrcRegs);
+
+   for (int i = 0; i < inst->Instruction.NumSrcRegs; ++i) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+      std::vector<llvm::Value*> val;
+      llvm::Value *indIdx = 0;
+      int swizzle = swizzleInt(src);
+
+      if (src->SrcRegister.Indirect) {
+         indIdx = storage->addrElement(src->SrcRegisterInd.Index);
+      }
+      if (src->SrcRegister.File == TGSI_FILE_CONSTANT) {
+         val = storage->load(StorageSoa::Const,
+                             src->SrcRegister.Index, swizzle, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_INPUT) {
+         val = storage->load(StorageSoa::Input,
+                             src->SrcRegister.Index, swizzle, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_TEMPORARY) {
+         val = storage->load(StorageSoa::Temp,
+                             src->SrcRegister.Index, swizzle, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_OUTPUT) {
+         val = storage->load(StorageSoa::Output,
+                             src->SrcRegister.Index, swizzle, indIdx);
+      } else if (src->SrcRegister.File == TGSI_FILE_IMMEDIATE) {
+         val = storage->load(StorageSoa::Immediate,
+                             src->SrcRegister.Index, swizzle, indIdx);
+      } else {
+         fprintf(stderr, "ERROR: not supported llvm source %d\n", src->SrcRegister.File);
+         return;
+      }
+
+      inputs[i] = val;
+   }
+
+   std::vector<llvm::Value*> out(4);
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL: {
+      out = instr->arl(inputs[0]);
+   }
+      break;
+   case TGSI_OPCODE_MOV: {
+      out = inputs[0];
+   }
+      break;
+   case TGSI_OPCODE_LIT: {
+   }
+      break;
+   case TGSI_OPCODE_RCP: {
+   }
+      break;
+   case TGSI_OPCODE_RSQ: {
+   }
+      break;
+   case TGSI_OPCODE_EXP:
+      break;
+   case TGSI_OPCODE_LOG:
+      break;
+   case TGSI_OPCODE_MUL: {
+      out = instr->mul(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_ADD: {
+      out = instr->add(inputs[0], inputs[1]);
+   }
+      break;
+   case TGSI_OPCODE_DP3: {
+   }
+      break;
+   case TGSI_OPCODE_DP4: {
+   }
+      break;
+   case TGSI_OPCODE_DST: {
+   }
+      break;
+   case TGSI_OPCODE_MIN: {
+   }
+      break;
+   case TGSI_OPCODE_MAX: {
+   }
+      break;
+   case TGSI_OPCODE_SLT: {
+   }
+      break;
+   case TGSI_OPCODE_SGE: {
+   }
+      break;
+   case TGSI_OPCODE_MAD: {
+      out = instr->madd(inputs[0], inputs[1], inputs[2]);
+   }
+      break;
+   case TGSI_OPCODE_SUB: {
+   }
+      break;
+   case TGSI_OPCODE_LERP: {
+   }
+      break;
+   case TGSI_OPCODE_CND:
+      break;
+   case TGSI_OPCODE_CND0:
+      break;
+   case TGSI_OPCODE_DOT2ADD:
+      break;
+   case TGSI_OPCODE_INDEX:
+      break;
+   case TGSI_OPCODE_NEGATE:
+      break;
+   case TGSI_OPCODE_FRAC: {
+   }
+      break;
+   case TGSI_OPCODE_CLAMP:
+      break;
+   case TGSI_OPCODE_FLOOR: {
+   }
+      break;
+   case TGSI_OPCODE_ROUND:
+      break;
+   case TGSI_OPCODE_EXPBASE2: {
+   }
+      break;
+   case TGSI_OPCODE_LOGBASE2: {
+   }
+      break;
+   case TGSI_OPCODE_POWER: {
+   }
+      break;
+   case TGSI_OPCODE_CROSSPRODUCT: {
+   }
+      break;
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      break;
+   case TGSI_OPCODE_ABS: {
+   }
+      break;
+   case TGSI_OPCODE_RCC:
+      break;
+   case TGSI_OPCODE_DPH: {
+   }
+      break;
+   case TGSI_OPCODE_COS: {
+   }
+      break;
+   case TGSI_OPCODE_DDX:
+      break;
+   case TGSI_OPCODE_DDY:
+      break;
+   case TGSI_OPCODE_KILP: {
+   }
+      break;
+   case TGSI_OPCODE_PK2H:
+      break;
+   case TGSI_OPCODE_PK2US:
+      break;
+   case TGSI_OPCODE_PK4B:
+      break;
+   case TGSI_OPCODE_PK4UB:
+      break;
+   case TGSI_OPCODE_RFL:
+      break;
+   case TGSI_OPCODE_SEQ:
+      break;
+   case TGSI_OPCODE_SFL:
+      break;
+   case TGSI_OPCODE_SGT: {
+   }
+      break;
+   case TGSI_OPCODE_SIN: {
+   }
+      break;
+   case TGSI_OPCODE_SLE:
+      break;
+   case TGSI_OPCODE_SNE:
+      break;
+   case TGSI_OPCODE_STR:
+      break;
+   case TGSI_OPCODE_TEX:
+      break;
+   case TGSI_OPCODE_TXD:
+      break;
+   case TGSI_OPCODE_UP2H:
+      break;
+   case TGSI_OPCODE_UP2US:
+      break;
+   case TGSI_OPCODE_UP4B:
+      break;
+   case TGSI_OPCODE_UP4UB:
+      break;
+   case TGSI_OPCODE_X2D:
+      break;
+   case TGSI_OPCODE_ARA:
+      break;
+   case TGSI_OPCODE_ARR:
+      break;
+   case TGSI_OPCODE_BRA:
+      break;
+   case TGSI_OPCODE_CAL: {
+   }
+      break;
+   case TGSI_OPCODE_RET: {
+   }
+      break;
+   case TGSI_OPCODE_SSG:
+      break;
+   case TGSI_OPCODE_CMP: {
+   }
+      break;
+   case TGSI_OPCODE_SCS: {
+   }
+      break;
+   case TGSI_OPCODE_TXB:
+      break;
+   case TGSI_OPCODE_NRM:
+      break;
+   case TGSI_OPCODE_DIV:
+      break;
+   case TGSI_OPCODE_DP2:
+      break;
+   case TGSI_OPCODE_TXL:
+      break;
+   case TGSI_OPCODE_BRK: {
+   }
+      break;
+   case TGSI_OPCODE_IF: {
+   }
+      break;
+   case TGSI_OPCODE_LOOP:
+      break;
+   case TGSI_OPCODE_REP:
+      break;
+   case TGSI_OPCODE_ELSE: {
+   }
+      break;
+   case TGSI_OPCODE_ENDIF: {
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP:
+      break;
+   case TGSI_OPCODE_ENDREP:
+      break;
+   case TGSI_OPCODE_PUSHA:
+      break;
+   case TGSI_OPCODE_POPA:
+      break;
+   case TGSI_OPCODE_CEIL:
+      break;
+   case TGSI_OPCODE_I2F:
+      break;
+   case TGSI_OPCODE_NOT:
+      break;
+   case TGSI_OPCODE_TRUNC: {
+   }
+      break;
+   case TGSI_OPCODE_SHL:
+      break;
+   case TGSI_OPCODE_SHR:
+      break;
+   case TGSI_OPCODE_AND:
+      break;
+   case TGSI_OPCODE_OR:
+      break;
+   case TGSI_OPCODE_MOD:
+      break;
+   case TGSI_OPCODE_XOR:
+      break;
+   case TGSI_OPCODE_SAD:
+      break;
+   case TGSI_OPCODE_TXF:
+      break;
+   case TGSI_OPCODE_TXQ:
+      break;
+   case TGSI_OPCODE_CONT:
+      break;
+   case TGSI_OPCODE_EMIT:
+      break;
+   case TGSI_OPCODE_ENDPRIM:
+      break;
+   case TGSI_OPCODE_BGNLOOP2: {
+   }
+      break;
+   case TGSI_OPCODE_BGNSUB: {
+   }
+      break;
+   case TGSI_OPCODE_ENDLOOP2: {
+   }
+      break;
+   case TGSI_OPCODE_ENDSUB: {
+   }
+      break;
+   case TGSI_OPCODE_NOISE1:
+      break;
+   case TGSI_OPCODE_NOISE2:
+      break;
+   case TGSI_OPCODE_NOISE3:
+      break;
+   case TGSI_OPCODE_NOISE4:
+      break;
+   case TGSI_OPCODE_NOP:
+      break;
+   case TGSI_OPCODE_TEXBEM:
+      break;
+   case TGSI_OPCODE_TEXBEML:
+      break;
+   case TGSI_OPCODE_TEXREG2AR:
+      break;
+   case TGSI_OPCODE_TEXM3X2PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X2TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3PAD:
+      break;
+   case TGSI_OPCODE_TEXM3X3TEX:
+      break;
+   case TGSI_OPCODE_TEXM3X3SPEC:
+      break;
+   case TGSI_OPCODE_TEXM3X3VSPEC:
+      break;
+   case TGSI_OPCODE_TEXREG2GB:
+      break;
+   case TGSI_OPCODE_TEXREG2RGB:
+      break;
+   case TGSI_OPCODE_TEXDP3TEX:
+      break;
+   case TGSI_OPCODE_TEXDP3:
+      break;
+   case TGSI_OPCODE_TEXM3X3:
+      break;
+   case TGSI_OPCODE_TEXM3X2DEPTH:
+      break;
+   case TGSI_OPCODE_TEXDEPTH:
+      break;
+   case TGSI_OPCODE_BEM:
+      break;
+   case TGSI_OPCODE_M4X3:
+      break;
+   case TGSI_OPCODE_M3X4:
+      break;
+   case TGSI_OPCODE_M3X3:
+      break;
+   case TGSI_OPCODE_M3X2:
+      break;
+   case TGSI_OPCODE_NRM4:
+      break;
+   case TGSI_OPCODE_CALLNZ:
+      break;
+   case TGSI_OPCODE_IFC:
+      break;
+   case TGSI_OPCODE_BREAKC:
+      break;
+   case TGSI_OPCODE_KIL:
+      break;
+   case TGSI_OPCODE_END:
+      instr->end();
+      return;
+      break;
+   default:
+      fprintf(stderr, "ERROR: Unknown opcode %d\n",
+              inst->Instruction.Opcode);
+      assert(0);
+      break;
+   }
+
+   if (!out[0]) {
+      fprintf(stderr, "ERROR: unsupported opcode %d\n",
+              inst->Instruction.Opcode);
+      assert(!"Unsupported opcode");
+   }
+
+   /* store results  */
+   for (int i = 0; i < inst->Instruction.NumDstRegs; ++i) {
+      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+
+      if (dst->DstRegister.File == TGSI_FILE_OUTPUT) {
+         storage->store(StorageSoa::Output,
+                        dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_TEMPORARY) {
+         storage->store(StorageSoa::Temp,
+                        dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else if (dst->DstRegister.File == TGSI_FILE_ADDRESS) {
+         storage->store(StorageSoa::Address,
+                        dst->DstRegister.Index, out, dst->DstRegister.WriteMask);
+      } else {
+         fprintf(stderr, "ERROR: unsupported LLVM destination!");
+         assert(!"wrong destination");
+      }
+   }
+}
+
+llvm::Module *
+tgsi_to_llvm(struct gallivm_ir *ir, const struct tgsi_token *tokens)
+{
+   llvm::Module *mod = new Module("shader");
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction fi;
+   struct tgsi_full_declaration fd;
+   unsigned instno = 0;
+   Function* shader = mod->getFunction("execute_shader");
+   std::ostringstream stream;
+   if (ir->type == GALLIVM_VS) {
+      stream << "vs_shader";
+   } else {
+      stream << "fs_shader";
+   }
+   stream << ir->id;
+   std::string func_name = stream.str();
+   shader->setName(func_name.c_str());
+
+   Function::arg_iterator args = shader->arg_begin();
+   Value *ptr_INPUT = args++;
+   ptr_INPUT->setName("input");
+
+   BasicBlock *label_entry = new BasicBlock("entry", shader, 0);
+
+   tgsi_parse_init(&parse, tokens);
+
+   fi = tgsi_default_full_instruction();
+   fd = tgsi_default_full_declaration();
+   Storage storage(label_entry, ptr_INPUT);
+   Instructions instr(mod, shader, label_entry, &storage);
+   while(!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         translate_declaration(ir, mod, &storage,
+                               &parse.FullToken.FullDeclaration,
+                               &fd);
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         translate_immediate(&storage,
+                             &parse.FullToken.FullImmediate);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         translate_instruction(mod, &storage, &instr,
+                               &parse.FullToken.FullInstruction,
+                               &fi, instno);
+         ++instno;
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   tgsi_parse_free(&parse);
+
+   ir->num_consts = storage.numConsts();
+   return mod;
+}
+
+llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
+                              const struct tgsi_token *tokens)
+{
+   llvm::Module *mod = new Module("shader");
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction fi;
+   struct tgsi_full_declaration fd;
+   unsigned instno = 0;
+   std::ostringstream stream;
+   if (ir->type == GALLIVM_VS) {
+      stream << "vs_shader";
+   } else {
+      stream << "fs_shader";
+   }
+   //stream << ir->id;
+   std::string func_name = stream.str();
+   Function *shader = llvm::cast<Function>(mod->getOrInsertFunction(
+                                              func_name.c_str(),
+                                              vertexShaderFunctionType()));
+
+   Function::arg_iterator args = shader->arg_begin();
+   Value *input = args++;
+   input->setName("inputs");
+   Value *output = args++;
+   output->setName("outputs");
+   Value *consts = args++;
+   consts->setName("consts");
+   Value *temps = args++;
+   temps->setName("temps");
+
+   BasicBlock *label_entry = new BasicBlock("entry", shader, 0);
+
+   tgsi_parse_init(&parse, tokens);
+
+   fi = tgsi_default_full_instruction();
+   fd = tgsi_default_full_declaration();
+
+   StorageSoa storage(label_entry, input, output, consts, temps);
+   InstructionsSoa instr(mod, shader, label_entry, &storage);
+
+   while(!tgsi_parse_end_of_tokens(&parse)) {
+      tgsi_parse_token(&parse);
+
+      switch (parse.FullToken.Token.Type) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         translate_declarationir(ir, mod, &storage,
+                                 &parse.FullToken.FullDeclaration,
+                                 &fd);
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         translate_immediateir(&storage,
+                             &parse.FullToken.FullImmediate);
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         storage.declareImmediates();
+         translate_instructionir(mod, &storage, &instr,
+                                 &parse.FullToken.FullInstruction,
+                                 &fi, instno);
+         ++instno;
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+
+   tgsi_parse_free(&parse);
+
+   return mod;
+}
diff --git a/src/gallium/auxiliary/llvm/tgsitollvm.h b/src/gallium/auxiliary/llvm/tgsitollvm.h
new file mode 100644
index 00000000000..7ada04d6299
--- /dev/null
+++ b/src/gallium/auxiliary/llvm/tgsitollvm.h
@@ -0,0 +1,20 @@
+#ifndef TGSITOLLVM_H
+#define TGSITOLLVM_H
+
+
+namespace llvm {
+   class Module;
+}
+
+struct gallivm_ir;
+struct tgsi_token;
+
+
+llvm::Module * tgsi_to_llvm(struct gallivm_ir *ir,
+                            const struct tgsi_token *tokens);
+
+
+llvm::Module * tgsi_to_llvmir(struct gallivm_ir *ir,
+                              const struct tgsi_token *tokens);
+
+#endif
diff --git a/src/gallium/auxiliary/pipebuffer/Makefile b/src/gallium/auxiliary/pipebuffer/Makefile
new file mode 100644
index 00000000000..588629e8701
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/Makefile
@@ -0,0 +1,23 @@
+
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = pipebuffer
+
+DRIVER_SOURCES = \
+	pb_buffer_fenced.c \
+	pb_buffer_malloc.c \
+	pb_bufmgr_fenced.c \
+	pb_bufmgr_mm.c \
+	pb_bufmgr_pool.c \
+	pb_winsys.c
+
+C_SOURCES = \
+	$(DRIVER_SOURCES)
+
+ASM_SOURCES = 
+
+include ../../Makefile.template
+
+symlinks:
+
diff --git a/src/gallium/auxiliary/pipebuffer/linked_list.h b/src/gallium/auxiliary/pipebuffer/linked_list.h
new file mode 100644
index 00000000000..e99817fb130
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/linked_list.h
@@ -0,0 +1,91 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND. USA.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM, 
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ **************************************************************************/
+
+/**
+ * \file
+ * List macros heavily inspired by the Linux kernel
+ * list handling. No list looping yet.
+ * 
+ * Is not threadsafe, so common operations need to
+ * be protected using an external mutex.
+ */
+
+#ifndef LINKED_LIST_H_
+#define LINKED_LIST_H_
+
+
+#include <stddef.h>
+
+
+struct list_head
+{
+    struct list_head *prev;
+    struct list_head *next;
+};
+
+
+#define LIST_INITHEAD(__item)			\
+  do {						\
+    (__item)->prev = (__item);			\
+    (__item)->next = (__item);			\
+  } while (0)
+
+#define LIST_ADD(__item, __list)		\
+  do {						\
+    (__item)->prev = (__list);			\
+    (__item)->next = (__list)->next;		\
+    (__list)->next->prev = (__item);		\
+    (__list)->next = (__item);			\
+  } while (0)
+
+#define LIST_ADDTAIL(__item, __list)		\
+  do {						\
+    (__item)->next = (__list);			\
+    (__item)->prev = (__list)->prev;		\
+    (__list)->prev->next = (__item);		\
+    (__list)->prev = (__item);			\
+  } while(0)
+
+#define LIST_DEL(__item)			\
+  do {						\
+    (__item)->prev->next = (__item)->next;	\
+    (__item)->next->prev = (__item)->prev;	\
+  } while(0)
+
+#define LIST_DELINIT(__item)			\
+  do {						\
+    (__item)->prev->next = (__item)->next;	\
+    (__item)->next->prev = (__item)->prev;	\
+    (__item)->next = (__item);			\
+    (__item)->prev = (__item);			\
+  } while(0)
+
+#define LIST_ENTRY(__type, __item, __field)   \
+    ((__type *)(((char *)(__item)) - offsetof(__type, __field)))
+
+
+#endif /*LINKED_LIST_H_*/
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer.h b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
new file mode 100644
index 00000000000..97beb5f72a9
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer.h
@@ -0,0 +1,202 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Generic code for buffers.
+ * 
+ * Behind a pipe buffle handle there can be DMA buffers, client (or user) 
+ * buffers, regular malloced buffers, etc. This file provides an abstract base 
+ * buffer handle that allows the driver to cope with all those kinds of buffers 
+ * in a more flexible way.
+ * 
+ * There is no obligation of a winsys driver to use this library. And a pipe
+ * driver should be completly agnostic about it.
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+#ifndef PB_BUFFER_H_
+#define PB_BUFFER_H_
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_debug.h"
+#include "pipe/p_state.h"
+#include "pipe/p_inlines.h"
+
+
+struct pb_vtbl;
+
+/**
+ * Buffer description.
+ * 
+ * Used when allocating the buffer.
+ */
+struct pb_desc
+{
+   unsigned alignment;
+   unsigned usage;
+};
+
+
+/**
+ * Base class for all pb_* buffers.
+ */
+struct pb_buffer 
+{
+   struct pipe_buffer base;
+
+   /**
+    * Pointer to the virtual function table.
+    *
+    * Avoid accessing this table directly. Use the inline functions below 
+    * instead to avoid mistakes. 
+    */
+   const struct pb_vtbl *vtbl;
+};
+
+
+/**
+ * Virtual function table for the buffer storage operations.
+ * 
+ * Note that creation is not done through this table.
+ */
+struct pb_vtbl
+{
+   void (*destroy)( struct pb_buffer *buf );
+
+   /** 
+    * Map the entire data store of a buffer object into the client's address.
+    * flags is bitmask of PIPE_BUFFER_FLAG_READ/WRITE. 
+    */
+   void *(*map)( struct pb_buffer *buf, 
+                 unsigned flags );
+   
+   void (*unmap)( struct pb_buffer *buf );
+
+   /**
+    * Get the base buffer and the offset.
+    * 
+    * A buffer can be subdivided in smaller buffers. This method should return
+    * the underlaying buffer, and the relative offset.
+    * 
+    * Buffers without an underlaying base buffer should return themselves, with 
+    * a zero offset.
+    * 
+    * Note that this will increase the reference count of the base buffer.
+    */
+   void (*get_base_buffer)( struct pb_buffer *buf,
+                            struct pb_buffer **base_buf,
+                            unsigned *offset );
+};
+
+
+static INLINE struct pipe_buffer *
+pb_pipe_buffer( struct pb_buffer *pbuf )
+{
+   assert(pbuf);
+   return &pbuf->base;
+}
+
+
+static INLINE struct pb_buffer *
+pb_buffer( struct pipe_buffer *buf )
+{
+   assert(buf);
+   /* Could add a magic cookie check on debug builds.
+    */
+   return (struct pb_buffer *)buf;
+}
+
+
+/* Accessor functions for pb->vtbl:
+ */
+static INLINE void *
+pb_map(struct pb_buffer *buf, 
+       unsigned flags)
+{
+   assert(buf);
+   return buf->vtbl->map(buf, flags);
+}
+
+
+static INLINE void 
+pb_unmap(struct pb_buffer *buf)
+{
+   assert(buf);
+   buf->vtbl->unmap(buf);
+}
+
+
+static INLINE void
+pb_get_base_buffer( struct pb_buffer *buf,
+		    struct pb_buffer **base_buf,
+		    unsigned *offset )
+{
+   buf->vtbl->get_base_buffer(buf, base_buf, offset);
+}
+
+
+static INLINE void 
+pb_destroy(struct pb_buffer *buf)
+{
+   assert(buf);
+   buf->vtbl->destroy(buf);
+}
+
+
+/* XXX: thread safety issues!
+ */
+static INLINE void
+pb_reference(struct pb_buffer **dst,
+             struct pb_buffer *src)
+{
+   if (src) 
+      src->base.refcount++;
+
+   if (*dst && --(*dst)->base.refcount == 0)
+      pb_destroy( *dst );
+
+   *dst = src;
+}
+
+
+/**
+ * Malloc-based buffer to store data that can't be used by the graphics 
+ * hardware.
+ */
+struct pb_buffer *
+pb_malloc_buffer_create(size_t size, 
+                        const struct pb_desc *desc);
+
+
+void 
+pb_init_winsys(struct pipe_winsys *winsys);
+
+
+#endif /*PB_BUFFER_H_*/
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
new file mode 100644
index 00000000000..f4fc3f6d714
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.c
@@ -0,0 +1,299 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Implementation of fenced buffers.
+ * 
+ * \author Jos� Fonseca <jrfonseca-at-tungstengraphics-dot-com>
+ * \author Thomas Hellstr�m <thomas-at-tungstengraphics-dot-com>
+ */
+
+
+#include "linked_list.h"
+
+#include "p_compiler.h"
+#include "p_debug.h"
+#include "p_winsys.h"
+#include "p_thread.h"
+#include "p_util.h"
+
+#include "pb_buffer.h"
+#include "pb_buffer_fenced.h"
+
+#ifndef __MSC__
+#include <unistd.h>
+#endif
+
+
+/**
+ * Convenience macro (type safe).
+ */
+#define SUPER(__derived) (&(__derived)->base)
+
+
+struct fenced_buffer_list
+{
+   _glthread_Mutex mutex;
+   
+   struct pipe_winsys *winsys;
+   
+   size_t numDelayed;
+   size_t checkDelayed;
+   
+   struct list_head delayed;
+};
+
+
+/**
+ * Wrapper around a pipe buffer which adds fencing and reference counting.
+ */
+struct fenced_buffer
+{
+   struct pb_buffer base;
+   
+   struct pb_buffer *buffer;
+
+   struct pipe_fence_handle *fence;
+
+   struct list_head head;
+   struct fenced_buffer_list *list;
+};
+
+
+static INLINE struct fenced_buffer *
+fenced_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   assert(buf->vtbl == &fenced_buffer_vtbl);
+   return (struct fenced_buffer *)buf;
+}
+
+
+
+
+static void
+_fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
+                               int wait)
+{
+   struct pipe_winsys *winsys = fenced_list->winsys;
+   struct fenced_buffer *fenced_buf;   
+   struct list_head *list, *prev;
+   int signaled = -1;
+
+   list = fenced_list->delayed.next;
+
+   if (fenced_list->numDelayed > 3) {
+      unsigned i;
+
+      for (i = 0; i < fenced_list->numDelayed; i += 3) {
+         list = list->next;
+      }
+   }
+
+   prev = list->prev;
+   for (; list != &fenced_list->delayed; list = prev, prev = list->prev) {
+
+      fenced_buf = LIST_ENTRY(struct fenced_buffer, list, head);
+
+      if (signaled != 0) {
+         if (wait) {
+            signaled = winsys->fence_finish(winsys, fenced_buf->fence, 0);
+         }
+         else {
+            signaled = winsys->fence_signalled(winsys, fenced_buf->fence, 0);
+         }
+      }
+
+      if (signaled != 0)
+	 /* XXX: we are assuming that buffers are freed in the same order they 
+	  * are fenced which may not always be true... 
+	  */
+         break;
+
+      winsys->fence_reference(winsys, &fenced_buf->fence, NULL);
+      
+      LIST_DEL(list);
+      fenced_list->numDelayed--;
+
+      /* Do the delayed destroy:
+       */
+      pb_reference(&fenced_buf->buffer, NULL);
+      FREE(fenced_buf);
+   }
+}
+
+
+static void
+fenced_buffer_destroy(struct pb_buffer *buf)
+{
+   struct fenced_buffer *fenced_buf = fenced_buffer(buf);   
+   struct fenced_buffer_list *fenced_list = fenced_buf->list;
+
+   if (fenced_buf->fence) {
+      LIST_ADDTAIL(&fenced_buf->head, &fenced_list->delayed);
+      fenced_list->numDelayed++;
+   }
+   else {
+      pb_reference(&fenced_buf->buffer, NULL);
+      FREE(fenced_buf);
+   }
+   
+   if ((fenced_list->numDelayed % fenced_list->checkDelayed) == 0)
+      _fenced_buffer_list_check_free(fenced_list, 0);
+}
+
+
+static void *
+fenced_buffer_map(struct pb_buffer *buf, 
+                  unsigned flags)
+{
+   struct fenced_buffer *fenced_buf = fenced_buffer(buf);   
+   return pb_map(fenced_buf->buffer, flags);
+}
+
+
+static void
+fenced_buffer_unmap(struct pb_buffer *buf)
+{
+   struct fenced_buffer *fenced_buf = fenced_buffer(buf);   
+   pb_unmap(fenced_buf->buffer);
+}
+
+
+static void
+fenced_buffer_get_base_buffer(struct pb_buffer *buf,
+                              struct pb_buffer **base_buf,
+                              unsigned *offset)
+{
+   struct fenced_buffer *fenced_buf = fenced_buffer(buf);
+   pb_get_base_buffer(fenced_buf->buffer, base_buf, offset);
+}
+
+
+const struct pb_vtbl 
+fenced_buffer_vtbl = {
+      fenced_buffer_destroy,
+      fenced_buffer_map,
+      fenced_buffer_unmap,
+      fenced_buffer_get_base_buffer
+};
+
+
+struct pb_buffer *
+fenced_buffer_create(struct fenced_buffer_list *fenced_list, 
+                     struct pb_buffer *buffer)
+{
+   struct fenced_buffer *buf;
+   
+   if(!buffer)
+      return NULL;
+   
+   buf = CALLOC_STRUCT(fenced_buffer);
+   if(!buf)
+      return NULL;
+   
+   buf->base.base.refcount = 1;
+   buf->base.base.alignment = buffer->base.alignment;
+   buf->base.base.usage = buffer->base.usage;
+   buf->base.base.size = buffer->base.size;
+   
+   buf->base.vtbl = &fenced_buffer_vtbl;
+   buf->buffer = buffer;
+   buf->list = fenced_list;
+   
+   return &buf->base;
+}
+
+
+void
+buffer_fence(struct pb_buffer *buf,
+             struct pipe_fence_handle *fence)
+{
+   struct fenced_buffer *fenced_buf = fenced_buffer(buf);
+   struct fenced_buffer_list *fenced_list = fenced_buf->list;
+   struct pipe_winsys *winsys = fenced_list->winsys;
+   
+   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   winsys->fence_reference(winsys, &fenced_buf->fence, fence);
+   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+}
+
+
+struct fenced_buffer_list *
+fenced_buffer_list_create(struct pipe_winsys *winsys) 
+{
+   struct fenced_buffer_list *fenced_list;
+
+   fenced_list = (struct fenced_buffer_list *)CALLOC(1, sizeof(*fenced_list));
+   if (!fenced_list)
+      return NULL;
+
+   fenced_list->winsys = winsys;
+
+   LIST_INITHEAD(&fenced_list->delayed);
+
+   fenced_list->numDelayed = 0;
+   
+   /* TODO: don't hard code this */ 
+   fenced_list->checkDelayed = 5;
+
+   _glthread_INIT_MUTEX(fenced_list->mutex);
+
+   return fenced_list;
+}
+
+
+void
+fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
+                              int wait)
+{
+   _glthread_LOCK_MUTEX(fenced_list->mutex);
+   _fenced_buffer_list_check_free(fenced_list, wait);
+   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+}
+
+
+void
+fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list)
+{
+   _glthread_LOCK_MUTEX(fenced_list->mutex);
+
+   /* Wait on outstanding fences */
+   while (fenced_list->numDelayed) {
+      _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+      sched_yield();
+      _fenced_buffer_list_check_free(fenced_list, 1);
+      _glthread_LOCK_MUTEX(fenced_list->mutex);
+   }
+
+   _glthread_UNLOCK_MUTEX(fenced_list->mutex);
+   
+   FREE(fenced_list);
+}
+
+
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
new file mode 100644
index 00000000000..c40b9c75e18
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_fenced.h
@@ -0,0 +1,117 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Buffer fencing.
+ * 
+ * "Fenced buffers" is actually a misnomer. They should be referred as 
+ * "fenceable buffers", i.e, buffers that can be fenced, but I couldn't find
+ * the word "fenceable" in the dictionary.
+ * 
+ * A "fenced buffer" is a decorator around a normal buffer, which adds two 
+ * special properties:
+ * - the ability for the destruction to be delayed by a fence;
+ * - reference counting.
+ * 
+ * Usually DMA buffers have a life-time that will extend the life-time of its 
+ * handle. The end-of-life is dictated by the fence signalling. 
+ * 
+ * Between the handle's destruction, and the fence signalling, the buffer is 
+ * stored in a fenced buffer list.
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+#ifndef PB_BUFFER_FENCED_H_
+#define PB_BUFFER_FENCED_H_
+
+
+#include "pipe/p_debug.h"
+
+
+struct pipe_winsys;
+struct pipe_buffer;
+struct pipe_fence_handle;
+
+
+/**
+ * List of buffers which are awaiting fence signalling.
+ */
+struct fenced_buffer_list;
+
+
+/**
+ * The fenced buffer's virtual function table.
+ * 
+ * NOTE: Made public for debugging purposes.
+ */
+extern const struct pb_vtbl fenced_buffer_vtbl;
+
+
+/**
+ * Create a fenced buffer list.
+ * 
+ * See also fenced_bufmgr_create for a more convenient way to use this.
+ */
+struct fenced_buffer_list *
+fenced_buffer_list_create(struct pipe_winsys *winsys);
+
+
+/**
+ * Walk the fenced buffer list to check and free signalled buffers.
+ */ 
+void
+fenced_buffer_list_check_free(struct fenced_buffer_list *fenced_list, 
+                              int wait);
+
+void
+fenced_buffer_list_destroy(struct fenced_buffer_list *fenced_list);
+
+
+/**
+ * Wrap a buffer in a fenced buffer.
+ * 
+ * NOTE: this will not increase the buffer reference count.
+ */
+struct pb_buffer *
+fenced_buffer_create(struct fenced_buffer_list *fenced, 
+                     struct pb_buffer *buffer);
+
+
+/**
+ * Set a buffer's fence.
+ * 
+ * NOTE: Although it takes a generic pb_buffer argument, it will fail
+ * on everything but buffers returned by fenced_buffer_create.
+ */
+void
+buffer_fence(struct pb_buffer *buf,
+             struct pipe_fence_handle *fence);
+
+
+#endif /*PB_BUFFER_FENCED_H_*/
diff --git a/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
new file mode 100644
index 00000000000..9e8244f909b
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_buffer_malloc.c
@@ -0,0 +1,127 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Implementation of malloc-based buffers to store data that can't be processed
+ * by the hardware. 
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pb_buffer.h"
+
+
+struct malloc_buffer 
+{
+   struct pb_buffer base;
+   void *data;
+};
+
+
+extern const struct pb_vtbl malloc_buffer_vtbl;
+
+static INLINE struct malloc_buffer *
+malloc_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   assert(buf->vtbl == &malloc_buffer_vtbl);
+   return (struct malloc_buffer *)buf;
+}
+
+
+static void
+malloc_buffer_destroy(struct pb_buffer *buf)
+{
+   align_free(malloc_buffer(buf)->data);
+   FREE(buf);
+}
+
+
+static void *
+malloc_buffer_map(struct pb_buffer *buf, 
+                  unsigned flags)
+{
+   return malloc_buffer(buf)->data;
+}
+
+
+static void
+malloc_buffer_unmap(struct pb_buffer *buf)
+{
+   /* No-op */
+}
+
+
+static void
+malloc_buffer_get_base_buffer(struct pb_buffer *buf,
+                              struct pb_buffer **base_buf,
+                              unsigned *offset)
+{
+   *base_buf = buf;
+   *offset = 0;
+}
+
+
+const struct pb_vtbl 
+malloc_buffer_vtbl = {
+      malloc_buffer_destroy,
+      malloc_buffer_map,
+      malloc_buffer_unmap,
+      malloc_buffer_get_base_buffer
+};
+
+
+struct pb_buffer *
+pb_malloc_buffer_create(size_t size,
+                   	const struct pb_desc *desc) 
+{
+   struct malloc_buffer *buf;
+   
+   /* TODO: do a single allocation */
+   
+   buf = CALLOC_STRUCT(malloc_buffer);
+   if(!buf)
+      return NULL;
+
+   buf->base.base.refcount = 1;
+   buf->base.base.alignment = desc->alignment;
+   buf->base.base.usage = desc->usage;
+   buf->base.base.size = size;
+   buf->base.vtbl = &malloc_buffer_vtbl;
+
+   buf->data = align_malloc(size, desc->alignment < sizeof(void*) ? sizeof(void*) : desc->alignment);
+   if(!buf->data) {
+      align_free(buf);
+      return NULL;
+   }
+
+   return &buf->base;
+}
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
new file mode 100644
index 00000000000..1ddf784c978
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr.h
@@ -0,0 +1,126 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Buffer management.
+ * 
+ * A buffer manager does only one basic thing: it creates buffers. Actually,
+ * "buffer factory" would probably a more accurate description.
+ * 
+ * You can chain buffer managers so that you can have a finer grained memory
+ * management and pooling.
+ * 
+ * For example, for a simple batch buffer manager you would chain:
+ * - the native buffer manager, which provides DMA memory from the graphics
+ * memory space;
+ * - the pool buffer manager, which keep around a pool of equally sized buffers
+ * to avoid latency associated with the native buffer manager; 
+ * - the fenced buffer manager, which will delay buffer destruction until the 
+ * the moment the card finishing processing it. 
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+#ifndef PB_BUFMGR_H_
+#define PB_BUFMGR_H_
+
+
+#include <stddef.h>
+
+
+struct pb_desc;
+struct pipe_buffer;
+struct pipe_winsys;
+
+
+/** 
+ * Abstract base class for all buffer managers.
+ */
+struct pb_manager
+{
+   /* XXX: we will likely need more allocation flags */
+   struct pb_buffer *
+   (*create_buffer)( struct pb_manager *mgr, 
+	             size_t size,
+	             const struct pb_desc *desc);
+
+   void
+   (*destroy)( struct pb_manager *mgr );
+};
+
+
+/** 
+ * Static buffer pool manager.
+ * 
+ * Manages the allocation of equally sized buffers. It does so by allocating
+ * a single big buffer and divide it equally sized buffers. 
+ * 
+ * It is meant to manage the allocation of batch buffer pools.
+ */
+struct pb_manager *
+pool_bufmgr_create(struct pb_manager *provider, 
+                   size_t n, size_t size,
+                   const struct pb_desc *desc);
+
+
+/** 
+ * Wraper around the old memory manager.
+ * 
+ * It managers buffers of different sizes. It does so by allocating a buffer
+ * with the size of the heap, and then using the old mm memory manager to manage
+ * that heap. 
+ */
+struct pb_manager *
+mm_bufmgr_create(struct pb_manager *provider, 
+                 size_t size, size_t align2);
+
+/**
+ * Same as mm_bufmgr_create.
+ * 
+ * Buffer will be release when the manager is destroyed.
+ */
+struct pb_manager *
+mm_bufmgr_create_from_buffer(struct pb_buffer *buffer, 
+                             size_t size, size_t align2);
+
+
+/** 
+ * Fenced buffer manager.
+ *
+ * This manager is just meant for convenience. It wraps the buffers returned
+ * by another manager in fenced buffers, so that  
+ * 
+ * NOTE: the buffer manager that provides the buffers will be destroyed
+ * at the same time.
+ */
+struct pb_manager *
+fenced_bufmgr_create(struct pb_manager *provider,
+                     struct pipe_winsys *winsys);
+
+
+#endif /*PB_BUFMGR_H_*/
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
new file mode 100644
index 00000000000..c535d3276c9
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_fenced.c
@@ -0,0 +1,131 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+
+/**
+ * \file
+ * A buffer manager that wraps buffers in fenced buffers.
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+
+#include "p_debug.h"
+#include "p_util.h"
+
+#include "pb_buffer.h"
+#include "pb_buffer_fenced.h"
+#include "pb_bufmgr.h"
+
+
+struct fenced_pb_manager
+{
+   struct pb_manager base;
+
+   struct pb_manager *provider;
+   
+   struct fenced_buffer_list *fenced_list;
+};
+
+
+static INLINE struct fenced_pb_manager *
+fenced_pb_manager(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct fenced_pb_manager *)mgr;
+}
+
+
+static struct pb_buffer *
+fenced_bufmgr_create_buffer(struct pb_manager *mgr, 
+                            size_t size,
+                            const struct pb_desc *desc)
+{
+   struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
+   struct pb_buffer *buf;
+   struct pb_buffer *fenced_buf;
+
+   /* check for free buffers before allocating new ones */
+   fenced_buffer_list_check_free(fenced_mgr->fenced_list, 0);
+   
+   buf = fenced_mgr->provider->create_buffer(fenced_mgr->provider, size, desc);
+   if(!buf) {
+      /* try harder to get a buffer */
+      fenced_buffer_list_check_free(fenced_mgr->fenced_list, 1);
+      
+      buf = fenced_mgr->provider->create_buffer(fenced_mgr->provider, size, desc);
+      if(!buf) {
+         /* give up */
+         return NULL;
+      }
+   }
+   
+   fenced_buf = fenced_buffer_create(fenced_mgr->fenced_list, buf);
+   if(!fenced_buf) {
+      assert(buf->base.refcount == 1);
+      pb_destroy(buf);
+   }
+   
+   return fenced_buf;
+}
+
+
+static void
+fenced_bufmgr_destroy(struct pb_manager *mgr)
+{
+   struct fenced_pb_manager *fenced_mgr = fenced_pb_manager(mgr);
+
+   fenced_buffer_list_destroy(fenced_mgr->fenced_list);
+
+   fenced_mgr->provider->destroy(fenced_mgr->provider);
+   
+   FREE(fenced_mgr);
+}
+
+
+struct pb_manager *
+fenced_bufmgr_create(struct pb_manager *provider, 
+                     struct pipe_winsys *winsys) 
+{
+   struct fenced_pb_manager *fenced_mgr;
+
+   fenced_mgr = (struct fenced_pb_manager *)CALLOC(1, sizeof(*fenced_mgr));
+   if (!fenced_mgr)
+      return NULL;
+
+   fenced_mgr->base.destroy = fenced_bufmgr_destroy;
+   fenced_mgr->base.create_buffer = fenced_bufmgr_create_buffer;
+
+   fenced_mgr->provider = provider;
+   fenced_mgr->fenced_list = fenced_buffer_list_create(winsys);
+   if(!fenced_mgr->fenced_list) {
+      FREE(fenced_mgr);
+      return NULL;
+   }
+      
+   return &fenced_mgr->base;
+}
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
new file mode 100644
index 00000000000..8b1b51c0e28
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_mm.c
@@ -0,0 +1,593 @@
+/**************************************************************************
+ *
+ * Copyright 2006 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * Copyright 1999 Wittawat Yamwong
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Buffer manager using the old texture memory manager.
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+
+#include "linked_list.h"
+
+#include "p_defines.h"
+#include "p_debug.h"
+#include "p_thread.h"
+#include "p_util.h"
+#include "pb_buffer.h"
+#include "pb_bufmgr.h"
+
+
+/**
+ * Convenience macro (type safe).
+ */
+#define SUPER(__derived) (&(__derived)->base)
+
+
+struct mem_block 
+{
+   struct mem_block *next, *prev;
+   struct mem_block *next_free, *prev_free;
+   struct mem_block *heap;
+   int ofs, size;
+   unsigned int free:1;
+   unsigned int reserved:1;
+};
+
+
+#ifdef DEBUG
+/**
+ * For debugging purposes.
+ */
+static void
+mmDumpMemInfo(const struct mem_block *heap)
+{
+   debug_printf("Memory heap %p:\n", (void *)heap);
+   if (heap == 0) {
+      debug_printf("  heap == 0\n");
+   } else {
+      const struct mem_block *p;
+
+      for(p = heap->next; p != heap; p = p->next) {
+	 debug_printf("  Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+		 p->free ? 'F':'.',
+		 p->reserved ? 'R':'.');
+      }
+
+      debug_printf("\nFree list:\n");
+
+      for(p = heap->next_free; p != heap; p = p->next_free) {
+	 debug_printf(" FREE Offset:%08x, Size:%08x, %c%c\n",p->ofs,p->size,
+		 p->free ? 'F':'.',
+		 p->reserved ? 'R':'.');
+      }
+
+   }
+   debug_printf("End of memory blocks\n");
+}
+#endif
+
+
+/** 
+ * input: total size in bytes
+ * return: a heap pointer if OK, NULL if error
+ */
+static struct mem_block *
+mmInit(int ofs, int size)
+{
+   struct mem_block *heap, *block;
+  
+   if (size <= 0) 
+      return NULL;
+
+   heap = CALLOC_STRUCT(mem_block);
+   if (!heap) 
+      return NULL;
+   
+   block = CALLOC_STRUCT(mem_block);
+   if (!block) {
+      FREE(heap);
+      return NULL;
+   }
+
+   heap->next = block;
+   heap->prev = block;
+   heap->next_free = block;
+   heap->prev_free = block;
+
+   block->heap = heap;
+   block->next = heap;
+   block->prev = heap;
+   block->next_free = heap;
+   block->prev_free = heap;
+
+   block->ofs = ofs;
+   block->size = size;
+   block->free = 1;
+
+   return heap;
+}
+
+
+static struct mem_block *
+SliceBlock(struct mem_block *p, 
+           int startofs, int size, 
+           int reserved, int alignment)
+{
+   struct mem_block *newblock;
+
+   /* break left  [p, newblock, p->next], then p = newblock */
+   if (startofs > p->ofs) {
+      newblock = CALLOC_STRUCT(mem_block);
+      if (!newblock)
+	 return NULL;
+      newblock->ofs = startofs;
+      newblock->size = p->size - (startofs - p->ofs);
+      newblock->free = 1;
+      newblock->heap = p->heap;
+
+      newblock->next = p->next;
+      newblock->prev = p;
+      p->next->prev = newblock;
+      p->next = newblock;
+
+      newblock->next_free = p->next_free;
+      newblock->prev_free = p;
+      p->next_free->prev_free = newblock;
+      p->next_free = newblock;
+
+      p->size -= newblock->size;
+      p = newblock;
+   }
+
+   /* break right, also [p, newblock, p->next] */
+   if (size < p->size) {
+      newblock = CALLOC_STRUCT(mem_block);
+      if (!newblock)
+	 return NULL;
+      newblock->ofs = startofs + size;
+      newblock->size = p->size - size;
+      newblock->free = 1;
+      newblock->heap = p->heap;
+
+      newblock->next = p->next;
+      newblock->prev = p;
+      p->next->prev = newblock;
+      p->next = newblock;
+
+      newblock->next_free = p->next_free;
+      newblock->prev_free = p;
+      p->next_free->prev_free = newblock;
+      p->next_free = newblock;
+	 
+      p->size = size;
+   }
+
+   /* p = middle block */
+   p->free = 0;
+
+   /* Remove p from the free list: 
+    */
+   p->next_free->prev_free = p->prev_free;
+   p->prev_free->next_free = p->next_free;
+
+   p->next_free = 0;
+   p->prev_free = 0;
+
+   p->reserved = reserved;
+   return p;
+}
+
+
+/**
+ * Allocate 'size' bytes with 2^align2 bytes alignment,
+ * restrict the search to free memory after 'startSearch'
+ * depth and back buffers should be in different 4mb banks
+ * to get better page hits if possible
+ * input:	size = size of block
+ *       	align2 = 2^align2 bytes alignment
+ *		startSearch = linear offset from start of heap to begin search
+ * return: pointer to the allocated block, 0 if error
+ */
+static struct mem_block *
+mmAllocMem(struct mem_block *heap, int size, int align2, int startSearch)
+{
+   struct mem_block *p;
+   const int mask = (1 << align2)-1;
+   int startofs = 0;
+   int endofs;
+
+   if (!heap || align2 < 0 || size <= 0)
+      return NULL;
+
+   for (p = heap->next_free; p != heap; p = p->next_free) {
+      assert(p->free);
+
+      startofs = (p->ofs + mask) & ~mask;
+      if ( startofs < startSearch ) {
+	 startofs = startSearch;
+      }
+      endofs = startofs+size;
+      if (endofs <= (p->ofs+p->size))
+	 break;
+   }
+
+   if (p == heap) 
+      return NULL;
+
+   assert(p->free);
+   p = SliceBlock(p,startofs,size,0,mask+1);
+
+   return p;
+}
+
+
+#if 0
+/**
+ * Free block starts at offset
+ * input: pointer to a heap, start offset
+ * return: pointer to a block
+ */
+static struct mem_block *
+mmFindBlock(struct mem_block *heap, int start)
+{
+   struct mem_block *p;
+
+   for (p = heap->next; p != heap; p = p->next) {
+      if (p->ofs == start) 
+	 return p;
+   }
+
+   return NULL;
+}
+#endif
+
+
+static INLINE int
+Join2Blocks(struct mem_block *p)
+{
+   /* XXX there should be some assertions here */
+
+   /* NOTE: heap->free == 0 */
+
+   if (p->free && p->next->free) {
+      struct mem_block *q = p->next;
+
+      assert(p->ofs + p->size == q->ofs);
+      p->size += q->size;
+
+      p->next = q->next;
+      q->next->prev = p;
+
+      q->next_free->prev_free = q->prev_free; 
+      q->prev_free->next_free = q->next_free;
+     
+      FREE(q);
+      return 1;
+   }
+   return 0;
+}
+
+
+/**
+ * Free block starts at offset
+ * input: pointer to a block
+ * return: 0 if OK, -1 if error
+ */
+static int
+mmFreeMem(struct mem_block *b)
+{
+   if (!b)
+      return 0;
+
+   if (b->free) {
+      debug_printf("block already free\n");
+      return -1;
+   }
+   if (b->reserved) {
+      debug_printf("block is reserved\n");
+      return -1;
+   }
+
+   b->free = 1;
+   b->next_free = b->heap->next_free;
+   b->prev_free = b->heap;
+   b->next_free->prev_free = b;
+   b->prev_free->next_free = b;
+
+   Join2Blocks(b);
+   if (b->prev != b->heap)
+      Join2Blocks(b->prev);
+
+   return 0;
+}
+
+
+/**
+ * destroy MM
+ */
+static void
+mmDestroy(struct mem_block *heap)
+{
+   struct mem_block *p;
+
+   if (!heap)
+      return;
+
+   for (p = heap->next; p != heap; ) {
+      struct mem_block *next = p->next;
+      FREE(p);
+      p = next;
+   }
+
+   FREE(heap);
+}
+
+
+struct mm_pb_manager
+{
+   struct pb_manager base;
+   
+   _glthread_Mutex mutex;
+   
+   size_t size;
+   struct mem_block *heap;
+   
+   size_t align2;
+   
+   struct pb_buffer *buffer;
+   void *map;
+};
+
+
+static INLINE struct mm_pb_manager *
+mm_pb_manager(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct mm_pb_manager *)mgr;
+}
+
+
+struct mm_buffer
+{
+   struct pb_buffer base;
+   
+   struct mm_pb_manager *mgr;
+   
+   struct mem_block *block;
+};
+
+
+static INLINE struct mm_buffer *
+mm_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   return (struct mm_buffer *)buf;
+}
+
+
+static void
+mm_buffer_destroy(struct pb_buffer *buf)
+{
+   struct mm_buffer *mm_buf = mm_buffer(buf);
+   struct mm_pb_manager *mm = mm_buf->mgr;
+   
+   assert(buf->base.refcount == 0);
+   
+   _glthread_LOCK_MUTEX(mm->mutex);
+   mmFreeMem(mm_buf->block);
+   FREE(buf);
+   _glthread_UNLOCK_MUTEX(mm->mutex);
+}
+
+
+static void *
+mm_buffer_map(struct pb_buffer *buf,
+              unsigned flags)
+{
+   struct mm_buffer *mm_buf = mm_buffer(buf);
+   struct mm_pb_manager *mm = mm_buf->mgr;
+
+   return (unsigned char *) mm->map + mm_buf->block->ofs;
+}
+
+
+static void
+mm_buffer_unmap(struct pb_buffer *buf)
+{
+   /* No-op */
+}
+
+
+static void
+mm_buffer_get_base_buffer(struct pb_buffer *buf,
+                          struct pb_buffer **base_buf,
+                          unsigned *offset)
+{
+   struct mm_buffer *mm_buf = mm_buffer(buf);
+   struct mm_pb_manager *mm = mm_buf->mgr;
+   pb_get_base_buffer(mm->buffer, base_buf, offset);
+   *offset += mm_buf->block->ofs;
+}
+
+
+static const struct pb_vtbl 
+mm_buffer_vtbl = {
+      mm_buffer_destroy,
+      mm_buffer_map,
+      mm_buffer_unmap,
+      mm_buffer_get_base_buffer
+};
+
+
+static struct pb_buffer *
+mm_bufmgr_create_buffer(struct pb_manager *mgr, 
+                        size_t size,
+                        const struct pb_desc *desc)
+{
+   struct mm_pb_manager *mm = mm_pb_manager(mgr);
+   struct mm_buffer *mm_buf;
+
+   /* We don't handle alignments larger then the one initially setup */
+   assert(desc->alignment % (1 << mm->align2) == 0);
+   if(desc->alignment % (1 << mm->align2))
+      return NULL;
+   
+   _glthread_LOCK_MUTEX(mm->mutex);
+
+   mm_buf = CALLOC_STRUCT(mm_buffer);
+   if (!mm_buf) {
+      _glthread_UNLOCK_MUTEX(mm->mutex);
+      return NULL;
+   }
+
+   mm_buf->base.base.refcount = 1;
+   mm_buf->base.base.alignment = desc->alignment;
+   mm_buf->base.base.usage = desc->usage;
+   mm_buf->base.base.size = size;
+   
+   mm_buf->base.vtbl = &mm_buffer_vtbl;
+   
+   mm_buf->mgr = mm;
+   
+   mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+   if(!mm_buf->block) {
+      debug_printf("warning: heap full\n");
+#if 0
+      mmDumpMemInfo(mm->heap);
+#endif
+      
+      mm_buf->block = mmAllocMem(mm->heap, size, mm->align2, 0);
+      if(!mm_buf->block) {
+	 assert(0);
+         FREE(mm_buf);
+         _glthread_UNLOCK_MUTEX(mm->mutex);
+         return NULL;
+      }
+   }
+   
+   /* Some sanity checks */
+   assert(0 <= mm_buf->block->ofs && mm_buf->block->ofs < mm->size);
+   assert(size <= mm_buf->block->size && mm_buf->block->ofs + mm_buf->block->size <= mm->size);
+   
+   _glthread_UNLOCK_MUTEX(mm->mutex);
+   return SUPER(mm_buf);
+}
+
+
+static void
+mm_bufmgr_destroy(struct pb_manager *mgr)
+{
+   struct mm_pb_manager *mm = mm_pb_manager(mgr);
+   
+   _glthread_LOCK_MUTEX(mm->mutex);
+
+   mmDestroy(mm->heap);
+   
+   pb_unmap(mm->buffer);
+   pb_reference(&mm->buffer, NULL);
+   
+   _glthread_UNLOCK_MUTEX(mm->mutex);
+   
+   FREE(mgr);
+}
+
+
+struct pb_manager *
+mm_bufmgr_create_from_buffer(struct pb_buffer *buffer, 
+                             size_t size, size_t align2) 
+{
+   struct mm_pb_manager *mm;
+
+   if(!buffer)
+      return NULL;
+   
+   mm = CALLOC_STRUCT(mm_pb_manager);
+   if (!mm)
+      return NULL;
+
+   mm->base.create_buffer = mm_bufmgr_create_buffer;
+   mm->base.destroy = mm_bufmgr_destroy;
+
+   mm->size = size;
+   mm->align2 = align2; /* 64-byte alignment */
+
+   _glthread_INIT_MUTEX(mm->mutex);
+
+   mm->buffer = buffer; 
+
+   mm->map = pb_map(mm->buffer, 
+		    PIPE_BUFFER_USAGE_CPU_READ |
+		    PIPE_BUFFER_USAGE_CPU_WRITE);
+   if(!mm->map)
+      goto failure;
+
+   mm->heap = mmInit(0, size); 
+   if (!mm->heap)
+      goto failure;
+
+   return SUPER(mm);
+   
+failure:
+if(mm->heap)
+   mmDestroy(mm->heap);
+   if(mm->map)
+      pb_unmap(mm->buffer);
+   if(mm)
+      FREE(mm);
+   return NULL;
+}
+
+
+struct pb_manager *
+mm_bufmgr_create(struct pb_manager *provider, 
+                 size_t size, size_t align2) 
+{
+   struct pb_buffer *buffer;
+   struct pb_manager *mgr;
+   struct pb_desc desc;
+
+   assert(provider);
+   assert(provider->create_buffer);
+   
+   memset(&desc, 0, sizeof(desc));
+   desc.alignment = 1 << align2;
+   
+   buffer = provider->create_buffer(provider, size, &desc); 
+   if (!buffer)
+      return NULL;
+   
+   mgr = mm_bufmgr_create_from_buffer(buffer, size, align2);
+   if (!mgr) {
+      pb_reference(&buffer, NULL);
+      return NULL;
+   }
+
+  return mgr;
+}
diff --git a/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
new file mode 100644
index 00000000000..04477a865a5
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_bufmgr_pool.c
@@ -0,0 +1,288 @@
+/**************************************************************************
+ * 
+ * Copyright 2006 Tungsten Graphics, Inc., Bismarck, ND., USA
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
+ * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR 
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE 
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * 
+ **************************************************************************/
+
+/**
+ * \file
+ * Batch buffer pool management.
+ * 
+ * \author Jos� Fonseca <jrfonseca-at-tungstengraphics-dot-com>
+ * \author Thomas Hellstr�m <thomas-at-tungstengraphics-dot-com>
+ */
+
+
+#include "linked_list.h"
+
+#include "p_compiler.h"
+#include "p_debug.h"
+#include "p_thread.h"
+#include "p_defines.h"
+#include "p_util.h"
+
+#include "pb_buffer.h"
+#include "pb_bufmgr.h"
+
+
+/**
+ * Convenience macro (type safe).
+ */
+#define SUPER(__derived) (&(__derived)->base)
+
+
+struct pool_pb_manager
+{
+   struct pb_manager base;
+   
+   _glthread_Mutex mutex;
+   
+   size_t bufSize;
+   size_t bufAlign;
+   
+   size_t numFree;
+   size_t numTot;
+   
+   struct list_head free;
+   
+   struct pb_buffer *buffer;
+   void *map;
+   
+   struct pool_buffer *bufs;
+};
+
+
+static INLINE struct pool_pb_manager *
+pool_pb_manager(struct pb_manager *mgr)
+{
+   assert(mgr);
+   return (struct pool_pb_manager *)mgr;
+}
+
+
+struct pool_buffer
+{
+   struct pb_buffer base;
+   
+   struct pool_pb_manager *mgr;
+   
+   struct list_head head;
+   
+   size_t start;
+};
+
+
+static INLINE struct pool_buffer *
+pool_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   return (struct pool_buffer *)buf;
+}
+
+
+
+static void
+pool_buffer_destroy(struct pb_buffer *buf)
+{
+   struct pool_buffer *pool_buf = pool_buffer(buf);
+   struct pool_pb_manager *pool = pool_buf->mgr;
+   
+   assert(pool_buf->base.base.refcount == 0);
+
+   _glthread_LOCK_MUTEX(pool->mutex);
+   LIST_ADD(&pool_buf->head, &pool->free);
+   pool->numFree++;
+   _glthread_UNLOCK_MUTEX(pool->mutex);
+}
+
+
+static void *
+pool_buffer_map(struct pb_buffer *buf, unsigned flags)
+{
+   struct pool_buffer *pool_buf = pool_buffer(buf);
+   struct pool_pb_manager *pool = pool_buf->mgr;
+   void *map;
+
+   _glthread_LOCK_MUTEX(pool->mutex);
+   map = (unsigned char *) pool->map + pool_buf->start;
+   _glthread_UNLOCK_MUTEX(pool->mutex);
+   return map;
+}
+
+
+static void
+pool_buffer_unmap(struct pb_buffer *buf)
+{
+   /* No-op */
+}
+
+
+static void
+pool_buffer_get_base_buffer(struct pb_buffer *buf,
+                            struct pb_buffer **base_buf,
+                            unsigned *offset)
+{
+   struct pool_buffer *pool_buf = pool_buffer(buf);
+   struct pool_pb_manager *pool = pool_buf->mgr;
+   pb_get_base_buffer(pool->buffer, base_buf, offset);
+   *offset += pool_buf->start;
+}
+
+
+static const struct pb_vtbl 
+pool_buffer_vtbl = {
+      pool_buffer_destroy,
+      pool_buffer_map,
+      pool_buffer_unmap,
+      pool_buffer_get_base_buffer
+};
+
+
+static struct pb_buffer *
+pool_bufmgr_create_buffer(struct pb_manager *mgr,
+                          size_t size,
+                          const struct pb_desc *desc)
+{
+   struct pool_pb_manager *pool = pool_pb_manager(mgr);
+   struct pool_buffer *pool_buf;
+   struct list_head *item;
+
+   assert(size == pool->bufSize);
+   assert(pool->bufAlign % desc->alignment == 0);
+   
+   _glthread_LOCK_MUTEX(pool->mutex);
+
+   if (pool->numFree == 0) {
+      _glthread_UNLOCK_MUTEX(pool->mutex);
+      debug_printf("warning: out of fixed size buffer objects\n");
+      return NULL;
+   }
+
+   item = pool->free.next;
+
+   if (item == &pool->free) {
+      _glthread_UNLOCK_MUTEX(pool->mutex);
+      debug_printf("error: fixed size buffer pool corruption\n");
+      return NULL;
+   }
+
+   LIST_DEL(item);
+   --pool->numFree;
+
+   _glthread_UNLOCK_MUTEX(pool->mutex);
+   
+   pool_buf = LIST_ENTRY(struct pool_buffer, item, head);
+   assert(pool_buf->base.base.refcount == 0);
+   pool_buf->base.base.refcount = 1;
+   pool_buf->base.base.alignment = desc->alignment;
+   pool_buf->base.base.usage = desc->usage;
+   
+   return SUPER(pool_buf);
+}
+
+
+static void
+pool_bufmgr_destroy(struct pb_manager *mgr)
+{
+   struct pool_pb_manager *pool = pool_pb_manager(mgr);
+   _glthread_LOCK_MUTEX(pool->mutex);
+
+   FREE(pool->bufs);
+   
+   pb_unmap(pool->buffer);
+   pb_reference(&pool->buffer, NULL);
+   
+   _glthread_UNLOCK_MUTEX(pool->mutex);
+   
+   FREE(mgr);
+}
+
+
+struct pb_manager *
+pool_bufmgr_create(struct pb_manager *provider, 
+                   size_t numBufs, 
+                   size_t bufSize,
+                   const struct pb_desc *desc) 
+{
+   struct pool_pb_manager *pool;
+   struct pool_buffer *pool_buf;
+   size_t i;
+
+   pool = (struct pool_pb_manager *)CALLOC(1, sizeof(*pool));
+   if (!pool)
+      return NULL;
+
+   pool->base.destroy = pool_bufmgr_destroy;
+   pool->base.create_buffer = pool_bufmgr_create_buffer;
+
+   LIST_INITHEAD(&pool->free);
+
+   pool->numTot = numBufs;
+   pool->numFree = numBufs;
+   pool->bufSize = bufSize;
+   pool->bufAlign = desc->alignment; 
+   
+   _glthread_INIT_MUTEX(pool->mutex);
+
+   pool->buffer = provider->create_buffer(provider, numBufs*bufSize, desc); 
+   if (!pool->buffer)
+      goto failure;
+
+   pool->map = pb_map(pool->buffer,
+                          PIPE_BUFFER_USAGE_CPU_READ |
+                          PIPE_BUFFER_USAGE_CPU_WRITE);
+   if(!pool->map)
+      goto failure;
+
+   pool->bufs = (struct pool_buffer *)CALLOC(numBufs, sizeof(*pool->bufs));
+   if (!pool->bufs)
+      goto failure;
+
+   pool_buf = pool->bufs;
+   for (i = 0; i < numBufs; ++i) {
+      pool_buf->base.base.refcount = 0;
+      pool_buf->base.base.alignment = 0;
+      pool_buf->base.base.usage = 0;
+      pool_buf->base.base.size = bufSize;
+      pool_buf->base.vtbl = &pool_buffer_vtbl;
+      pool_buf->mgr = pool;
+      pool_buf->start = i * bufSize;
+      LIST_ADDTAIL(&pool_buf->head, &pool->free);
+      pool_buf++;
+   }
+
+   return SUPER(pool);
+   
+failure:
+   if(pool->bufs)
+      FREE(pool->bufs);
+   if(pool->map)
+      pb_unmap(pool->buffer);
+   if(pool->buffer)
+      pb_reference(&pool->buffer, NULL);
+   if(pool)
+      FREE(pool);
+   return NULL;
+}
diff --git a/src/gallium/auxiliary/pipebuffer/pb_winsys.c b/src/gallium/auxiliary/pipebuffer/pb_winsys.c
new file mode 100644
index 00000000000..978944091fd
--- /dev/null
+++ b/src/gallium/auxiliary/pipebuffer/pb_winsys.c
@@ -0,0 +1,170 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/**
+ * \file
+ * Implementation of client buffer (also designated as "user buffers"), which
+ * are just state-tracker owned data masqueraded as buffers.
+ * 
+ * \author Jos� Fonseca <[email protected]>
+ */
+
+
+#include "pipe/p_winsys.h"
+#include "pipe/p_util.h"
+
+#include "pb_buffer.h"
+
+
+/**
+ * User buffers are special buffers that initially reference memory
+ * held by the user but which may if necessary copy that memory into
+ * device memory behind the scenes, for submission to hardware.
+ *
+ * These are particularly useful when the referenced data is never
+ * submitted to hardware at all, in the particular case of software
+ * vertex processing.
+ */
+struct pb_user_buffer 
+{
+   struct pb_buffer base;
+   void *data;
+};
+
+
+extern const struct pb_vtbl pb_user_buffer_vtbl;
+
+
+static INLINE struct pb_user_buffer *
+pb_user_buffer(struct pb_buffer *buf)
+{
+   assert(buf);
+   assert(buf->vtbl == &pb_user_buffer_vtbl);
+   return (struct pb_user_buffer *)buf;
+}
+
+
+static void
+pb_user_buffer_destroy(struct pb_buffer *buf)
+{
+   assert(buf);
+   FREE(buf);
+}
+
+
+static void *
+pb_user_buffer_map(struct pb_buffer *buf, 
+                   unsigned flags)
+{
+   return pb_user_buffer(buf)->data;
+}
+
+
+static void
+pb_user_buffer_unmap(struct pb_buffer *buf)
+{
+   /* No-op */
+}
+
+
+static void
+pb_user_buffer_get_base_buffer(struct pb_buffer *buf,
+                               struct pb_buffer **base_buf,
+                               unsigned *offset)
+{
+   *base_buf = buf;
+   *offset = 0;
+}
+
+
+const struct pb_vtbl 
+pb_user_buffer_vtbl = {
+      pb_user_buffer_destroy,
+      pb_user_buffer_map,
+      pb_user_buffer_unmap,
+      pb_user_buffer_get_base_buffer
+};
+
+
+static struct pipe_buffer *
+pb_winsys_user_buffer_create(struct pipe_winsys *winsys,
+                             void *data, 
+                             unsigned bytes) 
+{
+   struct pb_user_buffer *buf = CALLOC_STRUCT(pb_user_buffer);
+
+   if(!buf)
+      return NULL;
+   
+   buf->base.base.refcount = 1;
+   buf->base.base.size = bytes;
+   buf->base.base.alignment = 0;
+   buf->base.base.usage = 0;
+
+   buf->base.vtbl = &pb_user_buffer_vtbl;   
+   buf->data = data;
+   
+   return &buf->base.base;
+}
+
+
+static void *
+pb_winsys_buffer_map(struct pipe_winsys *winsys,
+                     struct pipe_buffer *buf,
+                     unsigned flags)
+{
+   (void)winsys;
+   return pb_map(pb_buffer(buf), flags);
+}
+
+
+static void
+pb_winsys_buffer_unmap(struct pipe_winsys *winsys,
+                       struct pipe_buffer *buf)
+{
+   (void)winsys;
+   pb_unmap(pb_buffer(buf));
+}
+
+
+static void
+pb_winsys_buffer_destroy(struct pipe_winsys *winsys,
+                         struct pipe_buffer *buf)
+{
+   (void)winsys;
+   pb_destroy(pb_buffer(buf));
+}
+
+
+void 
+pb_init_winsys(struct pipe_winsys *winsys)
+{
+   winsys->user_buffer_create = pb_winsys_user_buffer_create;
+   winsys->buffer_map = pb_winsys_buffer_map;
+   winsys->buffer_unmap = pb_winsys_buffer_unmap;
+   winsys->buffer_destroy = pb_winsys_buffer_destroy;
+}
diff --git a/src/gallium/auxiliary/tgsi/Makefile b/src/gallium/auxiliary/tgsi/Makefile
new file mode 100644
index 00000000000..12a8bd0409e
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/Makefile
@@ -0,0 +1,3 @@
+default:
+	cd ../.. ; make
+
diff --git a/src/gallium/auxiliary/tgsi/exec/Makefile b/src/gallium/auxiliary/tgsi/exec/Makefile
new file mode 100644
index 00000000000..eb8b14e0e89
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/exec/Makefile
@@ -0,0 +1,3 @@
+default:
+	cd ../../.. ; make
+
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
new file mode 100644
index 00000000000..d7b18dc9c59
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.c
@@ -0,0 +1,2476 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI interpretor/executor.
+ *
+ * Flow control information:
+ *
+ * Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
+ * flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
+ * care since a condition may be true for some quad components but false
+ * for other components.
+ *
+ * We basically execute all statements (even if they're in the part of
+ * an IF/ELSE clause that's "not taken") and use a special mask to
+ * control writing to destination registers.  This is the ExecMask.
+ * See store_dest().
+ *
+ * The ExecMask is computed from three other masks (CondMask, LoopMask and
+ * ContMask) which are controlled by the flow control instructions (namely:
+ * (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
+ *
+ *
+ * Authors:
+ *   Michal Krol
+ *   Brian Paul
+ */
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_state.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
+#include "tgsi_exec.h"
+
+#define TILE_TOP_LEFT     0
+#define TILE_TOP_RIGHT    1
+#define TILE_BOTTOM_LEFT  2
+#define TILE_BOTTOM_RIGHT 3
+
+/*
+ * Shorthand locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TEMP_0_I           TGSI_EXEC_TEMP_00000000_I
+#define TEMP_0_C           TGSI_EXEC_TEMP_00000000_C
+#define TEMP_7F_I          TGSI_EXEC_TEMP_7FFFFFFF_I
+#define TEMP_7F_C          TGSI_EXEC_TEMP_7FFFFFFF_C
+#define TEMP_80_I          TGSI_EXEC_TEMP_80000000_I
+#define TEMP_80_C          TGSI_EXEC_TEMP_80000000_C
+#define TEMP_FF_I          TGSI_EXEC_TEMP_FFFFFFFF_I
+#define TEMP_FF_C          TGSI_EXEC_TEMP_FFFFFFFF_C
+#define TEMP_1_I           TGSI_EXEC_TEMP_ONE_I
+#define TEMP_1_C           TGSI_EXEC_TEMP_ONE_C
+#define TEMP_2_I           TGSI_EXEC_TEMP_TWO_I
+#define TEMP_2_C           TGSI_EXEC_TEMP_TWO_C
+#define TEMP_128_I         TGSI_EXEC_TEMP_128_I
+#define TEMP_128_C         TGSI_EXEC_TEMP_128_C
+#define TEMP_M128_I        TGSI_EXEC_TEMP_MINUS_128_I
+#define TEMP_M128_C        TGSI_EXEC_TEMP_MINUS_128_C
+#define TEMP_KILMASK_I     TGSI_EXEC_TEMP_KILMASK_I
+#define TEMP_KILMASK_C     TGSI_EXEC_TEMP_KILMASK_C
+#define TEMP_OUTPUT_I      TGSI_EXEC_TEMP_OUTPUT_I
+#define TEMP_OUTPUT_C      TGSI_EXEC_TEMP_OUTPUT_C
+#define TEMP_PRIMITIVE_I   TGSI_EXEC_TEMP_PRIMITIVE_I
+#define TEMP_PRIMITIVE_C   TGSI_EXEC_TEMP_PRIMITIVE_C
+#define TEMP_R0            TGSI_EXEC_TEMP_R0
+
+#define FOR_EACH_CHANNEL(CHAN)\
+   for (CHAN = 0; CHAN < 4; CHAN++)
+
+#define IS_CHANNEL_ENABLED(INST, CHAN)\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IS_CHANNEL_ENABLED2(INST, CHAN)\
+   ((INST).FullDstRegisters[1].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
+   FOR_EACH_CHANNEL( CHAN )\
+      if (IS_CHANNEL_ENABLED2( INST, CHAN ))
+
+
+/** The execution mask depends on the conditional mask and the loop mask */
+#define UPDATE_EXEC_MASK(MACH) \
+      MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
+
+
+#define CHAN_X  0
+#define CHAN_Y  1
+#define CHAN_Z  2
+#define CHAN_W  3
+
+
+
+/**
+ * Initialize machine state by expanding tokens to full instructions,
+ * allocating temporary storage, setting up constants, etc.
+ * After this, we can call tgsi_exec_machine_run() many times.
+ */
+void 
+tgsi_exec_machine_bind_shader(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_token *tokens,
+   uint numSamplers,
+   struct tgsi_sampler *samplers)
+{
+   uint k;
+   struct tgsi_parse_context parse;
+   struct tgsi_exec_labels *labels = &mach->Labels;
+   struct tgsi_full_instruction *instructions;
+   struct tgsi_full_declaration *declarations;
+   uint maxInstructions = 10, numInstructions = 0;
+   uint maxDeclarations = 10, numDeclarations = 0;
+   uint instno = 0;
+
+#if 0
+   tgsi_dump(tokens, 0);
+#endif
+
+   mach->Tokens = tokens;
+   mach->Samplers = samplers;
+
+   k = tgsi_parse_init (&parse, mach->Tokens);
+   if (k != TGSI_PARSE_OK) {
+      debug_printf( "Problem parsing!\n" );
+      return;
+   }
+
+   mach->Processor = parse.FullHeader.Processor.Processor;
+   mach->ImmLimit = 0;
+   labels->count = 0;
+
+   declarations = (struct tgsi_full_declaration *)
+      MALLOC( maxDeclarations * sizeof(struct tgsi_full_declaration) );
+
+   instructions = (struct tgsi_full_instruction *)
+      MALLOC( maxInstructions * sizeof(struct tgsi_full_instruction) );
+
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      uint pointer = parse.Position;
+      uint i;
+
+      tgsi_parse_token( &parse );
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         /* save expanded declaration */
+         if (numDeclarations == maxDeclarations) {
+            declarations = REALLOC(declarations,
+                                   maxDeclarations
+                                   * sizeof(struct tgsi_full_declaration),
+                                   (maxDeclarations + 10)
+                                   * sizeof(struct tgsi_full_declaration));
+            maxDeclarations += 10;
+         }
+         memcpy(declarations + numDeclarations,
+                &parse.FullToken.FullDeclaration,
+                sizeof(declarations[0]));
+         numDeclarations++;
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         {
+            uint size = parse.FullToken.FullImmediate.Immediate.Size - 1;
+            assert( size % 4 == 0 );
+            assert( mach->ImmLimit + size / 4 <= TGSI_EXEC_NUM_IMMEDIATES );
+
+            for( i = 0; i < size; i++ ) {
+               mach->Imms[mach->ImmLimit + i / 4][i % 4] = 
+		  parse.FullToken.FullImmediate.u.ImmediateFloat32[i].Float;
+            }
+            mach->ImmLimit += size / 4;
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         assert( labels->count < 128 );
+
+         labels->labels[labels->count][0] = instno;
+         labels->labels[labels->count][1] = pointer;
+         labels->count++;
+
+         /* save expanded instruction */
+         if (numInstructions == maxInstructions) {
+            instructions = REALLOC(instructions,
+                                   maxInstructions
+                                   * sizeof(struct tgsi_full_instruction),
+                                   (maxInstructions + 10)
+                                   * sizeof(struct tgsi_full_instruction));
+            maxInstructions += 10;
+         }
+         memcpy(instructions + numInstructions,
+                &parse.FullToken.FullInstruction,
+                sizeof(instructions[0]));
+         numInstructions++;
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+   tgsi_parse_free (&parse);
+
+   if (mach->Declarations) {
+      FREE( mach->Declarations );
+   }
+   mach->Declarations = declarations;
+   mach->NumDeclarations = numDeclarations;
+
+   if (mach->Instructions) {
+      FREE( mach->Instructions );
+   }
+   mach->Instructions = instructions;
+   mach->NumInstructions = numInstructions;
+}
+
+
+void
+tgsi_exec_machine_init(
+   struct tgsi_exec_machine *mach )
+{
+   uint i;
+
+   mach->Temps = (struct tgsi_exec_vector *) tgsi_align_128bit( mach->_Temps);
+   mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
+
+   /* Setup constants. */
+   for( i = 0; i < 4; i++ ) {
+      mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].u[i] = 0x00000000;
+      mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].u[i] = 0x7FFFFFFF;
+      mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].u[i] = 0x80000000;
+      mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].u[i] = 0xFFFFFFFF;
+      mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].f[i] = 1.0f;
+      mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].f[i] = 2.0f;
+      mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].f[i] = 128.0f;
+      mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].f[i] = -128.0f;
+   }
+}
+
+
+void
+tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach)
+{
+   if (mach->Instructions) {
+      FREE(mach->Instructions);
+      mach->Instructions = NULL;
+      mach->NumInstructions = 0;
+   }
+   if (mach->Declarations) {
+      FREE(mach->Declarations);
+      mach->Declarations = NULL;
+      mach->NumDeclarations = 0;
+   }
+}
+
+
+static void
+micro_abs(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) fabs( (double) src->f[0] );
+   dst->f[1] = (float) fabs( (double) src->f[1] );
+   dst->f[2] = (float) fabs( (double) src->f[2] );
+   dst->f[3] = (float) fabs( (double) src->f[3] );
+}
+
+static void
+micro_add(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] + src1->f[0];
+   dst->f[1] = src0->f[1] + src1->f[1];
+   dst->f[2] = src0->f[2] + src1->f[2];
+   dst->f[3] = src0->f[3] + src1->f[3];
+}
+
+static void
+micro_iadd(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] + src1->i[0];
+   dst->i[1] = src0->i[1] + src1->i[1];
+   dst->i[2] = src0->i[2] + src1->i[2];
+   dst->i[3] = src0->i[3] + src1->i[3];
+}
+
+static void
+micro_and(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] & src1->u[0];
+   dst->u[1] = src0->u[1] & src1->u[1];
+   dst->u[2] = src0->u[2] & src1->u[2];
+   dst->u[3] = src0->u[3] & src1->u[3];
+}
+
+static void
+micro_ceil(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) ceil( (double) src->f[0] );
+   dst->f[1] = (float) ceil( (double) src->f[1] );
+   dst->f[2] = (float) ceil( (double) src->f[2] );
+   dst->f[3] = (float) ceil( (double) src->f[3] );
+}
+
+static void
+micro_cos(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) cos( (double) src->f[0] );
+   dst->f[1] = (float) cos( (double) src->f[1] );
+   dst->f[2] = (float) cos( (double) src->f[2] );
+   dst->f[3] = (float) cos( (double) src->f[3] );
+}
+
+static void
+micro_ddx(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_BOTTOM_RIGHT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_ddy(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] =
+   dst->f[1] =
+   dst->f[2] =
+   dst->f[3] = src->f[TILE_TOP_LEFT] - src->f[TILE_BOTTOM_LEFT];
+}
+
+static void
+micro_div(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] / src1->f[0];
+   dst->f[1] = src0->f[1] / src1->f[1];
+   dst->f[2] = src0->f[2] / src1->f[2];
+   dst->f[3] = src0->f[3] / src1->f[3];
+}
+
+static void
+micro_udiv(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] / src1->u[0];
+   dst->u[1] = src0->u[1] / src1->u[1];
+   dst->u[2] = src0->u[2] / src1->u[2];
+   dst->u[3] = src0->u[3] / src1->u[3];
+}
+
+static void
+micro_eq(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] == src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] == src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] == src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] == src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ieq(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] == src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] == src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] == src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] == src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_exp2(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src)
+{
+   dst->f[0] = (float) pow( 2.0, (double) src->f[0] );
+   dst->f[1] = (float) pow( 2.0, (double) src->f[1] );
+   dst->f[2] = (float) pow( 2.0, (double) src->f[2] );
+   dst->f[3] = (float) pow( 2.0, (double) src->f[3] );
+}
+
+static void
+micro_f2it(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->i[0] = (int) src->f[0];
+   dst->i[1] = (int) src->f[1];
+   dst->i[2] = (int) src->f[2];
+   dst->i[3] = (int) src->f[3];
+}
+
+static void
+micro_f2ut(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->u[0] = (uint) src->f[0];
+   dst->u[1] = (uint) src->f[1];
+   dst->u[2] = (uint) src->f[2];
+   dst->u[3] = (uint) src->f[3];
+}
+
+static void
+micro_flr(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) floor( (double) src->f[0] );
+   dst->f[1] = (float) floor( (double) src->f[1] );
+   dst->f[2] = (float) floor( (double) src->f[2] );
+   dst->f[3] = (float) floor( (double) src->f[3] );
+}
+
+static void
+micro_frc(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = src->f[0] - (float) floor( (double) src->f[0] );
+   dst->f[1] = src->f[1] - (float) floor( (double) src->f[1] );
+   dst->f[2] = src->f[2] - (float) floor( (double) src->f[2] );
+   dst->f[3] = src->f[3] - (float) floor( (double) src->f[3] );
+}
+
+static void
+micro_ge(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] >= src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] >= src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] >= src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] >= src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_i2f(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) src->i[0];
+   dst->f[1] = (float) src->i[1];
+   dst->f[2] = (float) src->i[2];
+   dst->f[3] = (float) src->i[3];
+}
+
+static void
+micro_lg2(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) log( (double) src->f[0] ) * 1.442695f;
+   dst->f[1] = (float) log( (double) src->f[1] ) * 1.442695f;
+   dst->f[2] = (float) log( (double) src->f[2] ) * 1.442695f;
+   dst->f[3] = (float) log( (double) src->f[3] ) * 1.442695f;
+}
+
+static void
+micro_lt(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src2->f[0] : src3->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src2->f[1] : src3->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src2->f[2] : src3->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src2->f[3] : src3->f[3];
+}
+
+static void
+micro_ilt(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src2->i[0] : src3->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src2->i[1] : src3->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src2->i[2] : src3->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src2->i[3] : src3->i[3];
+}
+
+static void
+micro_ult(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2,
+   const union tgsi_exec_channel *src3 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src2->u[0] : src3->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src2->u[1] : src3->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src2->u[2] : src3->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src2->u[3] : src3->u[3];
+}
+
+static void
+micro_max(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] > src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] > src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] > src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] > src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imax(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] > src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] > src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] > src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] > src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umax(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] > src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] > src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] > src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] > src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_min(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] < src1->f[0] ? src0->f[0] : src1->f[0];
+   dst->f[1] = src0->f[1] < src1->f[1] ? src0->f[1] : src1->f[1];
+   dst->f[2] = src0->f[2] < src1->f[2] ? src0->f[2] : src1->f[2];
+   dst->f[3] = src0->f[3] < src1->f[3] ? src0->f[3] : src1->f[3];
+}
+
+static void
+micro_imin(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] < src1->i[0] ? src0->i[0] : src1->i[0];
+   dst->i[1] = src0->i[1] < src1->i[1] ? src0->i[1] : src1->i[1];
+   dst->i[2] = src0->i[2] < src1->i[2] ? src0->i[2] : src1->i[2];
+   dst->i[3] = src0->i[3] < src1->i[3] ? src0->i[3] : src1->i[3];
+}
+
+static void
+micro_umin(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] < src1->u[0] ? src0->u[0] : src1->u[0];
+   dst->u[1] = src0->u[1] < src1->u[1] ? src0->u[1] : src1->u[1];
+   dst->u[2] = src0->u[2] < src1->u[2] ? src0->u[2] : src1->u[2];
+   dst->u[3] = src0->u[3] < src1->u[3] ? src0->u[3] : src1->u[3];
+}
+
+static void
+micro_umod(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] % src1->u[0];
+   dst->u[1] = src0->u[1] % src1->u[1];
+   dst->u[2] = src0->u[2] % src1->u[2];
+   dst->u[3] = src0->u[3] % src1->u[3];
+}
+
+static void
+micro_mul(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] * src1->f[0];
+   dst->f[1] = src0->f[1] * src1->f[1];
+   dst->f[2] = src0->f[2] * src1->f[2];
+   dst->f[3] = src0->f[3] * src1->f[3];
+}
+
+static void
+micro_imul(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] * src1->i[0];
+   dst->i[1] = src0->i[1] * src1->i[1];
+   dst->i[2] = src0->i[2] * src1->i[2];
+   dst->i[3] = src0->i[3] * src1->i[3];
+}
+
+static void
+micro_imul64(
+   union tgsi_exec_channel *dst0,
+   union tgsi_exec_channel *dst1,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst1->i[0] = src0->i[0] * src1->i[0];
+   dst1->i[1] = src0->i[1] * src1->i[1];
+   dst1->i[2] = src0->i[2] * src1->i[2];
+   dst1->i[3] = src0->i[3] * src1->i[3];
+   dst0->i[0] = 0;
+   dst0->i[1] = 0;
+   dst0->i[2] = 0;
+   dst0->i[3] = 0;
+}
+
+static void
+micro_umul64(
+   union tgsi_exec_channel *dst0,
+   union tgsi_exec_channel *dst1,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst1->u[0] = src0->u[0] * src1->u[0];
+   dst1->u[1] = src0->u[1] * src1->u[1];
+   dst1->u[2] = src0->u[2] * src1->u[2];
+   dst1->u[3] = src0->u[3] * src1->u[3];
+   dst0->u[0] = 0;
+   dst0->u[1] = 0;
+   dst0->u[2] = 0;
+   dst0->u[3] = 0;
+}
+
+static void
+micro_movc(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1,
+   const union tgsi_exec_channel *src2 )
+{
+   dst->u[0] = src0->u[0] ? src1->u[0] : src2->u[0];
+   dst->u[1] = src0->u[1] ? src1->u[1] : src2->u[1];
+   dst->u[2] = src0->u[2] ? src1->u[2] : src2->u[2];
+   dst->u[3] = src0->u[3] ? src1->u[3] : src2->u[3];
+}
+
+static void
+micro_neg(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = -src->f[0];
+   dst->f[1] = -src->f[1];
+   dst->f[2] = -src->f[2];
+   dst->f[3] = -src->f[3];
+}
+
+static void
+micro_ineg(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->i[0] = -src->i[0];
+   dst->i[1] = -src->i[1];
+   dst->i[2] = -src->i[2];
+   dst->i[3] = -src->i[3];
+}
+
+static void
+micro_not(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->u[0] = ~src->u[0];
+   dst->u[1] = ~src->u[1];
+   dst->u[2] = ~src->u[2];
+   dst->u[3] = ~src->u[3];
+}
+
+static void
+micro_or(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] | src1->u[0];
+   dst->u[1] = src0->u[1] | src1->u[1];
+   dst->u[2] = src0->u[2] | src1->u[2];
+   dst->u[3] = src0->u[3] | src1->u[3];
+}
+
+static void
+micro_pow(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = (float) pow( (double) src0->f[0], (double) src1->f[0] );
+   dst->f[1] = (float) pow( (double) src0->f[1], (double) src1->f[1] );
+   dst->f[2] = (float) pow( (double) src0->f[2], (double) src1->f[2] );
+   dst->f[3] = (float) pow( (double) src0->f[3], (double) src1->f[3] );
+}
+
+static void
+micro_rnd(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) floor( (double) (src->f[0] + 0.5f) );
+   dst->f[1] = (float) floor( (double) (src->f[1] + 0.5f) );
+   dst->f[2] = (float) floor( (double) (src->f[2] + 0.5f) );
+   dst->f[3] = (float) floor( (double) (src->f[3] + 0.5f) );
+}
+
+static void
+micro_shl(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] << src1->i[0];
+   dst->i[1] = src0->i[1] << src1->i[1];
+   dst->i[2] = src0->i[2] << src1->i[2];
+   dst->i[3] = src0->i[3] << src1->i[3];
+}
+
+static void
+micro_ishr(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->i[0] = src0->i[0] >> src1->i[0];
+   dst->i[1] = src0->i[1] >> src1->i[1];
+   dst->i[2] = src0->i[2] >> src1->i[2];
+   dst->i[3] = src0->i[3] >> src1->i[3];
+}
+
+static void
+micro_trunc(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0 )
+{
+   dst->f[0] = (float) (int) src0->f[0];
+   dst->f[1] = (float) (int) src0->f[1];
+   dst->f[2] = (float) (int) src0->f[2];
+   dst->f[3] = (float) (int) src0->f[3];
+}
+
+static void
+micro_ushr(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] >> src1->u[0];
+   dst->u[1] = src0->u[1] >> src1->u[1];
+   dst->u[2] = src0->u[2] >> src1->u[2];
+   dst->u[3] = src0->u[3] >> src1->u[3];
+}
+
+static void
+micro_sin(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) sin( (double) src->f[0] );
+   dst->f[1] = (float) sin( (double) src->f[1] );
+   dst->f[2] = (float) sin( (double) src->f[2] );
+   dst->f[3] = (float) sin( (double) src->f[3] );
+}
+
+static void
+micro_sqrt( union tgsi_exec_channel *dst,
+            const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) sqrt( (double) src->f[0] );
+   dst->f[1] = (float) sqrt( (double) src->f[1] );
+   dst->f[2] = (float) sqrt( (double) src->f[2] );
+   dst->f[3] = (float) sqrt( (double) src->f[3] );
+}
+
+static void
+micro_sub(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->f[0] = src0->f[0] - src1->f[0];
+   dst->f[1] = src0->f[1] - src1->f[1];
+   dst->f[2] = src0->f[2] - src1->f[2];
+   dst->f[3] = src0->f[3] - src1->f[3];
+}
+
+static void
+micro_u2f(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src )
+{
+   dst->f[0] = (float) src->u[0];
+   dst->f[1] = (float) src->u[1];
+   dst->f[2] = (float) src->u[2];
+   dst->f[3] = (float) src->u[3];
+}
+
+static void
+micro_xor(
+   union tgsi_exec_channel *dst,
+   const union tgsi_exec_channel *src0,
+   const union tgsi_exec_channel *src1 )
+{
+   dst->u[0] = src0->u[0] ^ src1->u[0];
+   dst->u[1] = src0->u[1] ^ src1->u[1];
+   dst->u[2] = src0->u[2] ^ src1->u[2];
+   dst->u[3] = src0->u[3] ^ src1->u[3];
+}
+
+static void
+fetch_src_file_channel(
+   const struct tgsi_exec_machine *mach,
+   const uint file,
+   const uint swizzle,
+   const union tgsi_exec_channel *index,
+   union tgsi_exec_channel *chan )
+{
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( file ) {
+      case TGSI_FILE_CONSTANT:
+         chan->f[0] = mach->Consts[index->i[0]][swizzle];
+         chan->f[1] = mach->Consts[index->i[1]][swizzle];
+         chan->f[2] = mach->Consts[index->i[2]][swizzle];
+         chan->f[3] = mach->Consts[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_INPUT:
+         chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_IMMEDIATE:
+         assert( index->i[0] < (int) mach->ImmLimit );
+         chan->f[0] = mach->Imms[index->i[0]][swizzle];
+         assert( index->i[1] < (int) mach->ImmLimit );
+         chan->f[1] = mach->Imms[index->i[1]][swizzle];
+         assert( index->i[2] < (int) mach->ImmLimit );
+         chan->f[2] = mach->Imms[index->i[2]][swizzle];
+         assert( index->i[3] < (int) mach->ImmLimit );
+         chan->f[3] = mach->Imms[index->i[3]][swizzle];
+         break;
+
+      case TGSI_FILE_ADDRESS:
+         chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      case TGSI_FILE_OUTPUT:
+         /* vertex/fragment output vars can be read too */
+         chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
+         chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
+         chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
+         chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
+         break;
+
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      *chan = mach->Temps[TEMP_0_I].xyzw[TEMP_0_C];
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      *chan = mach->Temps[TEMP_1_I].xyzw[TEMP_1_C];
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+static void
+fetch_source(
+   const struct tgsi_exec_machine *mach,
+   union tgsi_exec_channel *chan,
+   const struct tgsi_full_src_register *reg,
+   const uint chan_index )
+{
+   union tgsi_exec_channel index;
+   uint swizzle;
+
+   index.i[0] =
+   index.i[1] =
+   index.i[2] =
+   index.i[3] = reg->SrcRegister.Index;
+
+   if (reg->SrcRegister.Indirect) {
+      union tgsi_exec_channel index2;
+      union tgsi_exec_channel indir_index;
+
+      index2.i[0] =
+      index2.i[1] =
+      index2.i[2] =
+      index2.i[3] = reg->SrcRegisterInd.Index;
+
+      swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterInd, CHAN_X );
+      fetch_src_file_channel(
+         mach,
+         reg->SrcRegisterInd.File,
+         swizzle,
+         &index2,
+         &indir_index );
+
+      index.i[0] += indir_index.i[0];
+      index.i[1] += indir_index.i[1];
+      index.i[2] += indir_index.i[2];
+      index.i[3] += indir_index.i[3];
+   }
+
+   if( reg->SrcRegister.Dimension ) {
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_INPUT:
+         index.i[0] *= 17;
+         index.i[1] *= 17;
+         index.i[2] *= 17;
+         index.i[3] *= 17;
+         break;
+      case TGSI_FILE_CONSTANT:
+         index.i[0] *= 4096;
+         index.i[1] *= 4096;
+         index.i[2] *= 4096;
+         index.i[3] *= 4096;
+         break;
+      default:
+         assert( 0 );
+      }
+
+      index.i[0] += reg->SrcRegisterDim.Index;
+      index.i[1] += reg->SrcRegisterDim.Index;
+      index.i[2] += reg->SrcRegisterDim.Index;
+      index.i[3] += reg->SrcRegisterDim.Index;
+
+      if (reg->SrcRegisterDim.Indirect) {
+         union tgsi_exec_channel index2;
+         union tgsi_exec_channel indir_index;
+
+         index2.i[0] =
+         index2.i[1] =
+         index2.i[2] =
+         index2.i[3] = reg->SrcRegisterDimInd.Index;
+
+         swizzle = tgsi_util_get_src_register_swizzle( &reg->SrcRegisterDimInd, CHAN_X );
+         fetch_src_file_channel(
+            mach,
+            reg->SrcRegisterDimInd.File,
+            swizzle,
+            &index2,
+            &indir_index );
+
+         index.i[0] += indir_index.i[0];
+         index.i[1] += indir_index.i[1];
+         index.i[2] += indir_index.i[2];
+         index.i[3] += indir_index.i[3];
+      }
+   }
+
+   swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+   fetch_src_file_channel(
+      mach,
+      reg->SrcRegister.File,
+      swizzle,
+      &index,
+      chan );
+
+   switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      micro_abs( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      micro_abs( chan, chan );
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      micro_neg( chan, chan );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+
+   if (reg->SrcRegisterExtMod.Complement) {
+      micro_sub( chan, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], chan );
+   }
+}
+
+static void
+store_dest(
+   struct tgsi_exec_machine *mach,
+   const union tgsi_exec_channel *chan,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   uint chan_index )
+{
+   union tgsi_exec_channel *dst;
+
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_NULL:
+      return;
+
+   case TGSI_FILE_OUTPUT:
+      dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+                           + reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      dst = &mach->Temps[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      dst = &mach->Addrs[reg->DstRegister.Index].xyzw[chan_index];
+      break;
+
+   default:
+      assert( 0 );
+      return;
+   }
+
+   switch (inst->Instruction.Saturate)
+   {
+   case TGSI_SAT_NONE:
+      if (mach->ExecMask & 0x1)
+         dst->i[0] = chan->i[0];
+      if (mach->ExecMask & 0x2)
+         dst->i[1] = chan->i[1];
+      if (mach->ExecMask & 0x4)
+         dst->i[2] = chan->i[2];
+      if (mach->ExecMask & 0x8)
+         dst->i[3] = chan->i[3];
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+      /* XXX need to obey ExecMask here */
+      micro_max(dst, chan, &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C]);
+      micro_min(dst, dst, &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C]);
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+#define FETCH(VAL,INDEX,CHAN)\
+    fetch_source (mach, VAL, &inst->FullSrcRegisters[INDEX], CHAN)
+
+#define STORE(VAL,INDEX,CHAN)\
+    store_dest (mach, VAL, &inst->FullDstRegisters[INDEX], inst, CHAN )
+
+
+/**
+ * Execute ARB-style KIL which is predicated by a src register.
+ * Kill fragment if any of the four values is less than zero.
+ */
+static void
+exec_kilp(struct tgsi_exec_machine *mach,
+          const struct tgsi_full_instruction *inst)
+{
+   uint uniquemask;
+   uint chan_index;
+   uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
+   union tgsi_exec_channel r[1];
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   for (chan_index = 0; chan_index < 4; chan_index++)
+   {
+      uint swizzle;
+      uint i;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle (
+                        &inst->FullSrcRegisters[0],
+                        chan_index);
+
+      /* check if the component has not been already tested */
+      if (uniquemask & (1 << swizzle))
+         continue;
+      uniquemask |= 1 << swizzle;
+
+      FETCH(&r[0], 0, chan_index);
+      for (i = 0; i < 4; i++)
+         if (r[0].f[i] < 0.0f)
+            kilmask |= 1 << i;
+   }
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
+}
+
+
+/*
+ * Fetch a texel using STR texture coordinates.
+ */
+static void
+fetch_texel( struct tgsi_sampler *sampler,
+             const union tgsi_exec_channel *s,
+             const union tgsi_exec_channel *t,
+             const union tgsi_exec_channel *p,
+             float lodbias,  /* XXX should be float[4] */
+             union tgsi_exec_channel *r,
+             union tgsi_exec_channel *g,
+             union tgsi_exec_channel *b,
+             union tgsi_exec_channel *a )
+{
+   uint j;
+   float rgba[NUM_CHANNELS][QUAD_SIZE];
+
+   sampler->get_samples(sampler, s->f, t->f, p->f, lodbias, rgba);
+
+   for (j = 0; j < 4; j++) {
+      r->f[j] = rgba[0][j];
+      g->f[j] = rgba[1][j];
+      b->f[j] = rgba[2][j];
+      a->f[j] = rgba[3][j];
+   }
+}
+
+
+static void
+exec_tex(struct tgsi_exec_machine *mach,
+         const struct tgsi_full_instruction *inst,
+         boolean biasLod)
+{
+   const uint unit = inst->FullSrcRegisters[1].SrcRegister.Index;
+   union tgsi_exec_channel r[8];
+   uint chan_index;
+   float lodBias;
+
+   /*   debug_printf("Sampler %u unit %u\n", sampler, unit); */
+
+   switch (inst->InstructionExtTexture.Texture) {
+   case TGSI_TEXTURE_1D:
+
+      FETCH(&r[0], 0, CHAN_X);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[1], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[1] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[1], 0, CHAN_W);
+         lodBias = r[2].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], NULL, NULL, lodBias,  /* S, T, P, BIAS */
+                  &r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
+      break;
+
+   case TGSI_TEXTURE_2D:
+   case TGSI_TEXTURE_RECT:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,  /* inputs */
+                  &r[0], &r[1], &r[2], &r[3]);  /* outputs */
+      break;
+
+   case TGSI_TEXTURE_3D:
+   case TGSI_TEXTURE_CUBE:
+
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 0, CHAN_Z);
+
+      switch (inst->FullSrcRegisters[0].SrcRegisterExtSwz.ExtDivide) {
+      case TGSI_EXTSWIZZLE_W:
+         FETCH(&r[3], 0, CHAN_W);
+         micro_div( &r[0], &r[0], &r[3] );
+         micro_div( &r[1], &r[1], &r[3] );
+         micro_div( &r[2], &r[2], &r[3] );
+         break;
+
+      case TGSI_EXTSWIZZLE_ONE:
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if (biasLod) {
+         FETCH(&r[3], 0, CHAN_W);
+         lodBias = r[3].f[0];
+      }
+      else
+         lodBias = 0.0;
+
+      fetch_texel(&mach->Samplers[unit],
+                  &r[0], &r[1], &r[2], lodBias,
+                  &r[0], &r[1], &r[2], &r[3]);
+      break;
+
+   default:
+      assert (0);
+   }
+
+   FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+      STORE( &r[chan_index], 0, chan_index );
+   }
+}
+
+
+/**
+ * Evaluate a constant-valued coefficient at the position of the
+ * current quad.
+ */
+static void
+eval_constant_coef(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   unsigned i;
+
+   for( i = 0; i < QUAD_SIZE; i++ ) {
+      mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
+   }
+}
+
+/**
+ * Evaluate a linear-valued coefficient at the position of the
+ * current quad.
+ */
+static void
+eval_linear_coef(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0;
+   mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
+   mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
+   mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
+}
+
+/**
+ * Evaluate a perspective-valued coefficient at the position of the
+ * current quad.
+ */
+static void
+eval_perspective_coef(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan )
+{
+   const float x = mach->QuadPos.xyzw[0].f[0];
+   const float y = mach->QuadPos.xyzw[1].f[0];
+   const float dadx = mach->InterpCoefs[attrib].dadx[chan];
+   const float dady = mach->InterpCoefs[attrib].dady[chan];
+   const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
+   const float *w = mach->QuadPos.xyzw[3].f;
+   /* divide by W here */
+   mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
+   mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
+   mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
+   mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
+}
+
+
+typedef void (* eval_coef_func)(
+   struct tgsi_exec_machine *mach,
+   unsigned attrib,
+   unsigned chan );
+
+static void
+exec_declaration(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_full_declaration *decl )
+{
+   if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
+      if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+         unsigned first, last, mask;
+         eval_coef_func eval;
+
+         assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+         first = decl->u.DeclarationRange.First;
+         last = decl->u.DeclarationRange.Last;
+         mask = decl->Declaration.UsageMask;
+
+         switch( decl->Interpolation.Interpolate ) {
+         case TGSI_INTERPOLATE_CONSTANT:
+            eval = eval_constant_coef;
+            break;
+
+         case TGSI_INTERPOLATE_LINEAR:
+            eval = eval_linear_coef;
+            break;
+
+         case TGSI_INTERPOLATE_PERSPECTIVE:
+            eval = eval_perspective_coef;
+            break;
+
+         default:
+            assert( 0 );
+         }
+
+         if( mask == TGSI_WRITEMASK_XYZW ) {
+            unsigned i, j;
+
+            for( i = first; i <= last; i++ ) {
+               for( j = 0; j < NUM_CHANNELS; j++ ) {
+                  eval( mach, i, j );
+               }
+            }
+         }
+         else {
+            unsigned i, j;
+
+            for( j = 0; j < NUM_CHANNELS; j++ ) {
+               if( mask & (1 << j) ) {
+                  for( i = first; i <= last; i++ ) {
+                     eval( mach, i, j );
+                  }
+               }
+            }
+         }
+      }
+   }
+}
+
+static void
+exec_instruction(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_full_instruction *inst,
+   int *pc )
+{
+   uint chan_index;
+   union tgsi_exec_channel r[8];
+
+   (*pc)++;
+
+   switch (inst->Instruction.Opcode) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 FETCH( &r[0], 0, chan_index );
+	 micro_f2it( &r[0], &r[0] );
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   /* TGSI_OPCODE_SWZ */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_X );
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	    micro_max( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Y );
+	 }
+
+	 if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	    FETCH( &r[1], 0, CHAN_Y );
+	    micro_max( &r[1], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+
+	    FETCH( &r[2], 0, CHAN_W );
+	    micro_min( &r[2], &r[2], &mach->Temps[TEMP_128_I].xyzw[TEMP_128_C] );
+	    micro_max( &r[2], &r[2], &mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C] );
+	    micro_pow( &r[1], &r[1], &r[2] );
+	    micro_lt( &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+	    STORE( &r[0], 0, CHAN_Z );
+	 }
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sqrt( &r[0], &r[0] );
+      micro_div( &r[0], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_LOG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
+      {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_mul( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Z );
+      FETCH( &r[2], 1, CHAN_Z );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+    case TGSI_OPCODE_DP4:
+    /* TGSI_OPCODE_DOT4 */
+       FETCH(&r[0], 0, CHAN_X);
+       FETCH(&r[1], 1, CHAN_X);
+
+       micro_mul( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Y);
+       FETCH(&r[2], 1, CHAN_Y);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_Z);
+       FETCH(&r[2], 1, CHAN_Z);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+       FETCH(&r[1], 0, CHAN_W);
+       FETCH(&r[2], 1, CHAN_W);
+
+       micro_mul( &r[1], &r[1], &r[2] );
+       micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+	 STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+	 FETCH( &r[0], 0, CHAN_Y );
+	 FETCH( &r[1], 1, CHAN_Y);
+	 micro_mul( &r[0], &r[0], &r[1] );
+	 STORE( &r[0], 0, CHAN_Y );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+	 FETCH( &r[0], 0, CHAN_Z );
+	 STORE( &r[0], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+	 FETCH( &r[0], 1, CHAN_W );
+	 STORE( &r[0], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_min()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         /* XXX use micro_max()?? */
+         micro_lt( &r[0], &r[0], &r[1], &r[1], &r[0] );
+
+         STORE(&r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_mul( &r[0], &r[0], &r[1] );
+         FETCH( &r[1], 2, chan_index );
+         micro_add( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+
+         micro_sub( &r[0], &r[0], &r[1] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_sub( &r[1], &r[1], &r[2] );
+         micro_mul( &r[0], &r[0], &r[1] );
+         micro_add( &r[0], &r[0], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CND0:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+      /* TGSI_OPCODE_DP2A */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_frc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_flr( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_rnd( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+    /* TGSI_OPCODE_EX2 */
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_pow( &r[0], &mach->Temps[TEMP_2_I].xyzw[TEMP_2_C], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( &r[0], 0, CHAN_X );
+      micro_lg2( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+      /* TGSI_OPCODE_POW */
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_pow( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      /* TGSI_OPCODE_XPD */
+      FETCH(&r[0], 0, CHAN_Y);
+      FETCH(&r[1], 1, CHAN_Z);
+
+      micro_mul( &r[2], &r[0], &r[1] );
+
+      FETCH(&r[3], 0, CHAN_Z);
+      FETCH(&r[4], 1, CHAN_Y);
+
+      micro_mul( &r[5], &r[3], &r[4] );
+      micro_sub( &r[2], &r[2], &r[5] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
+         STORE( &r[2], 0, CHAN_X );
+      }
+
+      FETCH(&r[2], 1, CHAN_X);
+
+      micro_mul( &r[3], &r[3], &r[2] );
+
+      FETCH(&r[5], 0, CHAN_X);
+
+      micro_mul( &r[1], &r[1], &r[5] );
+      micro_sub( &r[3], &r[3], &r[1] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
+         STORE( &r[3], 0, CHAN_Y );
+      }
+
+      micro_mul( &r[5], &r[5], &r[4] );
+      micro_mul( &r[0], &r[0], &r[2] );
+      micro_sub( &r[5], &r[5], &r[0] );
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
+         STORE( &r[5], 0, CHAN_Z );
+      }
+
+      if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+    case TGSI_OPCODE_MULTIPLYMATRIX:
+       assert (0);
+       break;
+
+    case TGSI_OPCODE_ABS:
+       FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+          FETCH(&r[0], 0, chan_index);
+
+          micro_abs( &r[0], &r[0] );
+
+          STORE(&r[0], 0, chan_index);
+       }
+       break;
+
+   case TGSI_OPCODE_RCC:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH(&r[0], 0, CHAN_X);
+      FETCH(&r[1], 1, CHAN_X);
+
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Y);
+      FETCH(&r[2], 1, CHAN_Y);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 0, CHAN_Z);
+      FETCH(&r[2], 1, CHAN_Z);
+
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FETCH(&r[1], 1, CHAN_W);
+
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH(&r[0], 0, CHAN_X);
+
+      micro_cos( &r[0], &r[0] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+	 STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddx( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDY:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ddy( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_KILP:
+      exec_kilp (mach, inst);
+      break;
+
+   case TGSI_OPCODE_KIL:
+      /* for enabled ExecMask bits, set the killed bit */
+      mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= mach->ExecMask;
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_RFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1],
+                   &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C],
+                   &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SFL:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_SGT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_lt( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( &r[0], 0, CHAN_X );
+      micro_sin( &r[0], &r[0] );
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ge( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SNE:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_eq( &r[0], &r[0], &r[1], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_STR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TEX:
+      /* simple texture lookup */
+      /* src[0] = texcoord */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, FALSE);
+      break;
+
+   case TGSI_OPCODE_TXB:
+      /* Texture lookup with lod bias */
+      /* src[0] = texcoord (src[0].w = LOD bias) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_TXD:
+      /* Texture lookup with explict partial derivatives */
+      /* src[0] = texcoord */
+      /* src[1] = d[strq]/dx */
+      /* src[2] = d[strq]/dy */
+      /* src[3] = sampler unit */
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXL:
+      /* Texture lookup with explit LOD */
+      /* src[0] = texcoord (src[0].w = LOD) */
+      /* src[1] = sampler unit */
+      exec_tex(mach, inst, TRUE);
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_X2D:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ARR:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_BRA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CAL:
+      /* skip the call if no execution channels are enabled */
+      if (mach->ExecMask) {
+         /* do the call */
+
+         /* push the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+         mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+         assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+         assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+         mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+
+         assert(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
+         mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
+
+         /* note that PC was already incremented above */
+         mach->CallStack[mach->CallStackTop++] = *pc;
+         *pc = inst->InstructionExtLabel.Label;
+      }
+      break;
+
+   case TGSI_OPCODE_RET:
+      mach->FuncMask &= ~mach->ExecMask;
+      UPDATE_EXEC_MASK(mach);
+
+      if (mach->ExecMask == 0x0) {
+         /* really return now (otherwise, keep executing */
+
+         if (mach->CallStackTop == 0) {
+            /* returning from main() */
+            *pc = -1;
+            return;
+         }
+         *pc = mach->CallStack[--mach->CallStackTop];
+
+         /* pop the Cond, Loop, Cont stacks */
+         assert(mach->CondStackTop > 0);
+         mach->CondMask = mach->CondStack[--mach->CondStackTop];
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+         assert(mach->FuncStackTop > 0);
+         mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
+
+         UPDATE_EXEC_MASK(mach);
+      }
+      break;
+
+   case TGSI_OPCODE_SSG:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CMP:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH(&r[0], 0, chan_index);
+         FETCH(&r[1], 1, chan_index);
+         FETCH(&r[2], 2, chan_index);
+
+         micro_lt( &r[0], &r[0], &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], &r[1], &r[2] );
+
+         STORE(&r[0], 0, chan_index);
+      }
+      break;
+
+   case TGSI_OPCODE_SCS:
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( &r[0], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+         micro_cos( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_X );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         micro_sin( &r[1], &r[0] );
+         STORE( &r[1], 0, CHAN_Y );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
+      }
+      if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_NRM:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_DIV:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_DP2:
+      FETCH( &r[0], 0, CHAN_X );
+      FETCH( &r[1], 1, CHAN_X );
+      micro_mul( &r[0], &r[0], &r[1] );
+
+      FETCH( &r[1], 0, CHAN_Y );
+      FETCH( &r[2], 1, CHAN_Y );
+      micro_mul( &r[1], &r[1], &r[2] );
+      micro_add( &r[0], &r[0], &r[1] );
+
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_IF:
+      /* push CondMask */
+      assert(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
+      mach->CondStack[mach->CondStackTop++] = mach->CondMask;
+      FETCH( &r[0], 0, CHAN_X );
+      /* update CondMask */
+      if( ! r[0].u[0] ) {
+         mach->CondMask &= ~0x1;
+      }
+      if( ! r[0].u[1] ) {
+         mach->CondMask &= ~0x2;
+      }
+      if( ! r[0].u[2] ) {
+         mach->CondMask &= ~0x4;
+      }
+      if( ! r[0].u[3] ) {
+         mach->CondMask &= ~0x8;
+      }
+      UPDATE_EXEC_MASK(mach);
+      /* Todo: If CondMask==0, jump to ELSE */
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      /* invert CondMask wrt previous mask */
+      {
+         uint prevMask;
+         assert(mach->CondStackTop > 0);
+         prevMask = mach->CondStack[mach->CondStackTop - 1];
+         mach->CondMask = ~mach->CondMask & prevMask;
+         UPDATE_EXEC_MASK(mach);
+         /* Todo: If CondMask==0, jump to ENDIF */
+      }
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      /* pop CondMask */
+      assert(mach->CondStackTop > 0);
+      mach->CondMask = mach->CondStack[--mach->CondStackTop];
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_END:
+      /* halt execution */
+      *pc = -1;
+      break;
+
+   case TGSI_OPCODE_REP:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+       assert (0);
+       break;
+
+   case TGSI_OPCODE_PUSHA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_POPA:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_ceil( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_I2F:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_i2f( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_NOT:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_not( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         micro_trunc( &r[0], &r[0] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHL:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_shl( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SHR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_ishr( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_AND:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_and( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_OR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_or( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_XOR:
+      FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( &r[0], 0, chan_index );
+         FETCH( &r[1], 1, chan_index );
+         micro_xor( &r[0], &r[0], &r[1] );
+         STORE( &r[0], 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SAD:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXF:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      assert (0);
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
+      mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      /* fall-through (for now) */
+   case TGSI_OPCODE_BGNLOOP2:
+      /* push LoopMask and ContMasks */
+      assert(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
+      assert(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
+      mach->ContStack[mach->ContStackTop++] = mach->ContMask;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      /* fall-through (for now at least) */
+   case TGSI_OPCODE_ENDLOOP2:
+      /* Restore ContMask, but don't pop */
+      assert(mach->ContStackTop > 0);
+      mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
+      if (mach->LoopMask) {
+         /* repeat loop: jump to instruction just past BGNLOOP */
+         *pc = inst->InstructionExtLabel.Label + 1;
+      }
+      else {
+         /* exit loop: pop LoopMask */
+         assert(mach->LoopStackTop > 0);
+         mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
+         /* pop ContMask */
+         assert(mach->ContStackTop > 0);
+         mach->ContMask = mach->ContStack[--mach->ContStackTop];
+      }
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BRK:
+      /* turn off loop channels for each enabled exec channel */
+      mach->LoopMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_CONT:
+      /* turn off cont channels for each enabled exec channel */
+      mach->ContMask &= ~mach->ExecMask;
+      /* Todo: if mach->LoopMask == 0, jump to end of loop */
+      UPDATE_EXEC_MASK(mach);
+      break;
+
+   case TGSI_OPCODE_BGNSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_ENDSUB:
+      /* no-op */
+      break;
+
+   case TGSI_OPCODE_NOISE1:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE2:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE3:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOISE4:
+      assert( 0 );
+      break;
+
+   case TGSI_OPCODE_NOP:
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
+
+/**
+ * Run TGSI interpreter.
+ * \return bitmask of "alive" quad components
+ */
+uint
+tgsi_exec_machine_run( struct tgsi_exec_machine *mach )
+{
+   uint i;
+   int pc = 0;
+
+   mach->CondMask = 0xf;
+   mach->LoopMask = 0xf;
+   mach->ContMask = 0xf;
+   mach->FuncMask = 0xf;
+   mach->ExecMask = 0xf;
+
+   mach->CondStackTop = 0; /* temporarily subvert this assertion */
+   assert(mach->CondStackTop == 0);
+   assert(mach->LoopStackTop == 0);
+   assert(mach->ContStackTop == 0);
+   assert(mach->CallStackTop == 0);
+
+   mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
+   mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
+
+   if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
+      mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
+      mach->Primitives[0] = 0;
+   }
+
+
+   /* execute declarations (interpolants) */
+   for (i = 0; i < mach->NumDeclarations; i++) {
+      exec_declaration( mach, mach->Declarations+i );
+   }
+
+   /* execute instructions, until pc is set to -1 */
+   while (pc != -1) {
+      assert(pc < mach->NumInstructions);
+      exec_instruction( mach, mach->Instructions + pc, &pc );
+   }
+
+#if 0
+   /* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
+   if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
+      /*
+       * Scale back depth component.
+       */
+      for (i = 0; i < 4; i++)
+         mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
+   }
+#endif
+
+   return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
+}
+
+
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
new file mode 100644
index 00000000000..45c49dd007c
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_exec.h
@@ -0,0 +1,243 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#if !defined TGSI_EXEC_H
+#define TGSI_EXEC_H
+
+#include "pipe/p_compiler.h"
+
+#if defined __cplusplus
+extern "C" {
+#endif
+
+#define NUM_CHANNELS 4  /* R,G,B,A */
+#define QUAD_SIZE    4  /* 4 pixel/quad */
+
+/**
+  * Registers may be treated as float, signed int or unsigned int.
+  */
+union tgsi_exec_channel
+{
+   float    f[QUAD_SIZE];
+   int      i[QUAD_SIZE];
+   unsigned u[QUAD_SIZE];
+};
+
+/**
+  * A vector[RGBA] of channels[4 pixels]
+  */
+struct tgsi_exec_vector
+{
+   union tgsi_exec_channel xyzw[NUM_CHANNELS];
+};
+
+/**
+ * For fragment programs, information for computing fragment input
+ * values from plane equation of the triangle/line.
+ */
+struct tgsi_interp_coef
+{
+   float a0[NUM_CHANNELS];	/* in an xyzw layout */
+   float dadx[NUM_CHANNELS];
+   float dady[NUM_CHANNELS];
+};
+
+
+struct softpipe_tile_cache;  /**< Opaque to TGSI */
+
+/**
+ * Information for sampling textures, which must be implemented
+ * by code outside the TGSI executor.
+ */
+struct tgsi_sampler
+{
+   const struct pipe_sampler_state *state;
+   struct pipe_texture *texture;
+   /** Get samples for four fragments in a quad */
+   void (*get_samples)(struct tgsi_sampler *sampler,
+                       const float s[QUAD_SIZE],
+                       const float t[QUAD_SIZE],
+                       const float p[QUAD_SIZE],
+                       float lodbias,
+                       float rgba[NUM_CHANNELS][QUAD_SIZE]);
+   void *pipe; /*XXX temporary*/
+   struct softpipe_tile_cache *cache;
+};
+
+/**
+ * For branching/calling subroutines.
+ */
+struct tgsi_exec_labels
+{
+   unsigned labels[128][2];
+   unsigned count;
+};
+
+/*
+ * Locations of various utility registers (_I = Index, _C = Channel)
+ */
+#define TGSI_EXEC_TEMP_00000000_I   32
+#define TGSI_EXEC_TEMP_00000000_C   0
+
+#define TGSI_EXEC_TEMP_7FFFFFFF_I   32
+#define TGSI_EXEC_TEMP_7FFFFFFF_C   1
+
+#define TGSI_EXEC_TEMP_80000000_I   32
+#define TGSI_EXEC_TEMP_80000000_C   2
+
+#define TGSI_EXEC_TEMP_FFFFFFFF_I   32
+#define TGSI_EXEC_TEMP_FFFFFFFF_C   3
+
+#define TGSI_EXEC_TEMP_ONE_I        33
+#define TGSI_EXEC_TEMP_ONE_C        0
+
+#define TGSI_EXEC_TEMP_TWO_I        33
+#define TGSI_EXEC_TEMP_TWO_C        1
+
+#define TGSI_EXEC_TEMP_128_I        33
+#define TGSI_EXEC_TEMP_128_C        2
+
+#define TGSI_EXEC_TEMP_MINUS_128_I  33
+#define TGSI_EXEC_TEMP_MINUS_128_C  3
+
+#define TGSI_EXEC_TEMP_KILMASK_I    34
+#define TGSI_EXEC_TEMP_KILMASK_C    0
+
+#define TGSI_EXEC_TEMP_OUTPUT_I     34
+#define TGSI_EXEC_TEMP_OUTPUT_C     1
+
+#define TGSI_EXEC_TEMP_PRIMITIVE_I  34
+#define TGSI_EXEC_TEMP_PRIMITIVE_C  2
+
+#define TGSI_EXEC_TEMP_R0           35
+
+#define TGSI_EXEC_NUM_TEMPS   (32 + 4)
+#define TGSI_EXEC_NUM_ADDRS   1
+#define TGSI_EXEC_NUM_IMMEDIATES  256
+
+#define TGSI_EXEC_MAX_COND_NESTING  10
+#define TGSI_EXEC_MAX_LOOP_NESTING  10
+#define TGSI_EXEC_MAX_CALL_NESTING  10
+
+/**
+ * Run-time virtual machine state for executing TGSI shader.
+ */
+struct tgsi_exec_machine
+{
+   /*
+    * 32 program temporaries
+    * 4  internal temporaries
+    * 1  address
+    * 1  temporary of padding to align to 16 bytes
+    */
+   struct tgsi_exec_vector       _Temps[TGSI_EXEC_NUM_TEMPS + TGSI_EXEC_NUM_ADDRS + 1];
+
+   /*
+    * This will point to _Temps after aligning to 16B boundary.
+    */
+   struct tgsi_exec_vector       *Temps;
+   struct tgsi_exec_vector       *Addrs;
+
+   struct tgsi_sampler           *Samplers;
+
+   float                         Imms[TGSI_EXEC_NUM_IMMEDIATES][4];
+   unsigned                      ImmLimit;
+   float                         (*Consts)[4];
+   struct tgsi_exec_vector       *Inputs;
+   struct tgsi_exec_vector       *Outputs;
+   const struct tgsi_token       *Tokens;
+   unsigned                      Processor;
+
+   /* GEOMETRY processor only. */
+   unsigned                      *Primitives;
+
+   /* FRAGMENT processor only. */
+   const struct tgsi_interp_coef *InterpCoefs;
+   struct tgsi_exec_vector       QuadPos;
+
+   /* Conditional execution masks */
+   uint CondMask;  /**< For IF/ELSE/ENDIF */
+   uint LoopMask;  /**< For BGNLOOP/ENDLOOP */
+   uint ContMask;  /**< For loop CONT statements */
+   uint FuncMask;  /**< For function calls */
+   uint ExecMask;  /**< = CondMask & LoopMask */
+
+   /** Condition mask stack (for nested conditionals) */
+   uint CondStack[TGSI_EXEC_MAX_COND_NESTING];
+   int CondStackTop;
+
+   /** Loop mask stack (for nested loops) */
+   uint LoopStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int LoopStackTop;
+
+   /** Loop continue mask stack (see comments in tgsi_exec.c) */
+   uint ContStack[TGSI_EXEC_MAX_LOOP_NESTING];
+   int ContStackTop;
+
+   /** Function execution mask stack (for executing subroutine code) */
+   uint FuncStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int FuncStackTop;
+
+   /** Function call stack for saving/restoring the program counter */
+   uint CallStack[TGSI_EXEC_MAX_CALL_NESTING];
+   int CallStackTop;
+
+   struct tgsi_full_instruction *Instructions;
+   uint NumInstructions;
+
+   struct tgsi_full_declaration *Declarations;
+   uint NumDeclarations;
+
+   struct tgsi_exec_labels Labels;
+};
+
+void
+tgsi_exec_machine_init(
+   struct tgsi_exec_machine *mach );
+
+
+void 
+tgsi_exec_machine_bind_shader(
+   struct tgsi_exec_machine *mach,
+   const struct tgsi_token *tokens,
+   uint numSamplers,
+   struct tgsi_sampler *samplers);
+
+uint
+tgsi_exec_machine_run(
+   struct tgsi_exec_machine *mach );
+
+
+void
+tgsi_exec_machine_free_data(struct tgsi_exec_machine *mach);
+
+
+#if defined __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* TGSI_EXEC_H */
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
new file mode 100755
index 00000000000..62c6a69c63f
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.c
@@ -0,0 +1,2379 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
+#include "tgsi_exec.h"
+#include "tgsi_sse2.h"
+
+#include "x86/rtasm/x86sse.h"
+
+#if defined(__i386__) || defined(__386__)
+
+#define DUMP_SSE  0
+
+#if DUMP_SSE
+
+static void
+_print_reg(
+   struct x86_reg reg )
+{
+   if (reg.mod != mod_REG) 
+      debug_printf( "[" );
+      
+   switch( reg.file ) {
+   case file_REG32:
+      switch( reg.idx ) {
+      case reg_AX:
+         debug_printf( "EAX" );
+         break;
+      case reg_CX:
+         debug_printf( "ECX" );
+         break;
+      case reg_DX:
+         debug_printf( "EDX" );
+         break;
+      case reg_BX:
+         debug_printf( "EBX" );
+         break;
+      case reg_SP:
+         debug_printf( "ESP" );
+         break;
+      case reg_BP:
+         debug_printf( "EBP" );
+         break;
+      case reg_SI:
+         debug_printf( "ESI" );
+         break;
+      case reg_DI:
+         debug_printf( "EDI" );
+         break;
+      }
+      break;
+   case file_MMX:
+      assert( 0 );
+      break;
+   case file_XMM:
+      debug_printf( "XMM%u", reg.idx );
+      break;
+   case file_x87:
+      assert( 0 );
+      break;
+   }
+
+   if (reg.mod == mod_DISP8 ||
+       reg.mod == mod_DISP32)
+      debug_printf("+%d", reg.disp);
+
+   if (reg.mod != mod_REG) 
+      debug_printf( "]" );
+}
+
+static void
+_fill(
+   const char  *op )
+{
+   unsigned count = 10 - strlen( op );
+
+   while( count-- ) {
+      debug_printf( " " );
+   }
+}
+
+#define DUMP_START() debug_printf( "\nsse-dump start ----------------" )
+#define DUMP_END() debug_printf( "\nsse-dump end ----------------\n" )
+#define DUMP( OP ) debug_printf( "\n%s", OP )
+#define DUMP_I( OP, I ) do {\
+   debug_printf( "\n%s", OP );\
+   _fill( OP );\
+   debug_printf( "%u", I ); } while( 0 )
+#define DUMP_R( OP, R0 ) do {\
+   debug_printf( "\n%s", OP );\
+   _fill( OP );\
+   _print_reg( R0 ); } while( 0 )
+#define DUMP_RR( OP, R0, R1 ) do {\
+   debug_printf( "\n%s", OP );\
+   _fill( OP );\
+   _print_reg( R0 );\
+   debug_printf( ", " );\
+   _print_reg( R1 ); } while( 0 )
+#define DUMP_RRI( OP, R0, R1, I ) do {\
+   debug_printf( "\n%s", OP );\
+   _fill( OP );\
+   _print_reg( R0 );\
+   debug_printf( ", " );\
+   _print_reg( R1 );\
+   debug_printf( ", " );\
+   debug_printf( "%u", I ); } while( 0 )
+
+#else
+
+#define DUMP_START()
+#define DUMP_END()
+#define DUMP( OP )
+#define DUMP_I( OP, I )
+#define DUMP_R( OP, R0 )
+#define DUMP_RR( OP, R0, R1 )
+#define DUMP_RRI( OP, R0, R1, I )
+
+#endif
+
+#define FOR_EACH_CHANNEL( CHAN )\
+   for( CHAN = 0; CHAN < 4; CHAN++ )
+
+#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   ((INST).FullDstRegisters[0].DstRegister.WriteMask & (1 << (CHAN)))
+
+#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
+   if( IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
+
+#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
+   FOR_EACH_CHANNEL( CHAN )\
+      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
+
+#define CHAN_X 0
+#define CHAN_Y 1
+#define CHAN_Z 2
+#define CHAN_W 3
+
+#define TEMP_R0   TGSI_EXEC_TEMP_R0
+
+/**
+ * X86 utility functions.
+ */
+
+static struct x86_reg
+make_xmm(
+   unsigned xmm )
+{
+   return x86_make_reg(
+      file_XMM,
+      (enum x86_reg_name) xmm );
+}
+
+/**
+ * X86 register mapping helpers.
+ */
+
+static struct x86_reg
+get_const_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_CX );
+}
+
+static struct x86_reg
+get_input_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_AX );
+}
+
+static struct x86_reg
+get_output_base( void )
+{
+   return x86_make_reg(
+      file_REG32,
+      reg_DX );
+}
+
+static struct x86_reg
+get_temp_base( void )
+{
+#ifdef WIN32
+   return x86_make_reg(
+      file_REG32,
+      reg_BX );
+#else
+   return x86_make_reg(
+      file_REG32,
+      reg_SI );
+#endif
+}
+
+static struct x86_reg
+get_coef_base( void )
+{
+   return get_output_base();
+}
+
+/**
+ * Data access helpers.
+ */
+
+static struct x86_reg
+get_argument(
+   unsigned index )
+{
+   return x86_make_disp(
+      x86_make_reg( file_REG32, reg_SP ),
+      (index + 1) * 4 );
+}
+
+static struct x86_reg
+get_const(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_const_base(),
+      (vec * 4 + chan) * 4 );
+}
+
+static struct x86_reg
+get_input(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_input_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_output(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_output_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_temp(
+   unsigned vec,
+   unsigned chan )
+{
+   return x86_make_disp(
+      get_temp_base(),
+      (vec * 4 + chan) * 16 );
+}
+
+static struct x86_reg
+get_coef(
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   return x86_make_disp(
+      get_coef_base(),
+      ((vec * 3 + member) * 4 + chan) * 4 );
+}
+
+/**
+ * X86 rtasm wrappers.
+ */
+
+static void
+emit_addps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "ADDPS", dst, src );
+   sse_addps( func, dst, src );
+}
+
+static void
+emit_andnps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "ANDNPS", dst, src );
+   sse_andnps( func, dst, src );
+}
+
+static void
+emit_andps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "ANDPS", dst, src );
+   sse_andps( func, dst, src );
+}
+
+static void
+emit_call(
+   struct x86_function  *func,
+   void                 (* addr)() )
+{
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+
+   DUMP_I( "CALL", addr );
+   x86_mov_reg_imm( func, ecx, (unsigned long) addr );
+   x86_call( func, ecx );
+}
+
+static void
+emit_cmpps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src,
+   enum sse_cc          cc )
+{
+   DUMP_RRI( "CMPPS", dst, src, cc );
+   sse_cmpps( func, dst, src, cc );
+}
+
+static void
+emit_cvttps2dq(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "CVTTPS2DQ", dst, src );
+   sse2_cvttps2dq( func, dst, src );
+}
+
+static void
+emit_maxps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "MAXPS", dst, src );
+   sse_maxps( func, dst, src );
+}
+
+static void
+emit_minps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "MINPS", dst, src );
+   sse_minps( func, dst, src );
+}
+
+static void
+emit_mov(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "MOV", dst, src );
+   x86_mov( func, dst, src );
+}
+
+static void
+emit_movaps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "MOVAPS", dst, src );
+   sse_movaps( func, dst, src );
+}
+
+static void
+emit_movss(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "MOVSS", dst, src );
+   sse_movss( func, dst, src );
+}
+
+static void
+emit_movups(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "MOVUPS", dst, src );
+   sse_movups( func, dst, src );
+}
+
+static void
+emit_mulps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "MULPS", dst, src );
+   sse_mulps( func, dst, src );
+}
+
+static void
+emit_or(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "OR", dst, src );
+   x86_or( func, dst, src );
+}
+
+static void
+emit_orps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "ORPS", dst, src );
+   sse_orps( func, dst, src );
+}
+
+static void
+emit_pmovmskb(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "PMOVMSKB", dst, src );
+   sse_pmovmskb( func, dst, src );
+}
+
+static void
+emit_pop(
+   struct x86_function  *func,
+   struct x86_reg       dst )
+{
+   DUMP_R( "POP", dst );
+   x86_pop( func, dst );
+}
+
+static void
+emit_push(
+   struct x86_function  *func,
+   struct x86_reg       dst )
+{
+   DUMP_R( "PUSH", dst );
+   x86_push( func, dst );
+}
+
+static void
+emit_rcpps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "RCPPS", dst, src );
+   sse2_rcpps( func, dst, src );
+}
+
+#ifdef WIN32
+static void
+emit_retw(
+   struct x86_function  *func,
+   unsigned             size )
+{
+   DUMP_I( "RET", size );
+   x86_retw( func, size );
+}
+#else
+static void
+emit_ret(
+   struct x86_function  *func )
+{
+   DUMP( "RET" );
+   x86_ret( func );
+}
+#endif
+
+static void
+emit_rsqrtps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "RSQRTPS", dst, src );
+   sse_rsqrtps( func, dst, src );
+}
+
+static void
+emit_shufps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src,
+   unsigned char        shuf )
+{
+   DUMP_RRI( "SHUFPS", dst, src, shuf );
+   sse_shufps( func, dst, src, shuf );
+}
+
+static void
+emit_subps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "SUBPS", dst, src );
+   sse_subps( func, dst, src );
+}
+
+static void
+emit_xorps(
+   struct x86_function  *func,
+   struct x86_reg       dst,
+   struct x86_reg       src )
+{
+   DUMP_RR( "XORPS", dst, src );
+   sse_xorps( func, dst, src );
+}
+
+/**
+ * Data fetch helpers.
+ */
+
+static void
+emit_const(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_movss(
+      func,
+      make_xmm( xmm ),
+      get_const( vec, chan ) );
+   emit_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+static void
+emit_inputf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_movups(
+      func,
+      make_xmm( xmm ),
+      get_input( vec, chan ) );
+}
+
+static void
+emit_output(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_movups(
+      func,
+      get_output( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_tempf(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_movaps(
+      func,
+      make_xmm( xmm ),
+      get_temp( vec, chan ) );
+}
+
+static void
+emit_coef(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan,
+   unsigned member )
+{
+   emit_movss(
+      func,
+      make_xmm( xmm ),
+      get_coef( vec, chan, member ) );
+   emit_shufps(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ),
+      SHUF( 0, 0, 0, 0 ) );
+}
+
+/**
+ * Data store helpers.
+ */
+
+static void
+emit_inputs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_movups(
+      func,
+      get_input( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_temps(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_movaps(
+      func,
+      get_temp( vec, chan ),
+      make_xmm( xmm ) );
+}
+
+static void
+emit_addrs(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_temps(
+      func,
+      xmm,
+      vec + TGSI_EXEC_NUM_TEMPS,
+      chan );
+}
+
+/**
+ * Coefficent fetch helpers.
+ */
+
+static void
+emit_coef_a0(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      0 );
+}
+
+static void
+emit_coef_dadx(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      1 );
+}
+
+static void
+emit_coef_dady(
+   struct x86_function *func,
+   unsigned xmm,
+   unsigned vec,
+   unsigned chan )
+{
+   emit_coef(
+      func,
+      xmm,
+      vec,
+      chan,
+      2 );
+}
+
+/**
+ * Function call helpers.
+ */
+
+static void
+emit_push_gp(
+   struct x86_function *func )
+{
+   emit_push(
+      func,
+      get_const_base() );
+   emit_push(
+      func,
+      get_input_base() );
+   emit_push(
+      func,
+      get_output_base() );
+
+   /* It is important on non-win32 platforms that temp base is pushed last.
+    */
+   emit_push(
+      func,
+      get_temp_base() );
+}
+
+static void
+emit_pop_gp(
+   struct x86_function *func )
+{
+   /* Restore GP registers in a reverse order.
+    */
+   emit_pop(
+      func,
+      get_temp_base() );
+   emit_pop(
+      func,
+      get_output_base() );
+   emit_pop(
+      func,
+      get_input_base() );
+   emit_pop(
+      func,
+      get_const_base() );
+}
+
+static void
+emit_func_call_dst(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   void (*code)() )
+{
+   emit_movaps(
+      func,
+      get_temp( TEMP_R0, 0 ),
+      make_xmm( xmm_dst ) );
+
+   emit_push_gp(
+      func );
+
+#ifdef WIN32
+   emit_push(
+      func,
+      get_temp( TEMP_R0, 0 ) );
+#endif
+
+   emit_call(
+      func,
+      code );
+
+   emit_pop_gp(
+      func );
+
+   emit_movaps(
+      func,
+      make_xmm( xmm_dst ),
+      get_temp( TEMP_R0, 0 ) );
+}
+
+static void
+emit_func_call_dst_src(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src,
+   void (*code)() )
+{
+   emit_movaps(
+      func,
+      get_temp( TEMP_R0, 1 ),
+      make_xmm( xmm_src ) );
+
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      code );
+}
+
+/**
+ * Low-level instruction translators.
+ */
+
+static void
+emit_abs(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   emit_andps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_7FFFFFFF_I,
+         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
+}
+
+static void
+emit_add(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_addps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void XSTDCALL
+cos4f(
+   float *store )
+{
+#ifdef WIN32
+   store[0] = (float) cos( (double) store[0] );
+   store[1] = (float) cos( (double) store[1] );
+   store[2] = (float) cos( (double) store[2] );
+   store[3] = (float) cos( (double) store[3] );
+#else
+   const unsigned X = TEMP_R0 * 16;
+   store[X + 0] = cosf( store[X + 0] );
+   store[X + 1] = cosf( store[X + 1] );
+   store[X + 2] = cosf( store[X + 2] );
+   store[X + 3] = cosf( store[X + 3] );
+#endif
+}
+
+static void
+emit_cos(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      cos4f );
+}
+
+static void XSTDCALL
+ex24f(
+   float *store )
+{
+#ifdef WIN32
+   store[0] = (float) pow( 2.0, (double) store[0] );
+   store[1] = (float) pow( 2.0, (double) store[1] );
+   store[2] = (float) pow( 2.0, (double) store[2] );
+   store[3] = (float) pow( 2.0, (double) store[3] );
+#else
+   const unsigned X = TEMP_R0 * 16;
+   store[X + 0] = powf( 2.0f, store[X + 0] );
+   store[X + 1] = powf( 2.0f, store[X + 1] );
+   store[X + 2] = powf( 2.0f, store[X + 2] );
+   store[X + 3] = powf( 2.0f, store[X + 3] );
+#endif
+}
+
+static void
+emit_ex2(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      ex24f );
+}
+
+static void
+emit_f2it(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   emit_cvttps2dq(
+      func,
+      make_xmm( xmm ),
+      make_xmm( xmm ) );
+}
+
+static void XSTDCALL
+flr4f(
+   float *store )
+{
+#ifdef WIN32
+   const unsigned X = 0;
+#else
+   const unsigned X = TEMP_R0 * 16;
+#endif
+   store[X + 0] = (float) floor( (double) store[X + 0] );
+   store[X + 1] = (float) floor( (double) store[X + 1] );
+   store[X + 2] = (float) floor( (double) store[X + 2] );
+   store[X + 3] = (float) floor( (double) store[X + 3] );
+}
+
+static void
+emit_flr(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      flr4f );
+}
+
+static void XSTDCALL
+frc4f(
+   float *store )
+{
+#ifdef WIN32
+   const unsigned X = 0;
+#else
+   const unsigned X = TEMP_R0 * 16;
+#endif
+   store[X + 0] -= (float) floor( (double) store[X + 0] );
+   store[X + 1] -= (float) floor( (double) store[X + 1] );
+   store[X + 2] -= (float) floor( (double) store[X + 2] );
+   store[X + 3] -= (float) floor( (double) store[X + 3] );
+}
+
+static void
+emit_frc(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      frc4f );
+}
+
+static void XSTDCALL
+lg24f(
+   float *store )
+{
+#ifdef WIN32
+   const unsigned X = 0;
+#else
+   const unsigned X = TEMP_R0 * 16;
+#endif
+   store[X + 0] = LOG2( store[X + 0] );
+   store[X + 1] = LOG2( store[X + 1] );
+   store[X + 2] = LOG2( store[X + 2] );
+   store[X + 3] = LOG2( store[X + 3] );
+}
+
+static void
+emit_lg2(
+   struct x86_function *func,
+   unsigned xmm_dst )
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      lg24f );
+}
+
+static void
+emit_MOV(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_movups(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_mul (struct x86_function *func,
+          unsigned xmm_dst,
+          unsigned xmm_src)
+{
+   emit_mulps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_neg(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   emit_xorps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void XSTDCALL
+pow4f(
+   float *store )
+{
+#ifdef WIN32
+   store[0] = (float) pow( (double) store[0], (double) store[4] );
+   store[1] = (float) pow( (double) store[1], (double) store[5] );
+   store[2] = (float) pow( (double) store[2], (double) store[6] );
+   store[3] = (float) pow( (double) store[3], (double) store[7] );
+#else
+   const unsigned X = TEMP_R0 * 16;
+   store[X + 0] = powf( store[X + 0], store[X + 4] );
+   store[X + 1] = powf( store[X + 1], store[X + 5] );
+   store[X + 2] = powf( store[X + 2], store[X + 6] );
+   store[X + 3] = powf( store[X + 3], store[X + 7] );
+#endif
+}
+
+static void
+emit_pow(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_func_call_dst_src(
+      func,
+      xmm_dst,
+      xmm_src,
+      pow4f );
+}
+
+static void
+emit_rcp (
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_rcpps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_rsqrt(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_rsqrtps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+static void
+emit_setsign(
+   struct x86_function *func,
+   unsigned xmm )
+{
+   emit_orps(
+      func,
+      make_xmm( xmm ),
+      get_temp(
+         TGSI_EXEC_TEMP_80000000_I,
+         TGSI_EXEC_TEMP_80000000_C ) );
+}
+
+static void XSTDCALL
+sin4f(
+   float *store )
+{
+#ifdef WIN32
+   store[0] = (float) sin( (double) store[0] );
+   store[1] = (float) sin( (double) store[1] );
+   store[2] = (float) sin( (double) store[2] );
+   store[3] = (float) sin( (double) store[3] );
+#else
+   const unsigned X = TEMP_R0 * 16;
+   store[X + 0] = sinf( store[X + 0] );
+   store[X + 1] = sinf( store[X + 1] );
+   store[X + 2] = sinf( store[X + 2] );
+   store[X + 3] = sinf( store[X + 3] );
+#endif
+}
+
+static void
+emit_sin (struct x86_function *func,
+          unsigned xmm_dst)
+{
+   emit_func_call_dst(
+      func,
+      xmm_dst,
+      sin4f );
+}
+
+static void
+emit_sub(
+   struct x86_function *func,
+   unsigned xmm_dst,
+   unsigned xmm_src )
+{
+   emit_subps(
+      func,
+      make_xmm( xmm_dst ),
+      make_xmm( xmm_src ) );
+}
+
+/**
+ * Register fetch.
+ */
+
+static void
+emit_fetch(
+   struct x86_function *func,
+   unsigned xmm,
+   const struct tgsi_full_src_register *reg,
+   const unsigned chan_index )
+{
+   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( reg, chan_index );
+
+   switch( swizzle ) {
+   case TGSI_EXTSWIZZLE_X:
+   case TGSI_EXTSWIZZLE_Y:
+   case TGSI_EXTSWIZZLE_Z:
+   case TGSI_EXTSWIZZLE_W:
+      switch( reg->SrcRegister.File ) {
+      case TGSI_FILE_CONSTANT:
+         emit_const(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
+      case TGSI_FILE_INPUT:
+         emit_inputf(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
+      case TGSI_FILE_TEMPORARY:
+         emit_tempf(
+            func,
+            xmm,
+            reg->SrcRegister.Index,
+            swizzle );
+         break;
+
+      default:
+         assert( 0 );
+      }
+      break;
+
+   case TGSI_EXTSWIZZLE_ZERO:
+      emit_tempf(
+         func,
+         xmm,
+         TGSI_EXEC_TEMP_00000000_I,
+         TGSI_EXEC_TEMP_00000000_C );
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      emit_tempf(
+         func,
+         xmm,
+         TGSI_EXEC_TEMP_ONE_I,
+         TGSI_EXEC_TEMP_ONE_C );
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
+   case TGSI_UTIL_SIGN_CLEAR:
+      emit_abs( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      emit_setsign( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      emit_neg( func, xmm );
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+   }
+}
+
+#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
+   emit_fetch( FUNC, XMM, &(INST).FullSrcRegisters[INDEX], CHAN )
+
+/**
+ * Register store.
+ */
+
+static void
+emit_store(
+   struct x86_function *func,
+   unsigned xmm,
+   const struct tgsi_full_dst_register *reg,
+   const struct tgsi_full_instruction *inst,
+   unsigned chan_index )
+{
+   switch( reg->DstRegister.File ) {
+   case TGSI_FILE_OUTPUT:
+      emit_output(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+
+   case TGSI_FILE_TEMPORARY:
+      emit_temps(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+
+   case TGSI_FILE_ADDRESS:
+      emit_addrs(
+         func,
+         xmm,
+         reg->DstRegister.Index,
+         chan_index );
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+
+   case TGSI_SAT_ZERO_ONE:
+//      assert( 0 );
+      break;
+
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      assert( 0 );
+      break;
+   }
+}
+
+#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
+   emit_store( FUNC, XMM, &(INST).FullDstRegisters[INDEX], &(INST), CHAN )
+
+/**
+ * High-level instruction translators.
+ */
+
+static void
+emit_kil(
+   struct x86_function *func,
+   const struct tgsi_full_src_register *reg )
+{
+   unsigned uniquemask;
+   unsigned registers[4];
+   unsigned nextregister = 0;
+   unsigned firstchan = ~0;
+   unsigned chan_index;
+
+   /* This mask stores component bits that were already tested. Note that
+    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
+    * tested. */
+   uniquemask = (1 << TGSI_EXTSWIZZLE_ZERO) | (1 << TGSI_EXTSWIZZLE_ONE);
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      unsigned swizzle;
+
+      /* unswizzle channel */
+      swizzle = tgsi_util_get_full_src_register_extswizzle(
+         reg,
+         chan_index );
+
+      /* check if the component has not been already tested */
+      if( !(uniquemask & (1 << swizzle)) ) {
+         uniquemask |= 1 << swizzle;
+
+         /* allocate register */
+         registers[chan_index] = nextregister;
+         emit_fetch(
+            func,
+            nextregister,
+            reg,
+            chan_index );
+         nextregister++;
+
+         /* mark the first channel used */
+         if( firstchan == ~0 ) {
+            firstchan = chan_index;
+         }
+      }
+   }
+
+   emit_push(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+   emit_push(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+
+   FOR_EACH_CHANNEL( chan_index ) {
+      if( uniquemask & (1 << chan_index) ) {
+         emit_cmpps(
+            func,
+            make_xmm( registers[chan_index] ),
+            get_temp(
+               TGSI_EXEC_TEMP_00000000_I,
+               TGSI_EXEC_TEMP_00000000_C ),
+            cc_LessThan );
+
+         if( chan_index == firstchan ) {
+            emit_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               make_xmm( registers[chan_index] ) );
+         }
+         else {
+            emit_pmovmskb(
+               func,
+               x86_make_reg( file_REG32, reg_DX ),
+               make_xmm( registers[chan_index] ) );
+            emit_or(
+               func,
+               x86_make_reg( file_REG32, reg_AX ),
+               x86_make_reg( file_REG32, reg_DX ) );
+         }
+      }
+   }
+
+   emit_or(
+      func,
+      get_temp(
+         TGSI_EXEC_TEMP_KILMASK_I,
+         TGSI_EXEC_TEMP_KILMASK_C ),
+      x86_make_reg( file_REG32, reg_AX ) );
+
+   emit_pop(
+      func,
+      x86_make_reg( file_REG32, reg_DX ) );
+   emit_pop(
+      func,
+      x86_make_reg( file_REG32, reg_AX ) );
+}
+
+static void
+emit_setcc(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst,
+   enum sse_cc cc )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );
+      FETCH( func, *inst, 1, 1, chan_index );
+      emit_cmpps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ),
+         cc );
+      emit_andps(
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C ) );
+      STORE( func, *inst, 0, 0, chan_index );
+   }
+}
+
+static void
+emit_cmp(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst )
+{
+   unsigned chan_index;
+
+   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+      FETCH( func, *inst, 0, 0, chan_index );
+      FETCH( func, *inst, 1, 1, chan_index );
+      FETCH( func, *inst, 2, 2, chan_index );
+      emit_cmpps(
+         func,
+         make_xmm( 0 ),
+         get_temp(
+            TGSI_EXEC_TEMP_00000000_I,
+            TGSI_EXEC_TEMP_00000000_C ),
+         cc_LessThan );
+      emit_andps(
+         func,
+         make_xmm( 1 ),
+         make_xmm( 0 ) );
+      emit_andnps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 2 ) );
+      emit_orps(
+         func,
+         make_xmm( 0 ),
+         make_xmm( 1 ) );
+      STORE( func, *inst, 0, 0, chan_index );
+   }
+}
+
+static int
+emit_instruction(
+   struct x86_function *func,
+   struct tgsi_full_instruction *inst )
+{
+   unsigned chan_index;
+
+   switch( inst->Instruction.Opcode ) {
+   case TGSI_OPCODE_ARL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_f2it( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MOV:
+   /* TGSI_OPCODE_SWZ */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LIT:
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+         emit_tempf(
+            func,
+            0,
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C);
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
+            STORE( func, *inst, 0, 0, CHAN_X );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
+            STORE( func, *inst, 0, 0, CHAN_W );
+         }
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            emit_maxps(
+               func,
+               make_xmm( 0 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            STORE( func, *inst, 0, 0, CHAN_Y );
+         }
+         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+            FETCH( func, *inst, 1, 0, CHAN_Y );
+            emit_maxps(
+               func,
+               make_xmm( 1 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_00000000_I,
+                  TGSI_EXEC_TEMP_00000000_C ) );
+            FETCH( func, *inst, 2, 0, CHAN_W );
+            emit_minps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_128_I,
+                  TGSI_EXEC_TEMP_128_C ) );
+            emit_maxps(
+               func,
+               make_xmm( 2 ),
+               get_temp(
+                  TGSI_EXEC_TEMP_MINUS_128_I,
+                  TGSI_EXEC_TEMP_MINUS_128_C ) );
+            emit_pow( func, 1, 2 );
+            FETCH( func, *inst, 0, 0, CHAN_X );
+            emit_xorps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 2 ) );
+            emit_cmpps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 0 ),
+               cc_LessThanEqual );
+            emit_andps(
+               func,
+               make_xmm( 2 ),
+               make_xmm( 1 ) );
+            STORE( func, *inst, 2, 0, CHAN_Z );
+         }
+      }
+      break;
+
+   case TGSI_OPCODE_RCP:
+   /* TGSI_OPCODE_RECIP */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rcp( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RSQ:
+   /* TGSI_OPCODE_RECIPSQRT */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_rsqrt( func, 0, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_EXP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_LOG:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MUL:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ADD:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_add( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP3:
+   /* TGSI_OPCODE_DOT3 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DP4:
+   /* TGSI_OPCODE_DOT4 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul(func, 1, 2 );
+      emit_add(func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_W );
+      FETCH( func, *inst, 2, 1, CHAN_W );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DST:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_tempf(
+            func,
+            0,
+            TGSI_EXEC_TEMP_ONE_I,
+            TGSI_EXEC_TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 1, 1, CHAN_Y );
+         emit_mul( func, 0, 1 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         FETCH( func, *inst, 0, 0, CHAN_Z );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+         FETCH( func, *inst, 0, 1, CHAN_W );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MIN:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_minps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_MAX:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_maxps(
+            func,
+            make_xmm( 0 ),
+            make_xmm( 1 ) );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLT:
+   /* TGSI_OPCODE_SETLT */
+      emit_setcc( func, inst, cc_LessThan );
+      break;
+
+   case TGSI_OPCODE_SGE:
+   /* TGSI_OPCODE_SETGE */
+      emit_setcc( func, inst, cc_NotLessThan );
+      break;
+
+   case TGSI_OPCODE_MAD:
+   /* TGSI_OPCODE_MADD */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SUB:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         emit_sub( func, 0, 1 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LERP:
+   /* TGSI_OPCODE_LRP */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         FETCH( func, *inst, 1, 1, chan_index );
+         FETCH( func, *inst, 2, 2, chan_index );
+         emit_sub( func, 1, 2 );
+         emit_mul( func, 0, 1 );
+         emit_add( func, 0, 2 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CND0:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DOT2ADD:
+   /* TGSI_OPCODE_DP2A */
+      return 0;
+      break;
+
+   case TGSI_OPCODE_INDEX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NEGATE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FRAC:
+   /* TGSI_OPCODE_FRC */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_frc( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CLAMP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_FLOOR:
+   /* TGSI_OPCODE_FLR */
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_flr( func, 0 );
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_ROUND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EXPBASE2:
+   /* TGSI_OPCODE_EX2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_ex2( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_LOGBASE2:
+   /* TGSI_OPCODE_LG2 */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_lg2( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_POWER:
+   /* TGSI_OPCODE_POW */
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_pow( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+   /* TGSI_OPCODE_XPD */
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
+         FETCH( func, *inst, 1, 1, CHAN_Z );
+         FETCH( func, *inst, 3, 0, CHAN_Z );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         FETCH( func, *inst, 4, 1, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         emit_MOV( func, 2, 0 );
+         emit_mul( func, 2, 1 );
+         emit_MOV( func, 5, 3 );
+         emit_mul( func, 5, 4 );
+         emit_sub( func, 2, 5 );
+         STORE( func, *inst, 2, 0, CHAN_X );
+      }
+      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
+          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
+         FETCH( func, *inst, 2, 1, CHAN_X );
+         FETCH( func, *inst, 5, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         emit_mul( func, 3, 2 );
+         emit_mul( func, 1, 5 );
+         emit_sub( func, 3, 1 );
+         STORE( func, *inst, 3, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         emit_mul( func, 5, 4 );
+         emit_mul( func, 0, 2 );
+         emit_sub( func, 5, 0 );
+         STORE( func, *inst, 5, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+         FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_MULTIPLYMATRIX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ABS:
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         FETCH( func, *inst, 0, 0, chan_index );
+         emit_abs( func, 0) ;
+
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_RCC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DPH:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      FETCH( func, *inst, 1, 1, CHAN_X );
+      emit_mul( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Y );
+      FETCH( func, *inst, 2, 1, CHAN_Y );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 0, CHAN_Z );
+      FETCH( func, *inst, 2, 1, CHAN_Z );
+      emit_mul( func, 1, 2 );
+      emit_add( func, 0, 1 );
+      FETCH( func, *inst, 1, 1, CHAN_W );
+      emit_add( func, 0, 1 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_COS:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_cos( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_DDX:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DDY:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_KIL:
+      emit_kil( func, &inst->FullSrcRegisters[0] );
+      break;
+
+   case TGSI_OPCODE_PK2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PK4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SEQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SFL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SGT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SIN:
+      FETCH( func, *inst, 0, 0, CHAN_X );
+      emit_sin( func, 0 );
+      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+         STORE( func, *inst, 0, 0, chan_index );
+      }
+      break;
+
+   case TGSI_OPCODE_SLE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SNE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_STR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TEX:
+      if (0) {
+	 /* Disable dummy texture code: 
+	  */
+	 emit_tempf(
+	    func,
+	    0,
+	    TGSI_EXEC_TEMP_ONE_I,
+	    TGSI_EXEC_TEMP_ONE_C );
+	 FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
+	    STORE( func, *inst, 0, 0, chan_index );
+	 }
+      }
+      else {
+	 return 0;
+      }
+      break;
+
+   case TGSI_OPCODE_TXD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2H:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP2US:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4B:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_UP4UB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_X2D:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ARR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CAL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_RET:
+   case TGSI_OPCODE_END:
+#ifdef WIN32
+      emit_retw( func, 16 );
+#else
+      emit_ret( func );
+#endif
+      break;
+
+   case TGSI_OPCODE_SSG:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CMP:
+      emit_cmp (func, inst);
+      break;
+
+   case TGSI_OPCODE_SCS:
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
+         FETCH( func, *inst, 0, 0, CHAN_X );
+         emit_cos( func, 0 );
+         STORE( func, *inst, 0, 0, CHAN_X );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
+         FETCH( func, *inst, 0, 0, CHAN_Y );
+         emit_sin( func, 0 );
+         STORE( func, *inst, 0, 0, CHAN_Y );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
+         FETCH( func, *inst, 0, TGSI_EXEC_TEMP_00000000_I, TGSI_EXEC_TEMP_00000000_C );
+         STORE( func, *inst, 0, 0, CHAN_Z );
+      }
+      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
+         FETCH( func, *inst, 0, TGSI_EXEC_TEMP_ONE_I, TGSI_EXEC_TEMP_ONE_C );
+         STORE( func, *inst, 0, 0, CHAN_W );
+      }
+      break;
+
+   case TGSI_OPCODE_TXB:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NRM:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DIV:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_DP2:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_BRK:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_IF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_LOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_REP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ELSE:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDIF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDLOOP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDREP:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_PUSHA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_POPA:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CEIL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_I2F:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_NOT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TRUNC:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHL:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SHR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_AND:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_OR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_MOD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_XOR:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_SAD:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXF:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_TXQ:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_CONT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_EMIT:
+      return 0;
+      break;
+
+   case TGSI_OPCODE_ENDPRIM:
+      return 0;
+      break;
+
+   default:
+      return 0;
+   }
+   
+   return 1;
+}
+
+static void
+emit_declaration(
+   struct x86_function *func,
+   struct tgsi_full_declaration *decl )
+{
+   if( decl->Declaration.File == TGSI_FILE_INPUT ) {
+      unsigned first, last, mask;
+      unsigned i, j;
+
+      assert( decl->Declaration.Declare == TGSI_DECLARE_RANGE );
+
+      first = decl->u.DeclarationRange.First;
+      last = decl->u.DeclarationRange.Last;
+      mask = decl->Declaration.UsageMask;
+
+      for( i = first; i <= last; i++ ) {
+         for( j = 0; j < NUM_CHANNELS; j++ ) {
+            if( mask & (1 << j) ) {
+               switch( decl->Interpolation.Interpolate ) {
+               case TGSI_INTERPOLATE_CONSTANT:
+                  emit_coef_a0( func, 0, i, j );
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_LINEAR:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_coef_a0( func, 4, i, j );
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 4 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               case TGSI_INTERPOLATE_PERSPECTIVE:
+                  emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
+                  emit_coef_dadx( func, 1, i, j );
+                  emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
+                  emit_coef_dady( func, 3, i, j );
+                  emit_mul( func, 0, 1 );    /* x * dadx */
+                  emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
+                  emit_coef_a0( func, 5, i, j );
+                  emit_rcp( func, 4, 4 );    /* 1.0 / w */
+                  emit_mul( func, 2, 3 );    /* y * dady */
+                  emit_add( func, 0, 5 );    /* x * dadx + a0 */
+                  emit_add( func, 0, 2 );    /* x * dadx + y * dady + a0 */
+                  emit_mul( func, 0, 4 );    /* (x * dadx + y * dady + a0) / w */
+                  emit_inputs( func, 0, i, j );
+                  break;
+
+               default:
+                  assert( 0 );
+		  break;
+               }
+            }
+         }
+      }
+   }
+}
+
+unsigned
+tgsi_emit_sse2(
+   struct tgsi_token *tokens,
+   struct x86_function *func )
+{
+   struct tgsi_parse_context parse;
+   unsigned ok = 1;
+
+   DUMP_START();
+
+   func->csr = func->store;
+
+   emit_mov(
+      func,
+      get_input_base(),
+      get_argument( 0 ) );
+   emit_mov(
+      func,
+      get_output_base(),
+      get_argument( 1 ) );
+   emit_mov(
+      func,
+      get_const_base(),
+      get_argument( 2 ) );
+   emit_mov(
+      func,
+      get_temp_base(),
+      get_argument( 3 ) );
+
+   tgsi_parse_init( &parse, tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         ok = emit_instruction(
+	    func,
+	    &parse.FullToken.FullInstruction );
+
+	 if (!ok) {
+	    debug_printf("failed to translate tgsi opcode %d\n", 
+			 parse.FullToken.FullInstruction.Instruction.Opcode );
+	 }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* XXX implement this */
+	 ok = 0;
+	 debug_printf("failed to emit immediate value\n");
+	 break;
+
+      default:
+         assert( 0 );
+	 ok = 0;
+	 break;
+      }
+   }
+
+   tgsi_parse_free( &parse );
+
+   DUMP_END();
+
+   return ok;
+}
+
+/**
+ * Fragment shaders are responsible for interpolating shader inputs. Because on
+ * x86 we have only 4 GP registers, and here we have 5 shader arguments (input,
+ * output, const, temp and coef), the code is split into two phases --
+ * DECLARATION and INSTRUCTION phase.
+ * GP register holding the output argument is aliased with the coeff argument,
+ * as outputs are not needed in the DECLARATION phase.
+ */
+unsigned
+tgsi_emit_sse2_fs(
+   struct tgsi_token *tokens,
+   struct x86_function *func )
+{
+   struct tgsi_parse_context parse;
+   boolean instruction_phase = FALSE;
+   unsigned ok = 1;
+
+   DUMP_START();
+
+   func->csr = func->store;
+
+   /* DECLARATION phase, do not load output argument. */
+   emit_mov(
+      func,
+      get_input_base(),
+      get_argument( 0 ) );
+   emit_mov(
+      func,
+      get_const_base(),
+      get_argument( 2 ) );
+   emit_mov(
+      func,
+      get_temp_base(),
+      get_argument( 3 ) );
+   emit_mov(
+      func,
+      get_coef_base(),
+      get_argument( 4 ) );
+
+   tgsi_parse_init( &parse, tokens );
+
+   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         emit_declaration(
+            func,
+            &parse.FullToken.FullDeclaration );
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         if( !instruction_phase ) {
+            /* INSTRUCTION phase, overwrite coeff with output. */
+            instruction_phase = TRUE;
+            emit_mov(
+               func,
+               get_output_base(),
+               get_argument( 1 ) );
+         }
+         ok = emit_instruction(
+            func,
+            &parse.FullToken.FullInstruction );
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         /* XXX implement this */
+	 ok = 0;
+         break;
+
+      default:
+	 ok = 0;
+         assert( 0 );
+      }
+   }
+
+   tgsi_parse_free( &parse );
+
+   DUMP_END();
+
+   return ok;
+}
+
+#endif /* i386 */
diff --git a/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
new file mode 100755
index 00000000000..9bee3717665
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/exec/tgsi_sse2.h
@@ -0,0 +1,26 @@
+#if !defined TGSI_SSE2_H
+#define TGSI_SSE2_H
+
+#if defined __cplusplus
+extern "C" {
+#endif // defined __cplusplus
+
+struct tgsi_token;
+struct x86_function;
+
+unsigned
+tgsi_emit_sse2(
+   struct tgsi_token *tokens,
+   struct x86_function *function );
+
+unsigned
+tgsi_emit_sse2_fs(
+   struct tgsi_token *tokens,
+   struct x86_function *function );
+
+#if defined __cplusplus
+} // extern "C"
+#endif // defined __cplusplus
+
+#endif // !defined TGSI_SSE2_H
+
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_build.c b/src/gallium/auxiliary/tgsi/util/tgsi_build.c
new file mode 100644
index 00000000000..a00ff1c2a5f
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_build.c
@@ -0,0 +1,1371 @@
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi_build.h"
+#include "tgsi_parse.h"
+
+/*
+ * version
+ */
+
+struct tgsi_version
+tgsi_build_version( void )
+{
+   struct tgsi_version  version;
+
+   version.MajorVersion = 1;
+   version.MinorVersion = 1;
+   version.Padding = 0;
+
+   return version;
+}
+
+/*
+ * header
+ */
+
+struct tgsi_header
+tgsi_build_header( void )
+{
+   struct tgsi_header header;
+
+   header.HeaderSize = 1;
+   header.BodySize = 0;
+
+   return header;
+}
+
+static void
+header_headersize_grow( struct tgsi_header *header )
+{
+   assert( header->HeaderSize < 0xFF );
+   assert( header->BodySize == 0 );
+
+   header->HeaderSize++;
+}
+
+static void
+header_bodysize_grow( struct tgsi_header *header )
+{
+   assert( header->BodySize < 0xFFFFFF );
+
+   header->BodySize++;
+}
+
+struct tgsi_processor
+tgsi_default_processor( void )
+{
+   struct tgsi_processor processor;
+
+   processor.Processor = TGSI_PROCESSOR_FRAGMENT;
+   processor.Padding = 0;
+
+   return processor;
+}
+
+struct tgsi_processor
+tgsi_build_processor(
+   unsigned type,
+   struct tgsi_header *header )
+{
+   struct tgsi_processor processor;
+
+   processor = tgsi_default_processor();
+   processor.Processor = type;
+
+   header_headersize_grow( header );
+
+   return processor;
+}
+
+/*
+ * declaration
+ */
+
+struct tgsi_declaration
+tgsi_default_declaration( void )
+{
+   struct tgsi_declaration declaration;
+
+   declaration.Type = TGSI_TOKEN_TYPE_DECLARATION;
+   declaration.Size = 1;
+   declaration.File = TGSI_FILE_NULL;
+   declaration.Declare = TGSI_DECLARE_RANGE;
+   declaration.UsageMask = TGSI_WRITEMASK_XYZW;
+   declaration.Interpolate = 0;
+   declaration.Semantic = 0;
+   declaration.Padding = 0;
+   declaration.Extended = 0;
+
+   return declaration;
+}
+
+struct tgsi_declaration
+tgsi_build_declaration(
+   unsigned file,
+   unsigned declare,
+   unsigned usage_mask,
+   unsigned interpolate,
+   unsigned semantic,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration declaration;
+
+   assert( file <= TGSI_FILE_IMMEDIATE );
+   assert( declare <= TGSI_DECLARE_MASK );
+
+   declaration = tgsi_default_declaration();
+   declaration.File = file;
+   declaration.Declare = declare;
+   declaration.UsageMask = usage_mask;
+   declaration.Interpolate = interpolate;
+   declaration.Semantic = semantic;
+
+   header_bodysize_grow( header );
+
+   return declaration;
+}
+
+static void
+declaration_grow(
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   assert( declaration->Size < 0xFF );
+
+   declaration->Size++;
+
+   header_bodysize_grow( header );
+}
+
+struct tgsi_full_declaration
+tgsi_default_full_declaration( void )
+{
+   struct tgsi_full_declaration  full_declaration;
+
+   full_declaration.Declaration  = tgsi_default_declaration();
+   full_declaration.Interpolation = tgsi_default_declaration_interpolation();
+   full_declaration.Semantic = tgsi_default_declaration_semantic();
+
+   return full_declaration;
+}
+
+unsigned
+tgsi_build_full_declaration(
+   const struct tgsi_full_declaration *full_decl,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize )
+{
+   unsigned size = 0;
+   struct tgsi_declaration *declaration;
+
+   if( maxsize <= size )
+     return 0;
+   declaration = (struct tgsi_declaration *) &tokens[size];
+   size++;
+
+   *declaration = tgsi_build_declaration(
+      full_decl->Declaration.File,
+      full_decl->Declaration.Declare,
+      full_decl->Declaration.UsageMask,
+      full_decl->Declaration.Interpolate,
+      full_decl->Declaration.Semantic,
+      header );
+
+   switch( full_decl->Declaration.Declare )  {
+   case TGSI_DECLARE_RANGE:
+   {
+      struct tgsi_declaration_range *dr;
+
+      if( maxsize <= size )
+         return 0;
+      dr = (struct tgsi_declaration_range *) &tokens[size];
+      size++;
+
+      *dr = tgsi_build_declaration_range(
+         full_decl->u.DeclarationRange.First,
+         full_decl->u.DeclarationRange.Last,
+         declaration,
+         header );
+      break;
+    }
+
+   case TGSI_DECLARE_MASK:
+   {
+      struct tgsi_declaration_mask *dm;
+
+      if( maxsize <= size )
+         return 0;
+      dm = (struct tgsi_declaration_mask  *) &tokens[size];
+      size++;
+
+      *dm = tgsi_build_declaration_mask(
+         full_decl->u.DeclarationMask.Mask,
+         declaration,
+         header );
+      break;
+   }
+
+   default:
+      assert( 0 );
+   }
+
+   if( full_decl->Declaration.Interpolate ) {
+      struct tgsi_declaration_interpolation *di;
+
+      if( maxsize <= size )
+         return  0;
+      di = (struct tgsi_declaration_interpolation *) &tokens[size];
+      size++;
+
+      *di = tgsi_build_declaration_interpolation(
+         full_decl->Interpolation.Interpolate,
+         declaration,
+         header );
+   }
+
+   if( full_decl->Declaration.Semantic ) {
+      struct tgsi_declaration_semantic *ds;
+
+      if( maxsize <= size )
+         return  0;
+      ds = (struct tgsi_declaration_semantic *) &tokens[size];
+      size++;
+
+      *ds = tgsi_build_declaration_semantic(
+         full_decl->Semantic.SemanticName,
+         full_decl->Semantic.SemanticIndex,
+         declaration,
+         header );
+   }
+
+   return size;
+}
+
+struct tgsi_declaration_range
+tgsi_build_declaration_range(
+   unsigned first,
+   unsigned last,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration_range declaration_range;
+
+   assert( last >= first );
+   assert( last <= 0xFFFF );
+
+   declaration_range.First = first;
+   declaration_range.Last = last;
+
+   declaration_grow( declaration, header );
+
+   return declaration_range;
+}
+
+struct tgsi_declaration_mask
+tgsi_build_declaration_mask(
+   unsigned mask,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration_mask  declaration_mask;
+
+   declaration_mask.Mask = mask;
+
+   declaration_grow( declaration, header );
+
+   return declaration_mask;
+}
+
+struct tgsi_declaration_interpolation
+tgsi_default_declaration_interpolation( void )
+{
+   struct tgsi_declaration_interpolation di;
+
+   di.Interpolate = TGSI_INTERPOLATE_CONSTANT;
+   di.Padding = 0;
+
+   return di;
+}
+
+struct tgsi_declaration_interpolation
+tgsi_build_declaration_interpolation(
+   unsigned interpolate,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration_interpolation di;
+
+   assert( interpolate <= TGSI_INTERPOLATE_PERSPECTIVE );
+
+   di = tgsi_default_declaration_interpolation();
+   di.Interpolate = interpolate;
+
+   declaration_grow( declaration, header );
+
+   return di;
+}
+
+struct tgsi_declaration_semantic
+tgsi_default_declaration_semantic( void )
+{
+   struct tgsi_declaration_semantic ds;
+
+   ds.SemanticName = TGSI_SEMANTIC_POSITION;
+   ds.SemanticIndex = 0;
+   ds.Padding = 0;
+
+   return ds;
+}
+
+struct tgsi_declaration_semantic
+tgsi_build_declaration_semantic(
+   unsigned semantic_name,
+   unsigned semantic_index,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header )
+{
+   struct tgsi_declaration_semantic ds;
+
+   assert( semantic_name <= TGSI_SEMANTIC_COUNT );
+   assert( semantic_index <= 0xFFFF );
+
+   ds = tgsi_default_declaration_semantic();
+   ds.SemanticName = semantic_name;
+   ds.SemanticIndex = semantic_index;
+
+   declaration_grow( declaration, header );
+
+   return ds;
+}
+
+/*
+ * immediate
+ */
+
+struct tgsi_immediate
+tgsi_default_immediate( void )
+{
+   struct tgsi_immediate immediate;
+
+   immediate.Type = TGSI_TOKEN_TYPE_IMMEDIATE;
+   immediate.Size = 1;
+   immediate.DataType = TGSI_IMM_FLOAT32;
+   immediate.Padding = 0;
+   immediate.Extended = 0;
+
+   return immediate;
+}
+
+struct tgsi_immediate
+tgsi_build_immediate(
+   struct tgsi_header *header )
+{
+   struct tgsi_immediate immediate;
+
+   immediate = tgsi_default_immediate();
+
+   header_bodysize_grow( header );
+
+   return immediate;
+}
+
+struct tgsi_full_immediate
+tgsi_default_full_immediate( void )
+{
+   struct tgsi_full_immediate fullimm;
+
+   fullimm.Immediate = tgsi_default_immediate();
+   fullimm.u.Pointer = (void *) 0;
+
+   return fullimm;
+}
+
+static void
+immediate_grow(
+   struct tgsi_immediate *immediate,
+   struct tgsi_header *header )
+{
+   assert( immediate->Size < 0xFF );
+
+   immediate->Size++;
+
+   header_bodysize_grow( header );
+}
+
+struct tgsi_immediate_float32
+tgsi_build_immediate_float32(
+   float value,
+   struct tgsi_immediate *immediate,
+   struct tgsi_header *header )
+{
+   struct tgsi_immediate_float32 immediate_float32;
+
+   immediate_float32.Float = value;
+
+   immediate_grow( immediate, header );
+
+   return immediate_float32;
+}
+
+unsigned
+tgsi_build_full_immediate(
+   const struct tgsi_full_immediate *full_imm,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize )
+{
+   unsigned size = 0, i;
+   struct tgsi_immediate *immediate;
+
+   if( maxsize <= size )
+      return 0;
+   immediate = (struct tgsi_immediate *) &tokens[size];
+   size++;
+
+   *immediate = tgsi_build_immediate( header );
+
+   for( i = 0; i < full_imm->Immediate.Size - 1; i++ ) {
+      struct tgsi_immediate_float32 *if32;
+
+      if( maxsize <= size )
+         return  0;
+      if32 = (struct tgsi_immediate_float32 *) &tokens[size];
+      size++;
+
+      *if32 = tgsi_build_immediate_float32(
+         full_imm->u.ImmediateFloat32[i].Float,
+         immediate,
+         header );
+   }
+
+   return size;
+}
+
+/*
+ * instruction
+ */
+
+struct tgsi_instruction
+tgsi_default_instruction( void )
+{
+   struct tgsi_instruction instruction;
+
+   instruction.Type = TGSI_TOKEN_TYPE_INSTRUCTION;
+   instruction.Size = 1;
+   instruction.Opcode = TGSI_OPCODE_MOV;
+   instruction.Saturate = TGSI_SAT_NONE;
+   instruction.NumDstRegs = 1;
+   instruction.NumSrcRegs = 1;
+   instruction.Padding  = 0;
+   instruction.Extended = 0;
+
+   return instruction;
+}
+
+struct tgsi_instruction
+tgsi_build_instruction(
+   unsigned opcode,
+   unsigned saturate,
+   unsigned num_dst_regs,
+   unsigned num_src_regs,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction instruction;
+
+   assert (opcode <= TGSI_OPCODE_LAST);
+   assert (saturate <= TGSI_SAT_MINUS_PLUS_ONE);
+   assert (num_dst_regs <= 3);
+   assert (num_src_regs <= 15);
+
+   instruction = tgsi_default_instruction();
+   instruction.Opcode = opcode;
+   instruction.Saturate = saturate;
+   instruction.NumDstRegs = num_dst_regs;
+   instruction.NumSrcRegs = num_src_regs;
+
+   header_bodysize_grow( header );
+
+   return instruction;
+}
+
+static void
+instruction_grow(
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   assert (instruction->Size <   0xFF);
+
+   instruction->Size++;
+
+   header_bodysize_grow( header );
+}
+
+struct tgsi_full_instruction
+tgsi_default_full_instruction( void )
+{
+   struct tgsi_full_instruction full_instruction;
+   unsigned i;
+
+   full_instruction.Instruction = tgsi_default_instruction();
+   full_instruction.InstructionExtNv = tgsi_default_instruction_ext_nv();
+   full_instruction.InstructionExtLabel = tgsi_default_instruction_ext_label();
+   full_instruction.InstructionExtTexture = tgsi_default_instruction_ext_texture();
+   for( i = 0;  i < TGSI_FULL_MAX_DST_REGISTERS; i++ ) {
+      full_instruction.FullDstRegisters[i] = tgsi_default_full_dst_register();
+   }
+   for( i = 0;  i < TGSI_FULL_MAX_SRC_REGISTERS; i++ ) {
+      full_instruction.FullSrcRegisters[i] = tgsi_default_full_src_register();
+   }
+
+   return full_instruction;
+}
+
+unsigned
+tgsi_build_full_instruction(
+   const struct tgsi_full_instruction *full_inst,
+   struct  tgsi_token *tokens,
+   struct  tgsi_header *header,
+   unsigned  maxsize )
+{
+   unsigned size = 0;
+   unsigned i;
+   struct tgsi_instruction *instruction;
+   struct tgsi_token *prev_token;
+
+   if( maxsize <= size )
+      return 0;
+   instruction = (struct tgsi_instruction *) &tokens[size];
+   size++;
+
+   *instruction = tgsi_build_instruction(
+      full_inst->Instruction.Opcode,
+      full_inst->Instruction.Saturate,
+      full_inst->Instruction.NumDstRegs,
+      full_inst->Instruction.NumSrcRegs,
+      header );
+   prev_token = (struct tgsi_token  *) instruction;
+
+   if( tgsi_compare_instruction_ext_nv(
+         full_inst->InstructionExtNv,
+         tgsi_default_instruction_ext_nv() ) ) {
+      struct tgsi_instruction_ext_nv *instruction_ext_nv;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_ext_nv =
+         (struct  tgsi_instruction_ext_nv *) &tokens[size];
+      size++;
+
+      *instruction_ext_nv  = tgsi_build_instruction_ext_nv(
+         full_inst->InstructionExtNv.Precision,
+         full_inst->InstructionExtNv.CondDstIndex,
+         full_inst->InstructionExtNv.CondFlowIndex,
+         full_inst->InstructionExtNv.CondMask,
+         full_inst->InstructionExtNv.CondSwizzleX,
+         full_inst->InstructionExtNv.CondSwizzleY,
+         full_inst->InstructionExtNv.CondSwizzleZ,
+         full_inst->InstructionExtNv.CondSwizzleW,
+         full_inst->InstructionExtNv.CondDstUpdate,
+         full_inst->InstructionExtNv.CondFlowEnable,
+         prev_token,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) instruction_ext_nv;
+   }
+
+   if( tgsi_compare_instruction_ext_label(
+         full_inst->InstructionExtLabel,
+         tgsi_default_instruction_ext_label() ) ) {
+      struct tgsi_instruction_ext_label *instruction_ext_label;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_ext_label =
+         (struct  tgsi_instruction_ext_label *) &tokens[size];
+      size++;
+
+      *instruction_ext_label = tgsi_build_instruction_ext_label(
+         full_inst->InstructionExtLabel.Label,
+         prev_token,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) instruction_ext_label;
+   }
+
+   if( tgsi_compare_instruction_ext_texture(
+         full_inst->InstructionExtTexture,
+         tgsi_default_instruction_ext_texture() ) ) {
+      struct tgsi_instruction_ext_texture *instruction_ext_texture;
+
+      if( maxsize <= size )
+         return 0;
+      instruction_ext_texture =
+         (struct  tgsi_instruction_ext_texture *) &tokens[size];
+      size++;
+
+      *instruction_ext_texture = tgsi_build_instruction_ext_texture(
+         full_inst->InstructionExtTexture.Texture,
+         prev_token,
+         instruction,
+         header   );
+      prev_token = (struct tgsi_token  *) instruction_ext_texture;
+   }
+
+   for( i = 0;  i <   full_inst->Instruction.NumDstRegs; i++ ) {
+      const struct tgsi_full_dst_register *reg = &full_inst->FullDstRegisters[i];
+      struct tgsi_dst_register *dst_register;
+      struct tgsi_token *prev_token;
+
+      if( maxsize <= size )
+         return 0;
+      dst_register = (struct tgsi_dst_register *) &tokens[size];
+      size++;
+
+      *dst_register = tgsi_build_dst_register(
+         reg->DstRegister.File,
+         reg->DstRegister.WriteMask,
+         reg->DstRegister.Index,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) dst_register;
+
+      if( tgsi_compare_dst_register_ext_concode(
+            reg->DstRegisterExtConcode,
+            tgsi_default_dst_register_ext_concode() ) ) {
+         struct tgsi_dst_register_ext_concode *dst_register_ext_concode;
+
+         if( maxsize <= size )
+            return 0;
+         dst_register_ext_concode =
+            (struct  tgsi_dst_register_ext_concode *) &tokens[size];
+         size++;
+
+         *dst_register_ext_concode =   tgsi_build_dst_register_ext_concode(
+            reg->DstRegisterExtConcode.CondMask,
+            reg->DstRegisterExtConcode.CondSwizzleX,
+            reg->DstRegisterExtConcode.CondSwizzleY,
+            reg->DstRegisterExtConcode.CondSwizzleZ,
+            reg->DstRegisterExtConcode.CondSwizzleW,
+            reg->DstRegisterExtConcode.CondSrcIndex,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) dst_register_ext_concode;
+      }
+
+      if( tgsi_compare_dst_register_ext_modulate(
+            reg->DstRegisterExtModulate,
+            tgsi_default_dst_register_ext_modulate() ) ) {
+         struct tgsi_dst_register_ext_modulate *dst_register_ext_modulate;
+
+         if( maxsize <= size )
+            return 0;
+         dst_register_ext_modulate =
+            (struct  tgsi_dst_register_ext_modulate *) &tokens[size];
+         size++;
+
+         *dst_register_ext_modulate = tgsi_build_dst_register_ext_modulate(
+            reg->DstRegisterExtModulate.Modulate,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) dst_register_ext_modulate;
+      }
+   }
+
+   for( i = 0;  i < full_inst->Instruction.NumSrcRegs; i++ ) {
+      const struct tgsi_full_src_register *reg = &full_inst->FullSrcRegisters[i];
+      struct tgsi_src_register *src_register;
+      struct tgsi_token *prev_token;
+
+      if( maxsize <= size )
+         return 0;
+      src_register = (struct tgsi_src_register *)  &tokens[size];
+      size++;
+
+      *src_register = tgsi_build_src_register(
+         reg->SrcRegister.File,
+         reg->SrcRegister.SwizzleX,
+         reg->SrcRegister.SwizzleY,
+         reg->SrcRegister.SwizzleZ,
+         reg->SrcRegister.SwizzleW,
+         reg->SrcRegister.Negate,
+         reg->SrcRegister.Indirect,
+         reg->SrcRegister.Dimension,
+         reg->SrcRegister.Index,
+         instruction,
+         header );
+      prev_token = (struct tgsi_token  *) src_register;
+
+      if( tgsi_compare_src_register_ext_swz(
+            reg->SrcRegisterExtSwz,
+            tgsi_default_src_register_ext_swz() ) ) {
+         struct tgsi_src_register_ext_swz *src_register_ext_swz;
+
+         if( maxsize <= size )
+            return 0;
+         src_register_ext_swz =
+            (struct  tgsi_src_register_ext_swz *) &tokens[size];
+         size++;
+
+         *src_register_ext_swz = tgsi_build_src_register_ext_swz(
+            reg->SrcRegisterExtSwz.ExtSwizzleX,
+            reg->SrcRegisterExtSwz.ExtSwizzleY,
+            reg->SrcRegisterExtSwz.ExtSwizzleZ,
+            reg->SrcRegisterExtSwz.ExtSwizzleW,
+            reg->SrcRegisterExtSwz.NegateX,
+            reg->SrcRegisterExtSwz.NegateY,
+            reg->SrcRegisterExtSwz.NegateZ,
+            reg->SrcRegisterExtSwz.NegateW,
+            reg->SrcRegisterExtSwz.ExtDivide,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) src_register_ext_swz;
+      }
+
+      if( tgsi_compare_src_register_ext_mod(
+            reg->SrcRegisterExtMod,
+            tgsi_default_src_register_ext_mod() ) ) {
+         struct tgsi_src_register_ext_mod *src_register_ext_mod;
+
+         if( maxsize <= size )
+            return 0;
+         src_register_ext_mod =
+            (struct  tgsi_src_register_ext_mod *) &tokens[size];
+         size++;
+
+         *src_register_ext_mod = tgsi_build_src_register_ext_mod(
+            reg->SrcRegisterExtMod.Complement,
+            reg->SrcRegisterExtMod.Bias,
+            reg->SrcRegisterExtMod.Scale2X,
+            reg->SrcRegisterExtMod.Absolute,
+            reg->SrcRegisterExtMod.Negate,
+            prev_token,
+            instruction,
+            header );
+         prev_token = (struct tgsi_token  *) src_register_ext_mod;
+      }
+
+      if( reg->SrcRegister.Indirect ) {
+         struct  tgsi_src_register *ind;
+
+         if( maxsize <= size )
+            return 0;
+         ind = (struct tgsi_src_register *) &tokens[size];
+         size++;
+
+         *ind = tgsi_build_src_register(
+            reg->SrcRegisterInd.File,
+            reg->SrcRegisterInd.SwizzleX,
+            reg->SrcRegisterInd.SwizzleY,
+            reg->SrcRegisterInd.SwizzleZ,
+            reg->SrcRegisterInd.SwizzleW,
+            reg->SrcRegisterInd.Negate,
+            reg->SrcRegisterInd.Indirect,
+            reg->SrcRegisterInd.Dimension,
+            reg->SrcRegisterInd.Index,
+            instruction,
+            header );
+      }
+
+      if( reg->SrcRegister.Dimension ) {
+         struct  tgsi_dimension *dim;
+
+         assert( !reg->SrcRegisterDim.Dimension );
+
+         if( maxsize <= size )
+            return 0;
+         dim = (struct tgsi_dimension *) &tokens[size];
+         size++;
+
+         *dim = tgsi_build_dimension(
+            reg->SrcRegisterDim.Indirect,
+            reg->SrcRegisterDim.Index,
+            instruction,
+            header );
+
+         if( reg->SrcRegisterDim.Indirect ) {
+            struct tgsi_src_register *ind;
+
+            if( maxsize <= size )
+               return 0;
+            ind = (struct tgsi_src_register *) &tokens[size];
+            size++;
+
+            *ind = tgsi_build_src_register(
+               reg->SrcRegisterDimInd.File,
+               reg->SrcRegisterDimInd.SwizzleX,
+               reg->SrcRegisterDimInd.SwizzleY,
+               reg->SrcRegisterDimInd.SwizzleZ,
+               reg->SrcRegisterDimInd.SwizzleW,
+               reg->SrcRegisterDimInd.Negate,
+               reg->SrcRegisterDimInd.Indirect,
+               reg->SrcRegisterDimInd.Dimension,
+               reg->SrcRegisterDimInd.Index,
+               instruction,
+               header );
+         }
+      }
+   }
+
+   return size;
+}
+
+struct tgsi_instruction_ext_nv
+tgsi_default_instruction_ext_nv( void )
+{
+   struct tgsi_instruction_ext_nv instruction_ext_nv;
+
+   instruction_ext_nv.Type = TGSI_INSTRUCTION_EXT_TYPE_NV;
+   instruction_ext_nv.Precision = TGSI_PRECISION_DEFAULT;
+   instruction_ext_nv.CondDstIndex = 0;
+   instruction_ext_nv.CondFlowIndex = 0;
+   instruction_ext_nv.CondMask = TGSI_CC_TR;
+   instruction_ext_nv.CondSwizzleX = TGSI_SWIZZLE_X;
+   instruction_ext_nv.CondSwizzleY = TGSI_SWIZZLE_Y;
+   instruction_ext_nv.CondSwizzleZ = TGSI_SWIZZLE_Z;
+   instruction_ext_nv.CondSwizzleW = TGSI_SWIZZLE_W;
+   instruction_ext_nv.CondDstUpdate = 0;
+   instruction_ext_nv.CondFlowEnable = 0;
+   instruction_ext_nv.Padding = 0;
+   instruction_ext_nv.Extended = 0;
+
+   return instruction_ext_nv;
+}
+
+union token_u32
+{
+   unsigned u32;
+};
+
+unsigned
+tgsi_compare_instruction_ext_nv(
+   struct tgsi_instruction_ext_nv a,
+   struct tgsi_instruction_ext_nv b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_instruction_ext_nv
+tgsi_build_instruction_ext_nv(
+   unsigned precision,
+   unsigned cond_dst_index,
+   unsigned cond_flow_index,
+   unsigned cond_mask,
+   unsigned cond_swizzle_x,
+   unsigned cond_swizzle_y,
+   unsigned cond_swizzle_z,
+   unsigned cond_swizzle_w,
+   unsigned cond_dst_update,
+   unsigned cond_flow_update,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_ext_nv instruction_ext_nv;
+
+   instruction_ext_nv = tgsi_default_instruction_ext_nv();
+   instruction_ext_nv.Precision = precision;
+   instruction_ext_nv.CondDstIndex = cond_dst_index;
+   instruction_ext_nv.CondFlowIndex = cond_flow_index;
+   instruction_ext_nv.CondMask = cond_mask;
+   instruction_ext_nv.CondSwizzleX = cond_swizzle_x;
+   instruction_ext_nv.CondSwizzleY = cond_swizzle_y;
+   instruction_ext_nv.CondSwizzleZ = cond_swizzle_z;
+   instruction_ext_nv.CondSwizzleW = cond_swizzle_w;
+   instruction_ext_nv.CondDstUpdate = cond_dst_update;
+   instruction_ext_nv.CondFlowEnable = cond_flow_update;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return instruction_ext_nv;
+}
+
+struct tgsi_instruction_ext_label
+tgsi_default_instruction_ext_label( void )
+{
+   struct tgsi_instruction_ext_label instruction_ext_label;
+
+   instruction_ext_label.Type = TGSI_INSTRUCTION_EXT_TYPE_LABEL;
+   instruction_ext_label.Label = 0;
+   instruction_ext_label.Padding = 0;
+   instruction_ext_label.Extended = 0;
+
+   return instruction_ext_label;
+}
+
+unsigned
+tgsi_compare_instruction_ext_label(
+   struct tgsi_instruction_ext_label a,
+   struct tgsi_instruction_ext_label b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_instruction_ext_label
+tgsi_build_instruction_ext_label(
+   unsigned label,
+   struct tgsi_token  *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_ext_label instruction_ext_label;
+
+   instruction_ext_label = tgsi_default_instruction_ext_label();
+   instruction_ext_label.Label = label;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return instruction_ext_label;
+}
+
+struct tgsi_instruction_ext_texture
+tgsi_default_instruction_ext_texture( void )
+{
+   struct tgsi_instruction_ext_texture instruction_ext_texture;
+
+   instruction_ext_texture.Type = TGSI_INSTRUCTION_EXT_TYPE_TEXTURE;
+   instruction_ext_texture.Texture = TGSI_TEXTURE_UNKNOWN;
+   instruction_ext_texture.Padding = 0;
+   instruction_ext_texture.Extended = 0;
+
+   return instruction_ext_texture;
+}
+
+unsigned
+tgsi_compare_instruction_ext_texture(
+   struct tgsi_instruction_ext_texture a,
+   struct tgsi_instruction_ext_texture b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_instruction_ext_texture
+tgsi_build_instruction_ext_texture(
+   unsigned texture,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_instruction_ext_texture instruction_ext_texture;
+
+   instruction_ext_texture = tgsi_default_instruction_ext_texture();
+   instruction_ext_texture.Texture = texture;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return instruction_ext_texture;
+}
+
+struct tgsi_src_register
+tgsi_default_src_register( void )
+{
+   struct tgsi_src_register src_register;
+
+   src_register.File = TGSI_FILE_NULL;
+   src_register.SwizzleX = TGSI_SWIZZLE_X;
+   src_register.SwizzleY = TGSI_SWIZZLE_Y;
+   src_register.SwizzleZ = TGSI_SWIZZLE_Z;
+   src_register.SwizzleW = TGSI_SWIZZLE_W;
+   src_register.Negate = 0;
+   src_register.Indirect = 0;
+   src_register.Dimension = 0;
+   src_register.Index = 0;
+   src_register.Extended = 0;
+
+   return src_register;
+}
+
+struct tgsi_src_register
+tgsi_build_src_register(
+   unsigned file,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   unsigned negate,
+   unsigned indirect,
+   unsigned dimension,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_src_register   src_register;
+
+   assert( file <= TGSI_FILE_IMMEDIATE );
+   assert( swizzle_x <= TGSI_SWIZZLE_W );
+   assert( swizzle_y <= TGSI_SWIZZLE_W );
+   assert( swizzle_z <= TGSI_SWIZZLE_W );
+   assert( swizzle_w <= TGSI_SWIZZLE_W );
+   assert( negate <= 1 );
+   assert( index >= -0x8000 && index <= 0x7FFF );
+
+   src_register = tgsi_default_src_register();
+   src_register.File = file;
+   src_register.SwizzleX = swizzle_x;
+   src_register.SwizzleY = swizzle_y;
+   src_register.SwizzleZ = swizzle_z;
+   src_register.SwizzleW = swizzle_w;
+   src_register.Negate = negate;
+   src_register.Indirect = indirect;
+   src_register.Dimension = dimension;
+   src_register.Index = index;
+
+   instruction_grow( instruction, header );
+
+   return src_register;
+}
+
+struct tgsi_full_src_register
+tgsi_default_full_src_register( void )
+{
+   struct tgsi_full_src_register full_src_register;
+
+   full_src_register.SrcRegister = tgsi_default_src_register();
+   full_src_register.SrcRegisterExtSwz = tgsi_default_src_register_ext_swz();
+   full_src_register.SrcRegisterExtMod = tgsi_default_src_register_ext_mod();
+   full_src_register.SrcRegisterInd = tgsi_default_src_register();
+   full_src_register.SrcRegisterDim = tgsi_default_dimension();
+   full_src_register.SrcRegisterDimInd = tgsi_default_src_register();
+
+   return full_src_register;
+}
+
+struct tgsi_src_register_ext_swz
+tgsi_default_src_register_ext_swz( void )
+{
+   struct tgsi_src_register_ext_swz src_register_ext_swz;
+
+   src_register_ext_swz.Type = TGSI_SRC_REGISTER_EXT_TYPE_SWZ;
+   src_register_ext_swz.ExtSwizzleX = TGSI_EXTSWIZZLE_X;
+   src_register_ext_swz.ExtSwizzleY = TGSI_EXTSWIZZLE_Y;
+   src_register_ext_swz.ExtSwizzleZ = TGSI_EXTSWIZZLE_Z;
+   src_register_ext_swz.ExtSwizzleW = TGSI_EXTSWIZZLE_W;
+   src_register_ext_swz.NegateX = 0;
+   src_register_ext_swz.NegateY = 0;
+   src_register_ext_swz.NegateZ = 0;
+   src_register_ext_swz.NegateW = 0;
+   src_register_ext_swz.ExtDivide = TGSI_EXTSWIZZLE_ONE;
+   src_register_ext_swz.Padding = 0;
+   src_register_ext_swz.Extended = 0;
+
+   return src_register_ext_swz;
+}
+
+unsigned
+tgsi_compare_src_register_ext_swz(
+   struct tgsi_src_register_ext_swz a,
+   struct tgsi_src_register_ext_swz b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_src_register_ext_swz
+tgsi_build_src_register_ext_swz(
+   unsigned ext_swizzle_x,
+   unsigned ext_swizzle_y,
+   unsigned ext_swizzle_z,
+   unsigned ext_swizzle_w,
+   unsigned negate_x,
+   unsigned negate_y,
+   unsigned negate_z,
+   unsigned negate_w,
+   unsigned ext_divide,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_src_register_ext_swz src_register_ext_swz;
+
+   assert( ext_swizzle_x <= TGSI_EXTSWIZZLE_ONE );
+   assert( ext_swizzle_y <= TGSI_EXTSWIZZLE_ONE );
+   assert( ext_swizzle_z <= TGSI_EXTSWIZZLE_ONE );
+   assert( ext_swizzle_w <= TGSI_EXTSWIZZLE_ONE );
+   assert( negate_x <= 1 );
+   assert( negate_y <= 1 );
+   assert( negate_z <= 1 );
+   assert( negate_w <= 1 );
+   assert( ext_divide <= TGSI_EXTSWIZZLE_ONE );
+
+   src_register_ext_swz = tgsi_default_src_register_ext_swz();
+   src_register_ext_swz.ExtSwizzleX = ext_swizzle_x;
+   src_register_ext_swz.ExtSwizzleY = ext_swizzle_y;
+   src_register_ext_swz.ExtSwizzleZ = ext_swizzle_z;
+   src_register_ext_swz.ExtSwizzleW = ext_swizzle_w;
+   src_register_ext_swz.NegateX = negate_x;
+   src_register_ext_swz.NegateY = negate_y;
+   src_register_ext_swz.NegateZ = negate_z;
+   src_register_ext_swz.NegateW = negate_w;
+   src_register_ext_swz.ExtDivide = ext_divide;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return src_register_ext_swz;
+}
+
+struct tgsi_src_register_ext_mod
+tgsi_default_src_register_ext_mod( void )
+{
+   struct tgsi_src_register_ext_mod src_register_ext_mod;
+
+   src_register_ext_mod.Type = TGSI_SRC_REGISTER_EXT_TYPE_MOD;
+   src_register_ext_mod.Complement = 0;
+   src_register_ext_mod.Bias = 0;
+   src_register_ext_mod.Scale2X = 0;
+   src_register_ext_mod.Absolute = 0;
+   src_register_ext_mod.Negate = 0;
+   src_register_ext_mod.Padding = 0;
+   src_register_ext_mod.Extended = 0;
+
+   return src_register_ext_mod;
+}
+
+unsigned
+tgsi_compare_src_register_ext_mod(
+   struct tgsi_src_register_ext_mod a,
+   struct tgsi_src_register_ext_mod b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_src_register_ext_mod
+tgsi_build_src_register_ext_mod(
+   unsigned complement,
+   unsigned bias,
+   unsigned scale_2x,
+   unsigned absolute,
+   unsigned negate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_src_register_ext_mod src_register_ext_mod;
+
+   assert( complement <= 1 );
+   assert( bias <= 1 );
+   assert( scale_2x <= 1 );
+   assert( absolute <= 1 );
+   assert( negate <= 1 );
+
+   src_register_ext_mod = tgsi_default_src_register_ext_mod();
+   src_register_ext_mod.Complement = complement;
+   src_register_ext_mod.Bias = bias;
+   src_register_ext_mod.Scale2X = scale_2x;
+   src_register_ext_mod.Absolute = absolute;
+   src_register_ext_mod.Negate = negate;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return src_register_ext_mod;
+}
+
+struct tgsi_dimension
+tgsi_default_dimension( void )
+{
+   struct tgsi_dimension dimension;
+
+   dimension.Indirect = 0;
+   dimension.Dimension = 0;
+   dimension.Padding = 0;
+   dimension.Index = 0;
+   dimension.Extended = 0;
+
+   return dimension;
+}
+
+struct tgsi_dimension
+tgsi_build_dimension(
+   unsigned indirect,
+   unsigned index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dimension dimension;
+
+   dimension = tgsi_default_dimension();
+   dimension.Indirect = indirect;
+   dimension.Index = index;
+
+   instruction_grow( instruction, header );
+
+   return dimension;
+}
+
+struct tgsi_dst_register
+tgsi_default_dst_register( void )
+{
+   struct tgsi_dst_register dst_register;
+
+   dst_register.File = TGSI_FILE_NULL;
+   dst_register.WriteMask = TGSI_WRITEMASK_XYZW;
+   dst_register.Indirect = 0;
+   dst_register.Dimension = 0;
+   dst_register.Index = 0;
+   dst_register.Padding = 0;
+   dst_register.Extended = 0;
+
+   return dst_register;
+}
+
+struct tgsi_dst_register
+tgsi_build_dst_register(
+   unsigned file,
+   unsigned mask,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dst_register dst_register;
+
+   assert( file <= TGSI_FILE_IMMEDIATE );
+   assert( mask <= TGSI_WRITEMASK_XYZW );
+   assert( index >= -32768 && index <= 32767 );
+
+   dst_register = tgsi_default_dst_register();
+   dst_register.File = file;
+   dst_register.WriteMask = mask;
+   dst_register.Index = index;
+
+   instruction_grow( instruction, header );
+
+   return dst_register;
+}
+
+struct tgsi_full_dst_register
+tgsi_default_full_dst_register( void )
+{
+   struct tgsi_full_dst_register full_dst_register;
+
+   full_dst_register.DstRegister = tgsi_default_dst_register();
+   full_dst_register.DstRegisterExtConcode =
+      tgsi_default_dst_register_ext_concode();
+   full_dst_register.DstRegisterExtModulate =
+      tgsi_default_dst_register_ext_modulate();
+
+   return full_dst_register;
+}
+
+struct tgsi_dst_register_ext_concode
+tgsi_default_dst_register_ext_concode( void )
+{
+   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
+
+   dst_register_ext_concode.Type = TGSI_DST_REGISTER_EXT_TYPE_CONDCODE;
+   dst_register_ext_concode.CondMask = TGSI_CC_TR;
+   dst_register_ext_concode.CondSwizzleX = TGSI_SWIZZLE_X;
+   dst_register_ext_concode.CondSwizzleY = TGSI_SWIZZLE_Y;
+   dst_register_ext_concode.CondSwizzleZ = TGSI_SWIZZLE_Z;
+   dst_register_ext_concode.CondSwizzleW = TGSI_SWIZZLE_W;
+   dst_register_ext_concode.CondSrcIndex = 0;
+   dst_register_ext_concode.Padding = 0;
+   dst_register_ext_concode.Extended = 0;
+
+   return dst_register_ext_concode;
+}
+
+unsigned
+tgsi_compare_dst_register_ext_concode(
+   struct tgsi_dst_register_ext_concode a,
+   struct tgsi_dst_register_ext_concode b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_dst_register_ext_concode
+tgsi_build_dst_register_ext_concode(
+   unsigned cc,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   int index,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dst_register_ext_concode dst_register_ext_concode;
+
+   assert( cc <= TGSI_CC_FL );
+   assert( swizzle_x <= TGSI_SWIZZLE_W );
+   assert( swizzle_y <= TGSI_SWIZZLE_W );
+   assert( swizzle_z <= TGSI_SWIZZLE_W );
+   assert( swizzle_w <= TGSI_SWIZZLE_W );
+   assert( index >= -32768 && index <= 32767 );
+
+   dst_register_ext_concode = tgsi_default_dst_register_ext_concode();
+   dst_register_ext_concode.CondMask = cc;
+   dst_register_ext_concode.CondSwizzleX = swizzle_x;
+   dst_register_ext_concode.CondSwizzleY = swizzle_y;
+   dst_register_ext_concode.CondSwizzleZ = swizzle_z;
+   dst_register_ext_concode.CondSwizzleW = swizzle_w;
+   dst_register_ext_concode.CondSrcIndex = index;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return dst_register_ext_concode;
+}
+
+struct tgsi_dst_register_ext_modulate
+tgsi_default_dst_register_ext_modulate( void )
+{
+   struct tgsi_dst_register_ext_modulate dst_register_ext_modulate;
+
+   dst_register_ext_modulate.Type = TGSI_DST_REGISTER_EXT_TYPE_MODULATE;
+   dst_register_ext_modulate.Modulate = TGSI_MODULATE_1X;
+   dst_register_ext_modulate.Padding = 0;
+   dst_register_ext_modulate.Extended = 0;
+
+   return dst_register_ext_modulate;
+}
+
+unsigned
+tgsi_compare_dst_register_ext_modulate(
+   struct tgsi_dst_register_ext_modulate a,
+   struct tgsi_dst_register_ext_modulate b )
+{
+   a.Padding = b.Padding = 0;
+   a.Extended = b.Extended = 0;
+   return ((union token_u32 *) &a)->u32 != ((union token_u32 *) &b)->u32;
+}
+
+struct tgsi_dst_register_ext_modulate
+tgsi_build_dst_register_ext_modulate(
+   unsigned modulate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header )
+{
+   struct tgsi_dst_register_ext_modulate dst_register_ext_modulate;
+
+   assert( modulate <= TGSI_MODULATE_EIGHTH );
+
+   dst_register_ext_modulate = tgsi_default_dst_register_ext_modulate();
+   dst_register_ext_modulate.Modulate = modulate;
+
+   prev_token->Extended = 1;
+   instruction_grow( instruction, header );
+
+   return dst_register_ext_modulate;
+}
+
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_build.h b/src/gallium/auxiliary/tgsi/util/tgsi_build.h
new file mode 100644
index 00000000000..116c78abf34
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_build.h
@@ -0,0 +1,320 @@
+#if !defined TGSI_BUILD_H
+#define TGSI_BUILD_H
+
+#if defined __cplusplus
+extern "C" {
+#endif // defined __cplusplus
+
+/*
+ * version
+ */
+
+struct tgsi_version
+tgsi_build_version( void );
+
+/*
+ * header
+ */
+
+struct tgsi_header
+tgsi_build_header( void );
+
+struct tgsi_processor
+tgsi_default_processor( void );
+
+struct tgsi_processor
+tgsi_build_processor(
+   unsigned processor,
+   struct tgsi_header *header );
+
+/*
+ * declaration
+ */
+
+struct tgsi_declaration
+tgsi_default_declaration( void );
+
+struct tgsi_declaration
+tgsi_build_declaration(
+   unsigned file,
+   unsigned declare,
+   unsigned usage_mask,
+   unsigned interpolate,
+   unsigned semantic,
+   struct tgsi_header *header );
+
+struct tgsi_full_declaration
+tgsi_default_full_declaration( void );
+
+unsigned
+tgsi_build_full_declaration(
+   const struct tgsi_full_declaration *full_decl,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize );
+
+struct tgsi_declaration_range
+tgsi_build_declaration_range(
+   unsigned first,
+   unsigned last,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header );
+
+struct tgsi_declaration_mask
+tgsi_build_declaration_mask(
+   unsigned mask,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header );
+
+struct tgsi_declaration_interpolation
+tgsi_default_declaration_interpolation( void );
+
+struct tgsi_declaration_interpolation
+tgsi_build_declaration_interpolation(
+   unsigned interpolate,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header );
+
+struct tgsi_declaration_semantic
+tgsi_default_declaration_semantic( void );
+
+struct tgsi_declaration_semantic
+tgsi_build_declaration_semantic(
+   unsigned semantic_name,
+   unsigned semantic_index,
+   struct tgsi_declaration *declaration,
+   struct tgsi_header *header );
+
+/*
+ * immediate
+ */
+
+struct tgsi_immediate
+tgsi_default_immediate( void );
+
+struct tgsi_immediate
+tgsi_build_immediate(
+   struct tgsi_header *header );
+
+struct tgsi_full_immediate
+tgsi_default_full_immediate( void );
+
+struct tgsi_immediate_float32
+tgsi_build_immediate_float32(
+   float value,
+   struct tgsi_immediate *immediate,
+   struct tgsi_header *header );
+
+unsigned
+tgsi_build_full_immediate(
+   const struct tgsi_full_immediate *full_imm,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize );
+
+/*
+ * instruction
+ */
+
+struct tgsi_instruction
+tgsi_default_instruction( void );
+
+struct tgsi_instruction
+tgsi_build_instruction(
+   unsigned opcode,
+   unsigned saturate,
+   unsigned num_dst_regs,
+   unsigned num_src_regs,
+   struct tgsi_header *header );
+
+struct tgsi_full_instruction
+tgsi_default_full_instruction( void );
+
+unsigned
+tgsi_build_full_instruction(
+   const struct tgsi_full_instruction *full_inst,
+   struct tgsi_token *tokens,
+   struct tgsi_header *header,
+   unsigned maxsize );
+
+struct tgsi_instruction_ext_nv
+tgsi_default_instruction_ext_nv( void );
+
+unsigned
+tgsi_compare_instruction_ext_nv(
+   struct tgsi_instruction_ext_nv a,
+   struct tgsi_instruction_ext_nv b );
+
+struct tgsi_instruction_ext_nv
+tgsi_build_instruction_ext_nv(
+   unsigned precision,
+   unsigned cond_dst_index,
+   unsigned cond_flow_index,
+   unsigned cond_mask,
+   unsigned cond_swizzle_x,
+   unsigned cond_swizzle_y,
+   unsigned cond_swizzle_z,
+   unsigned cond_swizzle_w,
+   unsigned cond_dst_update,
+   unsigned cond_flow_update,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_instruction_ext_label
+tgsi_default_instruction_ext_label( void );
+
+unsigned
+tgsi_compare_instruction_ext_label(
+   struct tgsi_instruction_ext_label a,
+   struct tgsi_instruction_ext_label b );
+
+struct tgsi_instruction_ext_label
+tgsi_build_instruction_ext_label(
+   unsigned label,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_instruction_ext_texture
+tgsi_default_instruction_ext_texture( void );
+
+unsigned
+tgsi_compare_instruction_ext_texture(
+   struct tgsi_instruction_ext_texture a,
+   struct tgsi_instruction_ext_texture b );
+
+struct tgsi_instruction_ext_texture
+tgsi_build_instruction_ext_texture(
+   unsigned texture,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_src_register
+tgsi_default_src_register( void );
+
+struct tgsi_src_register
+tgsi_build_src_register(
+   unsigned file,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   unsigned negate,
+   unsigned indirect,
+   unsigned dimension,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_full_src_register
+tgsi_default_full_src_register( void );
+
+struct tgsi_src_register_ext_swz
+tgsi_default_src_register_ext_swz( void );
+
+unsigned
+tgsi_compare_src_register_ext_swz(
+   struct tgsi_src_register_ext_swz a,
+   struct tgsi_src_register_ext_swz b );
+
+struct tgsi_src_register_ext_swz
+tgsi_build_src_register_ext_swz(
+   unsigned ext_swizzle_x,
+   unsigned ext_swizzle_y,
+   unsigned ext_swizzle_z,
+   unsigned ext_swizzle_w,
+   unsigned negate_x,
+   unsigned negate_y,
+   unsigned negate_z,
+   unsigned negate_w,
+   unsigned ext_divide,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_src_register_ext_mod
+tgsi_default_src_register_ext_mod( void );
+
+unsigned
+tgsi_compare_src_register_ext_mod(
+   struct tgsi_src_register_ext_mod a,
+   struct tgsi_src_register_ext_mod b );
+
+struct tgsi_src_register_ext_mod
+tgsi_build_src_register_ext_mod(
+   unsigned complement,
+   unsigned bias,
+   unsigned scale_2x,
+   unsigned absolute,
+   unsigned negate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_dimension
+tgsi_default_dimension( void );
+
+struct tgsi_dimension
+tgsi_build_dimension(
+   unsigned indirect,
+   unsigned index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_dst_register
+tgsi_default_dst_register( void );
+
+struct tgsi_dst_register
+tgsi_build_dst_register(
+   unsigned file,
+   unsigned mask,
+   int index,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_full_dst_register
+tgsi_default_full_dst_register( void );
+
+struct tgsi_dst_register_ext_concode
+tgsi_default_dst_register_ext_concode( void );
+
+unsigned
+tgsi_compare_dst_register_ext_concode(
+   struct tgsi_dst_register_ext_concode a,
+   struct tgsi_dst_register_ext_concode b );
+
+struct tgsi_dst_register_ext_concode
+tgsi_build_dst_register_ext_concode(
+   unsigned cc,
+   unsigned swizzle_x,
+   unsigned swizzle_y,
+   unsigned swizzle_z,
+   unsigned swizzle_w,
+   int index,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+struct tgsi_dst_register_ext_modulate
+tgsi_default_dst_register_ext_modulate( void );
+
+unsigned
+tgsi_compare_dst_register_ext_modulate(
+   struct tgsi_dst_register_ext_modulate a,
+   struct tgsi_dst_register_ext_modulate b );
+
+struct tgsi_dst_register_ext_modulate
+tgsi_build_dst_register_ext_modulate(
+   unsigned modulate,
+   struct tgsi_token *prev_token,
+   struct tgsi_instruction *instruction,
+   struct tgsi_header *header );
+
+#if defined __cplusplus
+} // extern "C"
+#endif // defined __cplusplus
+
+#endif // !defined TGSI_BUILD_H
+
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
new file mode 100644
index 00000000000..b5c54847e0b
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
@@ -0,0 +1,1581 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include <stdio.h> 
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi_dump.h"
+#include "tgsi_parse.h"
+#include "tgsi_build.h"
+
+struct gen_dump
+{
+   unsigned tabs;
+   void  (* write)(
+               struct gen_dump   *dump,
+               const void        *data,
+               unsigned          size );
+};
+
+struct text_dump
+{
+   struct gen_dump   base;
+   char              *text;
+   unsigned          length;
+   unsigned          capacity;
+};
+
+static void
+_text_dump_write(
+   struct gen_dump   *dump,
+   const void        *data,
+   unsigned          size )
+{
+   struct text_dump  *td = (struct text_dump *) dump;
+   unsigned          new_length = td->length + size;
+
+   if( new_length >= td->capacity ) {
+      unsigned new_capacity = td->capacity;
+
+      do {
+         if( new_capacity == 0 ) {
+            new_capacity = 256;
+         }
+         else {
+            new_capacity *= 2;
+         }
+      } while( new_length >= new_capacity );
+      td->text = (char *) REALLOC(
+         td->text,
+         td->capacity,
+         new_capacity );
+      td->capacity = new_capacity;
+   }
+   memcpy(
+      &td->text[td->length],
+      data,
+      size );
+   td->length = new_length;
+   td->text[td->length] = '\0';
+}
+
+struct file_dump
+{
+   struct gen_dump   base;
+   FILE              *file;
+};
+
+static void
+_file_dump_write(
+   struct gen_dump   *dump,
+   const void        *data,
+   unsigned          size )
+{
+   struct file_dump  *fd = (struct file_dump *) dump;
+
+#if 0
+   fwrite( data, 1, size, fd->file );
+#else
+   {
+      unsigned i;
+
+      for (i = 0; i < size; i++ ) {
+         fprintf( fd->file, "%c", ((const char *) data)[i] );
+      }
+   }
+#endif
+}
+
+static void
+gen_dump_str(
+   struct gen_dump   *dump,
+   const char        *str )
+{
+   unsigned i;
+   size_t   len = strlen( str );
+
+   for (i = 0; i < len; i++) {
+      dump->write( dump, &str[i], 1 );
+      if (str[i] == '\n') {
+         unsigned i;
+
+         for (i = 0; i < dump->tabs; i++) {
+            dump->write( dump, "    ", 4 );
+         }
+      }
+   }
+}
+
+static void
+gen_dump_chr(
+   struct gen_dump   *dump,
+   const char        chr )
+{
+   dump->write( dump, &chr, 1 );
+}
+
+static void
+gen_dump_uix(
+   struct gen_dump   *dump,
+   const unsigned    ui )
+{
+   char  str[36];
+
+   sprintf( str, "0x%x", ui );
+   gen_dump_str( dump, str );
+}
+
+static void
+gen_dump_uid(
+   struct gen_dump   *dump,
+   const unsigned    ui )
+{
+   char  str[16];
+
+   sprintf( str, "%u", ui );
+   gen_dump_str( dump, str );
+}
+
+static void
+gen_dump_sid(
+   struct gen_dump   *dump,
+   const int         si )
+{
+   char  str[16];
+
+   sprintf( str, "%d", si );
+   gen_dump_str( dump, str );
+}
+
+static void
+gen_dump_flt(
+   struct gen_dump   *dump,
+   const float       flt )
+{
+   char  str[48];
+
+   sprintf( str, "%10.4f", flt );
+   gen_dump_str( dump, str );
+}
+
+static void
+gen_dump_enum(
+   struct gen_dump   *dump,
+   const unsigned    e,
+   const char        **enums,
+   const unsigned    enums_count )
+{
+   if (e >= enums_count) {
+      gen_dump_uid( dump, e );
+   }
+   else {
+      gen_dump_str( dump, enums[e] );
+   }
+}
+
+static void
+gen_dump_tab(
+   struct gen_dump   *dump )
+{
+   ++dump->tabs;
+}
+
+static void
+gen_dump_untab(
+   struct gen_dump   *dump )
+{
+   assert( dump->tabs > 0 );
+
+   --dump->tabs;
+}
+
+#define TXT(S)          gen_dump_str( dump, S )
+#define CHR(C)          gen_dump_chr( dump, C )
+#define UIX(I)          gen_dump_uix( dump, I )
+#define UID(I)          gen_dump_uid( dump, I )
+#define SID(I)          gen_dump_sid( dump, I )
+#define FLT(F)          gen_dump_flt( dump, F )
+#define TAB()           gen_dump_tab( dump )
+#define UNT()           gen_dump_untab( dump )
+#define ENM(E,ENUMS)    gen_dump_enum( dump, E, ENUMS, sizeof( ENUMS ) / sizeof( *ENUMS ) )
+
+static const char *TGSI_PROCESSOR_TYPES[] =
+{
+   "PROCESSOR_FRAGMENT",
+   "PROCESSOR_VERTEX",
+   "PROCESSOR_GEOMETRY"
+};
+
+static const char *TGSI_PROCESSOR_TYPES_SHORT[] =
+{
+   "FRAG",
+   "VERT",
+   "GEOM"
+};
+
+static const char *TGSI_TOKEN_TYPES[] =
+{
+   "TOKEN_TYPE_DECLARATION",
+   "TOKEN_TYPE_IMMEDIATE",
+   "TOKEN_TYPE_INSTRUCTION"
+};
+
+static const char *TGSI_FILES[] =
+{
+   "FILE_NULL",
+   "FILE_CONSTANT",
+   "FILE_INPUT",
+   "FILE_OUTPUT",
+   "FILE_TEMPORARY",
+   "FILE_SAMPLER",
+   "FILE_ADDRESS",
+   "FILE_IMMEDIATE"
+};
+
+static const char *TGSI_FILES_SHORT[] =
+{
+   "NULL",
+   "CONST",
+   "IN",
+   "OUT",
+   "TEMP",
+   "SAMP",
+   "ADDR",
+   "IMM"
+};
+
+static const char *TGSI_DECLARES[] =
+{
+   "DECLARE_RANGE",
+   "DECLARE_MASK"
+};
+
+static const char *TGSI_INTERPOLATES[] =
+{
+   "INTERPOLATE_CONSTANT",
+   "INTERPOLATE_LINEAR",
+   "INTERPOLATE_PERSPECTIVE",
+   "INTERPOLATE_ATTRIB"
+};
+
+static const char *TGSI_INTERPOLATES_SHORT[] =
+{
+   "CONSTANT",
+   "LINEAR",
+   "PERSPECTIVE",
+   "ATTRIB"
+};
+
+static const char *TGSI_SEMANTICS[] =
+{
+   "SEMANTIC_POSITION",
+   "SEMANTIC_COLOR",
+   "SEMANTIC_BCOLOR",
+   "SEMANTIC_FOG",
+   "SEMANTIC_PSIZE",
+   "SEMANTIC_GENERIC,"
+};
+
+static const char *TGSI_SEMANTICS_SHORT[] =
+{
+   "POSITION",
+   "COLOR",
+   "BCOLOR",
+   "FOG",
+   "PSIZE",
+   "GENERIC",
+};
+
+static const char *TGSI_IMMS[] =
+{
+   "IMM_FLOAT32"
+};
+
+static const char *TGSI_IMMS_SHORT[] =
+{
+   "FLT32"
+};
+
+static const char *TGSI_OPCODES[] =
+{
+   "OPCODE_ARL",
+   "OPCODE_MOV",
+   "OPCODE_LIT",
+   "OPCODE_RCP",
+   "OPCODE_RSQ",
+   "OPCODE_EXP",
+   "OPCODE_LOG",
+   "OPCODE_MUL",
+   "OPCODE_ADD",
+   "OPCODE_DP3",
+   "OPCODE_DP4",
+   "OPCODE_DST",
+   "OPCODE_MIN",
+   "OPCODE_MAX",
+   "OPCODE_SLT",
+   "OPCODE_SGE",
+   "OPCODE_MAD",
+   "OPCODE_SUB",
+   "OPCODE_LERP",
+   "OPCODE_CND",
+   "OPCODE_CND0",
+   "OPCODE_DOT2ADD",
+   "OPCODE_INDEX",
+   "OPCODE_NEGATE",
+   "OPCODE_FRAC",
+   "OPCODE_CLAMP",
+   "OPCODE_FLOOR",
+   "OPCODE_ROUND",
+   "OPCODE_EXPBASE2",
+   "OPCODE_LOGBASE2",
+   "OPCODE_POWER",
+   "OPCODE_CROSSPRODUCT",
+   "OPCODE_MULTIPLYMATRIX",
+   "OPCODE_ABS",
+   "OPCODE_RCC",
+   "OPCODE_DPH",
+   "OPCODE_COS",
+   "OPCODE_DDX",
+   "OPCODE_DDY",
+   "OPCODE_KILP",
+   "OPCODE_PK2H",
+   "OPCODE_PK2US",
+   "OPCODE_PK4B",
+   "OPCODE_PK4UB",
+   "OPCODE_RFL",
+   "OPCODE_SEQ",
+   "OPCODE_SFL",
+   "OPCODE_SGT",
+   "OPCODE_SIN",
+   "OPCODE_SLE",
+   "OPCODE_SNE",
+   "OPCODE_STR",
+   "OPCODE_TEX",
+   "OPCODE_TXD",
+   "OPCODE_UP2H",
+   "OPCODE_UP2US",
+   "OPCODE_UP4B",
+   "OPCODE_UP4UB",
+   "OPCODE_X2D",
+   "OPCODE_ARA",
+   "OPCODE_ARR",
+   "OPCODE_BRA",
+   "OPCODE_CAL",
+   "OPCODE_RET",
+   "OPCODE_SSG",
+   "OPCODE_CMP",
+   "OPCODE_SCS",
+   "OPCODE_TXB",
+   "OPCODE_NRM",
+   "OPCODE_DIV",
+   "OPCODE_DP2",
+   "OPCODE_TXL",
+   "OPCODE_BRK",
+   "OPCODE_IF",
+   "OPCODE_LOOP",
+   "OPCODE_REP",
+   "OPCODE_ELSE",
+   "OPCODE_ENDIF",
+   "OPCODE_ENDLOOP",
+   "OPCODE_ENDREP",
+   "OPCODE_PUSHA",
+   "OPCODE_POPA",
+   "OPCODE_CEIL",
+   "OPCODE_I2F",
+   "OPCODE_NOT",
+   "OPCODE_TRUNC",
+   "OPCODE_SHL",
+   "OPCODE_SHR",
+   "OPCODE_AND",
+   "OPCODE_OR",
+   "OPCODE_MOD",
+   "OPCODE_XOR",
+   "OPCODE_SAD",
+   "OPCODE_TXF",
+   "OPCODE_TXQ",
+   "OPCODE_CONT",
+   "OPCODE_EMIT",
+   "OPCODE_ENDPRIM",
+   "OPCODE_BGNLOOP2",
+   "OPCODE_BGNSUB",
+   "OPCODE_ENDLOOP2",
+   "OPCODE_ENDSUB",
+   "OPCODE_NOISE1",
+   "OPCODE_NOISE2",
+   "OPCODE_NOISE3",
+   "OPCODE_NOISE4",
+   "OPCODE_NOP",
+   "OPCODE_TEXBEM",
+   "OPCODE_TEXBEML",
+   "OPCODE_TEXREG2AR",
+   "OPCODE_TEXM3X2PAD",
+   "OPCODE_TEXM3X2TEX",
+   "OPCODE_TEXM3X3PAD",
+   "OPCODE_TEXM3X3TEX",
+   "OPCODE_TEXM3X3SPEC",
+   "OPCODE_TEXM3X3VSPEC",
+   "OPCODE_TEXREG2GB",
+   "OPCODE_TEXREG2RGB",
+   "OPCODE_TEXDP3TEX",
+   "OPCODE_TEXDP3",
+   "OPCODE_TEXM3X3",
+   "OPCODE_TEXM3X2DEPTH",
+   "OPCODE_TEXDEPTH",
+   "OPCODE_BEM",
+   "OPCODE_M4X3",
+   "OPCODE_M3X4",
+   "OPCODE_M3X3",
+   "OPCODE_M3X2",
+   "OPCODE_NRM4",
+   "OPCODE_CALLNZ",
+   "OPCODE_IFC",
+   "OPCODE_BREAKC",
+   "OPCODE_KIL",
+   "OPCODE_END"
+};
+
+static const char *TGSI_OPCODES_SHORT[] =
+{
+   "ARL",
+   "MOV",
+   "LIT",
+   "RCP",
+   "RSQ",
+   "EXP",
+   "LOG",
+   "MUL",
+   "ADD",
+   "DP3",
+   "DP4",
+   "DST",
+   "MIN",
+   "MAX",
+   "SLT",
+   "SGE",
+   "MAD",
+   "SUB",
+   "LERP",
+   "CND",
+   "CND0",
+   "DOT2ADD",
+   "INDEX",
+   "NEGATE",
+   "FRAC",
+   "CLAMP",
+   "FLOOR",
+   "ROUND",
+   "EXPBASE2",
+   "LOGBASE2",
+   "POWER",
+   "CROSSPRODUCT",
+   "MULTIPLYMATRIX",
+   "ABS",
+   "RCC",
+   "DPH",
+   "COS",
+   "DDX",
+   "DDY",
+   "KILP",
+   "PK2H",
+   "PK2US",
+   "PK4B",
+   "PK4UB",
+   "RFL",
+   "SEQ",
+   "SFL",
+   "SGT",
+   "SIN",
+   "SLE",
+   "SNE",
+   "STR",
+   "TEX",
+   "TXD",
+   "UP2H",
+   "UP2US",
+   "UP4B",
+   "UP4UB",
+   "X2D",
+   "ARA",
+   "ARR",
+   "BRA",
+   "CAL",
+   "RET",
+   "SSG",
+   "CMP",
+   "SCS",
+   "TXB",
+   "NRM",
+   "DIV",
+   "DP2",
+   "TXL",
+   "BRK",
+   "IF",
+   "LOOP",
+   "REP",
+   "ELSE",
+   "ENDIF",
+   "ENDLOOP",
+   "ENDREP",
+   "PUSHA",
+   "POPA",
+   "CEIL",
+   "I2F",
+   "NOT",
+   "TRUNC",
+   "SHL",
+   "SHR",
+   "AND",
+   "OR",
+   "MOD",
+   "XOR",
+   "SAD",
+   "TXF",
+   "TXQ",
+   "CONT",
+   "EMIT",
+   "ENDPRIM",
+   "BGNLOOP2",
+   "BGNSUB",
+   "ENDLOOP2",
+   "ENDSUB",
+   "NOISE1",
+   "NOISE2",
+   "NOISE3",
+   "NOISE4",
+   "NOP",
+   "TEXBEM",
+   "TEXBEML",
+   "TEXREG2AR",
+   "TEXM3X2PAD",
+   "TEXM3X2TEX",
+   "TEXM3X3PAD",
+   "TEXM3X3TEX",
+   "TEXM3X3SPEC",
+   "TEXM3X3VSPEC",
+   "TEXREG2GB",
+   "TEXREG2RGB",
+   "TEXDP3TEX",
+   "TEXDP3",
+   "TEXM3X3",
+   "TEXM3X2DEPTH",
+   "TEXDEPTH",
+   "BEM",
+   "M4X3",
+   "M3X4",
+   "M3X3",
+   "M3X2",
+   "NRM4",
+   "CALLNZ",
+   "IFC",
+   "BREAKC",
+   "KIL",
+   "END"
+};
+
+static const char *TGSI_SATS[] =
+{
+   "SAT_NONE",
+   "SAT_ZERO_ONE",
+   "SAT_MINUS_PLUS_ONE"
+};
+
+static const char *TGSI_INSTRUCTION_EXTS[] =
+{
+   "INSTRUCTION_EXT_TYPE_NV",
+   "INSTRUCTION_EXT_TYPE_LABEL",
+   "INSTRUCTION_EXT_TYPE_TEXTURE"
+};
+
+static const char *TGSI_PRECISIONS[] =
+{
+   "PRECISION_DEFAULT",
+   "TGSI_PRECISION_FLOAT32",
+   "TGSI_PRECISION_FLOAT16",
+   "TGSI_PRECISION_FIXED12"
+};
+
+static const char *TGSI_CCS[] =
+{
+   "CC_GT",
+   "CC_EQ",
+   "CC_LT",
+   "CC_UN",
+   "CC_GE",
+   "CC_LE",
+   "CC_NE",
+   "CC_TR",
+   "CC_FL"
+};
+
+static const char *TGSI_SWIZZLES[] =
+{
+   "SWIZZLE_X",
+   "SWIZZLE_Y",
+   "SWIZZLE_Z",
+   "SWIZZLE_W"
+};
+
+static const char *TGSI_SWIZZLES_SHORT[] =
+{
+   "x",
+   "y",
+   "z",
+   "w"
+};
+
+static const char *TGSI_TEXTURES[] =
+{
+   "TEXTURE_UNKNOWN",
+   "TEXTURE_1D",
+   "TEXTURE_2D",
+   "TEXTURE_3D",
+   "TEXTURE_CUBE",
+   "TEXTURE_RECT",
+   "TEXTURE_SHADOW1D",
+   "TEXTURE_SHADOW2D",
+   "TEXTURE_SHADOWRECT"
+};
+
+static const char *TGSI_SRC_REGISTER_EXTS[] =
+{
+   "SRC_REGISTER_EXT_TYPE_SWZ",
+   "SRC_REGISTER_EXT_TYPE_MOD"
+};
+
+static const char *TGSI_EXTSWIZZLES[] =
+{
+   "EXTSWIZZLE_X",
+   "EXTSWIZZLE_Y",
+   "EXTSWIZZLE_Z",
+   "EXTSWIZZLE_W",
+   "EXTSWIZZLE_ZERO",
+   "EXTSWIZZLE_ONE"
+};
+
+static const char *TGSI_EXTSWIZZLES_SHORT[] =
+{
+   "x",
+   "y",
+   "z",
+   "w",
+   "0",
+   "1"
+};
+
+static const char *TGSI_WRITEMASKS[] =
+{
+   "0",
+   "WRITEMASK_X",
+   "WRITEMASK_Y",
+   "WRITEMASK_XY",
+   "WRITEMASK_Z",
+   "WRITEMASK_XZ",
+   "WRITEMASK_YZ",
+   "WRITEMASK_XYZ",
+   "WRITEMASK_W",
+   "WRITEMASK_XW",
+   "WRITEMASK_YW",
+   "WRITEMASK_XYW",
+   "WRITEMASK_ZW",
+   "WRITEMASK_XZW",
+   "WRITEMASK_YZW",
+   "WRITEMASK_XYZW"
+};
+
+static const char *TGSI_DST_REGISTER_EXTS[] =
+{
+   "DST_REGISTER_EXT_TYPE_CONDCODE",
+   "DST_REGISTER_EXT_TYPE_MODULATE"
+};
+
+static const char *TGSI_MODULATES[] =
+{
+   "MODULATE_1X",
+   "MODULATE_2X",
+   "MODULATE_4X",
+   "MODULATE_8X",
+   "MODULATE_HALF",
+   "MODULATE_QUARTER",
+   "MODULATE_EIGHTH"
+};
+
+static void
+dump_declaration_short(
+   struct gen_dump               *dump,
+   struct tgsi_full_declaration  *decl )
+{
+   TXT( "\nDCL " );
+   ENM( decl->Declaration.File, TGSI_FILES_SHORT );
+
+   switch( decl->Declaration.Declare ) {
+   case TGSI_DECLARE_RANGE:
+      CHR( '[' );
+      UID( decl->u.DeclarationRange.First );
+      if( decl->u.DeclarationRange.First != decl->u.DeclarationRange.Last ) {
+         TXT( ".." );
+         UID( decl->u.DeclarationRange.Last );
+      }
+      CHR( ']' );
+      break;
+   default:
+      assert( 0 );
+   }
+
+   if( decl->Declaration.UsageMask != TGSI_WRITEMASK_XYZW ) {
+      CHR( '.' );
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_X ) {
+         CHR( 'x' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Y ) {
+         CHR( 'y' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Z ) {
+         CHR( 'z' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_W ) {
+         CHR( 'w' );
+      }
+   }
+
+   if( decl->Declaration.Interpolate ) {
+      TXT( ", " );
+      ENM( decl->Interpolation.Interpolate, TGSI_INTERPOLATES_SHORT );
+   }
+
+   if( decl->Declaration.Semantic ) {
+      TXT( ", " );
+      ENM( decl->Semantic.SemanticName, TGSI_SEMANTICS_SHORT );
+      CHR( '[' );
+      UID( decl->Semantic.SemanticIndex );
+      CHR( ']' );
+   }
+}
+
+static void
+dump_declaration_verbose(
+   struct gen_dump               *dump,
+   struct tgsi_full_declaration  *decl,
+   unsigned                      ignored,
+   unsigned                      deflt,
+   struct tgsi_full_declaration  *fd )
+{
+   TXT( "\nFile       : " );
+   ENM( decl->Declaration.File, TGSI_FILES );
+   TXT( "\nDeclare    : " );
+   ENM( decl->Declaration.Declare, TGSI_DECLARES );
+   if( deflt || fd->Declaration.UsageMask != decl->Declaration.UsageMask ) {
+      TXT( "\nUsageMask  : " );
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_X ) {
+         CHR( 'X' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Y ) {
+         CHR( 'Y' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_Z ) {
+         CHR( 'Z' );
+      }
+      if( decl->Declaration.UsageMask & TGSI_WRITEMASK_W ) {
+         CHR( 'W' );
+      }
+   }
+   if( deflt || fd->Declaration.Interpolate != decl->Declaration.Interpolate ) {
+      TXT( "\nInterpolate: " );
+      UID( decl->Declaration.Interpolate );
+   }
+   if( deflt || fd->Declaration.Semantic != decl->Declaration.Semantic ) {
+      TXT( "\nSemantic   : " );
+      UID( decl->Declaration.Semantic );
+   }
+   if( ignored ) {
+      TXT( "\nPadding    : " );
+      UIX( decl->Declaration.Padding );
+   }
+
+   CHR( '\n' );
+   switch( decl->Declaration.Declare ) {
+   case TGSI_DECLARE_RANGE:
+      TXT( "\nFirst: " );
+      UID( decl->u.DeclarationRange.First );
+      TXT( "\nLast : " );
+      UID( decl->u.DeclarationRange.Last );
+      break;
+
+   case TGSI_DECLARE_MASK:
+      TXT( "\nMask: " );
+      UIX( decl->u.DeclarationMask.Mask );
+      break;
+
+   default:
+      assert( 0 );
+   }
+
+   if( decl->Declaration.Interpolate ) {
+      CHR( '\n' );
+      TXT( "\nInterpolate: " );
+      ENM( decl->Interpolation.Interpolate, TGSI_INTERPOLATES );
+      if( ignored ) {
+         TXT( "\nPadding    : " );
+         UIX( decl->Interpolation.Padding );
+      }
+   }
+
+   if( decl->Declaration.Semantic ) {
+      CHR( '\n' );
+      TXT( "\nSemanticName : " );
+      ENM( decl->Semantic.SemanticName, TGSI_SEMANTICS );
+      TXT( "\nSemanticIndex: " );
+      UID( decl->Semantic.SemanticIndex );
+      if( ignored ) {
+         TXT( "\nPadding      : " );
+         UIX( decl->Semantic.Padding );
+      }
+   }
+}
+
+static void
+dump_immediate_short(
+   struct gen_dump            *dump,
+   struct tgsi_full_immediate *imm )
+{
+   unsigned i;
+
+   TXT( "\nIMM " );
+   ENM( imm->Immediate.DataType, TGSI_IMMS_SHORT );
+
+   TXT( " { " );
+   for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
+      switch( imm->Immediate.DataType ) {
+      case TGSI_IMM_FLOAT32:
+         FLT( imm->u.ImmediateFloat32[i].Float );
+         break;
+
+      default:
+         assert( 0 );
+      }
+
+      if( i < imm->Immediate.Size - 2 ) {
+         TXT( ", " );
+      }
+   }
+   TXT( " }" );
+}
+
+static void
+dump_immediate_verbose(
+   struct gen_dump            *dump,
+   struct tgsi_full_immediate *imm,
+   unsigned                   ignored )
+{
+   unsigned i;
+
+   TXT( "\nDataType   : " );
+   ENM( imm->Immediate.DataType, TGSI_IMMS );
+   if( ignored ) {
+      TXT( "\nPadding    : " );
+      UIX( imm->Immediate.Padding );
+   }
+
+   for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
+      CHR( '\n' );
+      switch( imm->Immediate.DataType ) {
+      case TGSI_IMM_FLOAT32:
+         TXT( "\nFloat: " );
+         FLT( imm->u.ImmediateFloat32[i].Float );
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+}
+
+static void
+dump_instruction_short(
+   struct gen_dump               *dump,
+   struct tgsi_full_instruction  *inst,
+   unsigned                      instno )
+{
+   unsigned i;
+   boolean  first_reg = TRUE;
+
+   CHR( '\n' );
+   UID( instno );
+   CHR( ':' );
+   ENM( inst->Instruction.Opcode, TGSI_OPCODES_SHORT );
+
+   switch( inst->Instruction.Saturate ) {
+   case TGSI_SAT_NONE:
+      break;
+   case TGSI_SAT_ZERO_ONE:
+      TXT( "_SAT" );
+      break;
+   case TGSI_SAT_MINUS_PLUS_ONE:
+      TXT( "_SAT[-1,1]" );
+      break;
+   default:
+      assert( 0 );
+   }
+
+   for( i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
+      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+
+      if( !first_reg ) {
+         CHR( ',' );
+      }
+      CHR( ' ' );
+
+      ENM( dst->DstRegister.File, TGSI_FILES_SHORT );
+
+      CHR( '[' );
+      SID( dst->DstRegister.Index );
+      CHR( ']' );
+
+      if( dst->DstRegister.WriteMask != TGSI_WRITEMASK_XYZW ) {
+         CHR( '.' );
+         if( dst->DstRegister.WriteMask & TGSI_WRITEMASK_X ) {
+            CHR( 'x' );
+         }
+         if( dst->DstRegister.WriteMask & TGSI_WRITEMASK_Y ) {
+            CHR( 'y' );
+         }
+         if( dst->DstRegister.WriteMask & TGSI_WRITEMASK_Z ) {
+            CHR( 'z' );
+         }
+         if( dst->DstRegister.WriteMask & TGSI_WRITEMASK_W ) {
+            CHR( 'w' );
+         }
+      }
+
+      first_reg = FALSE;
+   }
+
+   for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+
+      if( !first_reg ) {
+         CHR( ',' );
+      }
+      CHR( ' ' );
+
+      if( src->SrcRegisterExtMod.Complement ) {
+         TXT( "(1 - " );
+      }
+      if( src->SrcRegisterExtMod.Negate  ) {
+         CHR( '-' );
+      }
+      if( src->SrcRegisterExtMod.Absolute ) {
+         CHR( '|' );
+      }
+      if( src->SrcRegister.Negate ) {
+         CHR( '-' );
+      }
+
+      ENM( src->SrcRegister.File, TGSI_FILES_SHORT );
+
+      CHR( '[' );
+      SID( src->SrcRegister.Index );
+      CHR( ']' );
+
+      if (src->SrcRegister.Extended) {
+         if (src->SrcRegisterExtSwz.ExtSwizzleX != TGSI_EXTSWIZZLE_X ||
+             src->SrcRegisterExtSwz.ExtSwizzleY != TGSI_EXTSWIZZLE_Y ||
+             src->SrcRegisterExtSwz.ExtSwizzleZ != TGSI_EXTSWIZZLE_Z ||
+             src->SrcRegisterExtSwz.ExtSwizzleW != TGSI_EXTSWIZZLE_W) {
+            CHR( '.' );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleX, TGSI_EXTSWIZZLES_SHORT );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleY, TGSI_EXTSWIZZLES_SHORT );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, TGSI_EXTSWIZZLES_SHORT );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleW, TGSI_EXTSWIZZLES_SHORT );
+         }
+      }
+      else if( src->SrcRegister.SwizzleX != TGSI_SWIZZLE_X ||
+               src->SrcRegister.SwizzleY != TGSI_SWIZZLE_Y ||
+               src->SrcRegister.SwizzleZ != TGSI_SWIZZLE_Z ||
+               src->SrcRegister.SwizzleW != TGSI_SWIZZLE_W ) {
+         CHR( '.' );
+         ENM( src->SrcRegister.SwizzleX, TGSI_SWIZZLES_SHORT );
+         ENM( src->SrcRegister.SwizzleY, TGSI_SWIZZLES_SHORT );
+         ENM( src->SrcRegister.SwizzleZ, TGSI_SWIZZLES_SHORT );
+         ENM( src->SrcRegister.SwizzleW, TGSI_SWIZZLES_SHORT );
+      }
+
+      if( src->SrcRegisterExtMod.Absolute ) {
+         CHR( '|' );
+      }
+      if( src->SrcRegisterExtMod.Complement ) {
+         CHR( ')' );
+      }
+
+      first_reg = FALSE;
+   }
+
+   switch( inst->Instruction.Opcode ) {
+   case TGSI_OPCODE_IF:
+   case TGSI_OPCODE_ELSE:
+   case TGSI_OPCODE_BGNLOOP2:
+   case TGSI_OPCODE_ENDLOOP2:
+   case TGSI_OPCODE_CAL:
+      TXT( " :" );
+      UID( inst->InstructionExtLabel.Label );
+      break;
+   }
+}
+
+static void
+dump_instruction_verbose(
+   struct gen_dump               *dump,
+   struct tgsi_full_instruction  *inst,
+   unsigned                      ignored,
+   unsigned                      deflt,
+   struct tgsi_full_instruction  *fi )
+{
+   unsigned i;
+
+   TXT( "\nOpcode     : " );
+   ENM( inst->Instruction.Opcode, TGSI_OPCODES );
+   if( deflt || fi->Instruction.Saturate != inst->Instruction.Saturate ) {
+      TXT( "\nSaturate   : " );
+      ENM( inst->Instruction.Saturate, TGSI_SATS );
+   }
+   if( deflt || fi->Instruction.NumDstRegs != inst->Instruction.NumDstRegs ) {
+      TXT( "\nNumDstRegs : " );
+      UID( inst->Instruction.NumDstRegs );
+   }
+   if( deflt || fi->Instruction.NumSrcRegs != inst->Instruction.NumSrcRegs ) {
+      TXT( "\nNumSrcRegs : " );
+      UID( inst->Instruction.NumSrcRegs );
+   }
+   if( ignored ) {
+      TXT( "\nPadding    : " );
+      UIX( inst->Instruction.Padding );
+   }
+
+   if( deflt || tgsi_compare_instruction_ext_nv( inst->InstructionExtNv, fi->InstructionExtNv ) ) {
+      CHR( '\n' );
+      TXT( "\nType          : " );
+      ENM( inst->InstructionExtNv.Type, TGSI_INSTRUCTION_EXTS );
+      if( deflt || fi->InstructionExtNv.Precision != inst->InstructionExtNv.Precision ) {
+         TXT( "\nPrecision     : " );
+         ENM( inst->InstructionExtNv.Precision, TGSI_PRECISIONS );
+      }
+      if( deflt || fi->InstructionExtNv.CondDstIndex != inst->InstructionExtNv.CondDstIndex ) {
+         TXT( "\nCondDstIndex  : " );
+         UID( inst->InstructionExtNv.CondDstIndex );
+      }
+      if( deflt || fi->InstructionExtNv.CondFlowIndex != inst->InstructionExtNv.CondFlowIndex ) {
+         TXT( "\nCondFlowIndex : " );
+         UID( inst->InstructionExtNv.CondFlowIndex );
+      }
+      if( deflt || fi->InstructionExtNv.CondMask != inst->InstructionExtNv.CondMask ) {
+         TXT( "\nCondMask      : " );
+         ENM( inst->InstructionExtNv.CondMask, TGSI_CCS );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleX != inst->InstructionExtNv.CondSwizzleX ) {
+         TXT( "\nCondSwizzleX  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleX, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleY != inst->InstructionExtNv.CondSwizzleY ) {
+         TXT( "\nCondSwizzleY  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleY, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleZ != inst->InstructionExtNv.CondSwizzleZ ) {
+         TXT( "\nCondSwizzleZ  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleZ, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondSwizzleW != inst->InstructionExtNv.CondSwizzleW ) {
+         TXT( "\nCondSwizzleW  : " );
+         ENM( inst->InstructionExtNv.CondSwizzleW, TGSI_SWIZZLES );
+      }
+      if( deflt || fi->InstructionExtNv.CondDstUpdate != inst->InstructionExtNv.CondDstUpdate ) {
+         TXT( "\nCondDstUpdate : " );
+         UID( inst->InstructionExtNv.CondDstUpdate );
+      }
+      if( deflt || fi->InstructionExtNv.CondFlowEnable != inst->InstructionExtNv.CondFlowEnable ) {
+         TXT( "\nCondFlowEnable: " );
+         UID( inst->InstructionExtNv.CondFlowEnable );
+      }
+      if( ignored ) {
+         TXT( "\nPadding       : " );
+         UIX( inst->InstructionExtNv.Padding );
+         if( deflt || fi->InstructionExtNv.Extended != inst->InstructionExtNv.Extended ) {
+            TXT( "\nExtended      : " );
+            UID( inst->InstructionExtNv.Extended );
+         }
+      }
+   }
+
+   if( deflt || tgsi_compare_instruction_ext_label( inst->InstructionExtLabel, fi->InstructionExtLabel ) ) {
+      CHR( '\n' );
+      TXT( "\nType    : " );
+      ENM( inst->InstructionExtLabel.Type, TGSI_INSTRUCTION_EXTS );
+      if( deflt || fi->InstructionExtLabel.Label != inst->InstructionExtLabel.Label ) {
+         TXT( "\nLabel   : " );
+         UID( inst->InstructionExtLabel.Label );
+      }
+      if( ignored ) {
+         TXT( "\nPadding : " );
+         UIX( inst->InstructionExtLabel.Padding );
+         if( deflt || fi->InstructionExtLabel.Extended != inst->InstructionExtLabel.Extended ) {
+            TXT( "\nExtended: " );
+            UID( inst->InstructionExtLabel.Extended );
+         }
+      }
+   }
+
+   if( deflt || tgsi_compare_instruction_ext_texture( inst->InstructionExtTexture, fi->InstructionExtTexture ) ) {
+      CHR( '\n' );
+      TXT( "\nType    : " );
+      ENM( inst->InstructionExtTexture.Type, TGSI_INSTRUCTION_EXTS );
+      if( deflt || fi->InstructionExtTexture.Texture != inst->InstructionExtTexture.Texture ) {
+         TXT( "\nTexture : " );
+         ENM( inst->InstructionExtTexture.Texture, TGSI_TEXTURES );
+      }
+      if( ignored ) {
+         TXT( "\nPadding : " );
+         UIX( inst->InstructionExtTexture.Padding );
+         if( deflt || fi->InstructionExtTexture.Extended != inst->InstructionExtTexture.Extended ) {
+            TXT( "\nExtended: " );
+            UID( inst->InstructionExtTexture.Extended );
+         }
+      }
+   }
+
+   for( i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
+      struct tgsi_full_dst_register *dst = &inst->FullDstRegisters[i];
+      struct tgsi_full_dst_register *fd = &fi->FullDstRegisters[i];
+
+      CHR( '\n' );
+      TXT( "\nFile     : " );
+      ENM( dst->DstRegister.File, TGSI_FILES );
+      if( deflt || fd->DstRegister.WriteMask != dst->DstRegister.WriteMask ) {
+         TXT( "\nWriteMask: " );
+         ENM( dst->DstRegister.WriteMask, TGSI_WRITEMASKS );
+      }
+      if( ignored ) {
+         if( deflt || fd->DstRegister.Indirect != dst->DstRegister.Indirect ) {
+            TXT( "\nIndirect : " );
+            UID( dst->DstRegister.Indirect );
+         }
+         if( deflt || fd->DstRegister.Dimension != dst->DstRegister.Dimension ) {
+            TXT( "\nDimension: " );
+            UID( dst->DstRegister.Dimension );
+         }
+      }
+      if( deflt || fd->DstRegister.Index != dst->DstRegister.Index ) {
+         TXT( "\nIndex    : " );
+         SID( dst->DstRegister.Index );
+      }
+      if( ignored ) {
+         TXT( "\nPadding  : " );
+         UIX( dst->DstRegister.Padding );
+         if( deflt || fd->DstRegister.Extended != dst->DstRegister.Extended ) {
+            TXT( "\nExtended : " );
+            UID( dst->DstRegister.Extended );
+         }
+      }
+
+      if( deflt || tgsi_compare_dst_register_ext_concode( dst->DstRegisterExtConcode, fd->DstRegisterExtConcode ) ) {
+         CHR( '\n' );
+         TXT( "\nType        : " );
+         ENM( dst->DstRegisterExtConcode.Type, TGSI_DST_REGISTER_EXTS );
+         if( deflt || fd->DstRegisterExtConcode.CondMask != dst->DstRegisterExtConcode.CondMask ) {
+            TXT( "\nCondMask    : " );
+            ENM( dst->DstRegisterExtConcode.CondMask, TGSI_CCS );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleX != dst->DstRegisterExtConcode.CondSwizzleX ) {
+            TXT( "\nCondSwizzleX: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleX, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleY != dst->DstRegisterExtConcode.CondSwizzleY ) {
+            TXT( "\nCondSwizzleY: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleY, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleZ != dst->DstRegisterExtConcode.CondSwizzleZ ) {
+            TXT( "\nCondSwizzleZ: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleZ, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSwizzleW != dst->DstRegisterExtConcode.CondSwizzleW ) {
+            TXT( "\nCondSwizzleW: " );
+            ENM( dst->DstRegisterExtConcode.CondSwizzleW, TGSI_SWIZZLES );
+         }
+         if( deflt || fd->DstRegisterExtConcode.CondSrcIndex != dst->DstRegisterExtConcode.CondSrcIndex ) {
+            TXT( "\nCondSrcIndex: " );
+            UID( dst->DstRegisterExtConcode.CondSrcIndex );
+         }
+         if( ignored ) {
+            TXT( "\nPadding     : " );
+            UIX( dst->DstRegisterExtConcode.Padding );
+            if( deflt || fd->DstRegisterExtConcode.Extended != dst->DstRegisterExtConcode.Extended ) {
+               TXT( "\nExtended    : " );
+               UID( dst->DstRegisterExtConcode.Extended );
+            }
+         }
+      }
+
+      if( deflt || tgsi_compare_dst_register_ext_modulate( dst->DstRegisterExtModulate, fd->DstRegisterExtModulate ) ) {
+         CHR( '\n' );
+         TXT( "\nType    : " );
+         ENM( dst->DstRegisterExtModulate.Type, TGSI_DST_REGISTER_EXTS );
+         if( deflt || fd->DstRegisterExtModulate.Modulate != dst->DstRegisterExtModulate.Modulate ) {
+            TXT( "\nModulate: " );
+            ENM( dst->DstRegisterExtModulate.Modulate, TGSI_MODULATES );
+         }
+         if( ignored ) {
+            TXT( "\nPadding : " );
+            UIX( dst->DstRegisterExtModulate.Padding );
+            if( deflt || fd->DstRegisterExtModulate.Extended != dst->DstRegisterExtModulate.Extended ) {
+               TXT( "\nExtended: " );
+               UID( dst->DstRegisterExtModulate.Extended );
+            }
+         }
+      }
+   }
+
+   for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
+      struct tgsi_full_src_register *src = &inst->FullSrcRegisters[i];
+      struct tgsi_full_src_register *fs = &fi->FullSrcRegisters[i];
+
+      CHR( '\n' );
+      TXT( "\nFile     : ");
+      ENM( src->SrcRegister.File, TGSI_FILES );
+      if( deflt || fs->SrcRegister.SwizzleX != src->SrcRegister.SwizzleX ) {
+         TXT( "\nSwizzleX : " );
+         ENM( src->SrcRegister.SwizzleX, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.SwizzleY != src->SrcRegister.SwizzleY ) {
+         TXT( "\nSwizzleY : " );
+         ENM( src->SrcRegister.SwizzleY, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.SwizzleZ != src->SrcRegister.SwizzleZ ) {
+         TXT( "\nSwizzleZ : " );
+         ENM( src->SrcRegister.SwizzleZ, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.SwizzleW != src->SrcRegister.SwizzleW ) {
+         TXT( "\nSwizzleW : " );
+         ENM( src->SrcRegister.SwizzleW, TGSI_SWIZZLES );
+      }
+      if( deflt || fs->SrcRegister.Negate != src->SrcRegister.Negate ) {
+         TXT( "\nNegate   : " );
+         UID( src->SrcRegister.Negate );
+      }
+      if( ignored ) {
+         if( deflt || fs->SrcRegister.Indirect != src->SrcRegister.Indirect ) {
+            TXT( "\nIndirect : " );
+            UID( src->SrcRegister.Indirect );
+         }
+         if( deflt || fs->SrcRegister.Dimension != src->SrcRegister.Dimension ) {
+            TXT( "\nDimension: " );
+            UID( src->SrcRegister.Dimension );
+         }
+      }
+      if( deflt || fs->SrcRegister.Index != src->SrcRegister.Index ) {
+         TXT( "\nIndex    : " );
+         SID( src->SrcRegister.Index );
+      }
+      if( ignored ) {
+         if( deflt || fs->SrcRegister.Extended != src->SrcRegister.Extended ) {
+            TXT( "\nExtended : " );
+            UID( src->SrcRegister.Extended );
+         }
+      }
+
+      if( deflt || tgsi_compare_src_register_ext_swz( src->SrcRegisterExtSwz, fs->SrcRegisterExtSwz ) ) {
+         CHR( '\n' );
+         TXT( "\nType       : " );
+         ENM( src->SrcRegisterExtSwz.Type, TGSI_SRC_REGISTER_EXTS );
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleX != src->SrcRegisterExtSwz.ExtSwizzleX ) {
+            TXT( "\nExtSwizzleX: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleX, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleY != src->SrcRegisterExtSwz.ExtSwizzleY ) {
+            TXT( "\nExtSwizzleY: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleY, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleZ != src->SrcRegisterExtSwz.ExtSwizzleZ ) {
+            TXT( "\nExtSwizzleZ: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleZ, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.ExtSwizzleW != src->SrcRegisterExtSwz.ExtSwizzleW ) {
+            TXT( "\nExtSwizzleW: " );
+            ENM( src->SrcRegisterExtSwz.ExtSwizzleW, TGSI_EXTSWIZZLES );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateX != src->SrcRegisterExtSwz.NegateX ) {
+            TXT( "\nNegateX   : " );
+            UID( src->SrcRegisterExtSwz.NegateX );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateY != src->SrcRegisterExtSwz.NegateY ) {
+            TXT( "\nNegateY   : " );
+            UID( src->SrcRegisterExtSwz.NegateY );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateZ != src->SrcRegisterExtSwz.NegateZ ) {
+            TXT( "\nNegateZ   : " );
+            UID( src->SrcRegisterExtSwz.NegateZ );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.NegateW != src->SrcRegisterExtSwz.NegateW ) {
+            TXT( "\nNegateW   : " );
+            UID( src->SrcRegisterExtSwz.NegateW );
+         }
+         if( deflt || fs->SrcRegisterExtSwz.ExtDivide != src->SrcRegisterExtSwz.ExtDivide ) {
+            TXT( "\nExtDivide  : " );
+            ENM( src->SrcRegisterExtSwz.ExtDivide, TGSI_EXTSWIZZLES );
+         }
+         if( ignored ) {
+            TXT( "\nPadding   : " );
+            UIX( src->SrcRegisterExtSwz.Padding );
+            if( deflt || fs->SrcRegisterExtSwz.Extended != src->SrcRegisterExtSwz.Extended ) {
+               TXT( "\nExtended   : " );
+               UID( src->SrcRegisterExtSwz.Extended );
+            }
+         }
+      }
+
+      if( deflt || tgsi_compare_src_register_ext_mod( src->SrcRegisterExtMod, fs->SrcRegisterExtMod ) ) {
+         CHR( '\n' );
+         TXT( "\nType     : " );
+         ENM( src->SrcRegisterExtMod.Type, TGSI_SRC_REGISTER_EXTS );
+         if( deflt || fs->SrcRegisterExtMod.Complement != src->SrcRegisterExtMod.Complement ) {
+            TXT( "\nComplement: " );
+            UID( src->SrcRegisterExtMod.Complement );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Bias != src->SrcRegisterExtMod.Bias ) {
+            TXT( "\nBias     : " );
+            UID( src->SrcRegisterExtMod.Bias );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Scale2X != src->SrcRegisterExtMod.Scale2X ) {
+            TXT( "\nScale2X   : " );
+            UID( src->SrcRegisterExtMod.Scale2X );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Absolute != src->SrcRegisterExtMod.Absolute ) {
+            TXT( "\nAbsolute  : " );
+            UID( src->SrcRegisterExtMod.Absolute );
+         }
+         if( deflt || fs->SrcRegisterExtMod.Negate != src->SrcRegisterExtMod.Negate ) {
+            TXT( "\nNegate   : " );
+            UID( src->SrcRegisterExtMod.Negate );
+         }
+         if( ignored ) {
+            TXT( "\nPadding   : " );
+            UIX( src->SrcRegisterExtMod.Padding );
+            if( deflt || fs->SrcRegisterExtMod.Extended != src->SrcRegisterExtMod.Extended ) {
+               TXT( "\nExtended  : " );
+               UID( src->SrcRegisterExtMod.Extended );
+            }
+         }
+      }
+   }
+}
+
+static void
+dump_gen(
+   struct gen_dump         *dump,
+   const struct tgsi_token *tokens,
+   unsigned                flags )
+{
+   struct tgsi_parse_context parse;
+   struct tgsi_full_instruction fi;
+   struct tgsi_full_declaration fd;
+   unsigned verbose = flags & TGSI_DUMP_VERBOSE;
+   unsigned ignored = !(flags & TGSI_DUMP_NO_IGNORED);
+   unsigned deflt = !(flags & TGSI_DUMP_NO_DEFAULT);
+   unsigned instno = 0;
+
+   dump->tabs = 0;
+
+   /* sanity check */
+   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_CONT], "OPCODE_CONT") == 0);
+
+   tgsi_parse_init( &parse, tokens );
+
+   TXT( "tgsi-dump begin -----------------" );
+
+   CHR( '\n' );
+   ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES_SHORT );
+   CHR( ' ' );
+   UID( parse.FullVersion.Version.MajorVersion );
+   CHR( '.' );
+   UID( parse.FullVersion.Version.MinorVersion );
+
+   if( verbose ) {
+      TXT( "\nMajorVersion: " );
+      UID( parse.FullVersion.Version.MajorVersion );
+      TXT( "\nMinorVersion: " );
+      UID( parse.FullVersion.Version.MinorVersion );
+      CHR( '\n' );
+
+      TXT( "\nHeaderSize: " );
+      UID( parse.FullHeader.Header.HeaderSize );
+      TXT( "\nBodySize  : " );
+      UID( parse.FullHeader.Header.BodySize );
+      TXT( "\nProcessor : " );
+      ENM( parse.FullHeader.Processor.Processor, TGSI_PROCESSOR_TYPES );
+      CHR( '\n' );
+   }
+
+   fi = tgsi_default_full_instruction();
+   fd = tgsi_default_full_declaration();
+
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         dump_declaration_short(
+            dump,
+            &parse.FullToken.FullDeclaration );
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         dump_immediate_short(
+            dump,
+            &parse.FullToken.FullImmediate );
+         break;
+
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         dump_instruction_short(
+            dump,
+            &parse.FullToken.FullInstruction,
+            instno );
+         instno++;
+         break;
+
+      default:
+         assert( 0 );
+      }
+
+      if( verbose ) {
+         TXT( "\nType       : " );
+         ENM( parse.FullToken.Token.Type, TGSI_TOKEN_TYPES );
+         if( ignored ) {
+            TXT( "\nSize       : " );
+            UID( parse.FullToken.Token.Size );
+            if( deflt || parse.FullToken.Token.Extended ) {
+               TXT( "\nExtended   : " );
+               UID( parse.FullToken.Token.Extended );
+            }
+         }
+
+         switch( parse.FullToken.Token.Type ) {
+         case TGSI_TOKEN_TYPE_DECLARATION:
+            dump_declaration_verbose(
+               dump,
+               &parse.FullToken.FullDeclaration,
+               ignored,
+               deflt,
+               &fd );
+            break;
+
+         case TGSI_TOKEN_TYPE_IMMEDIATE:
+            dump_immediate_verbose(
+               dump,
+               &parse.FullToken.FullImmediate,
+               ignored );
+            break;
+
+         case TGSI_TOKEN_TYPE_INSTRUCTION:
+            dump_instruction_verbose(
+               dump,
+               &parse.FullToken.FullInstruction,
+               ignored,
+               deflt,
+               &fi );
+            break;
+
+         default:
+            assert( 0 );
+         }
+
+         CHR( '\n' );
+      }
+   }
+
+   TXT( "\ntgsi-dump end -------------------\n" );
+
+   tgsi_parse_free( &parse );
+}
+
+
+static void
+sanity_checks(void)
+{
+   assert(strcmp(TGSI_OPCODES[TGSI_OPCODE_END], "OPCODE_END") == 0);
+   assert(strcmp(TGSI_OPCODES_SHORT[TGSI_OPCODE_END], "END") == 0);
+}
+
+
+void
+tgsi_dump(
+   const struct tgsi_token *tokens,
+   unsigned                flags )
+{
+   struct file_dump  dump;
+
+   sanity_checks();
+
+   dump.base.write = _file_dump_write;
+#if 0
+   {
+      static unsigned   counter = 0;
+      char              buffer[64];
+      sprintf( buffer, "tgsi-dump-%.4u.txt", counter++ );
+      dump.file = fopen( buffer, "wt" );
+   }
+#else
+   dump.file = stderr;
+#endif
+
+   dump_gen(
+      &dump.base,
+      tokens,
+      flags );
+
+#if 0
+   fclose( dump.file );
+#endif
+}
+
+void
+tgsi_dump_str(
+   char                    **str,
+   const struct tgsi_token *tokens,
+   unsigned                flags )
+{
+   struct text_dump  dump;
+
+   dump.base.write = _text_dump_write;
+   dump.text = NULL;
+   dump.length = 0;
+   dump.capacity = 0;
+
+   dump_gen(
+      &dump.base,
+      tokens,
+      flags );
+
+   *str = dump.text;
+}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
new file mode 100644
index 00000000000..1adc9db2519
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
@@ -0,0 +1,28 @@
+#if !defined TGSI_DUMP_H
+#define TGSI_DUMP_H
+
+#if defined __cplusplus
+extern "C" {
+#endif // defined __cplusplus
+
+#define TGSI_DUMP_VERBOSE       1
+#define TGSI_DUMP_NO_IGNORED    2
+#define TGSI_DUMP_NO_DEFAULT    4
+
+void
+tgsi_dump(
+   const struct tgsi_token *tokens,
+   unsigned                flags );
+
+void
+tgsi_dump_str(
+   char                    **str,
+   const struct tgsi_token *tokens,
+   unsigned                flags );
+
+#if defined __cplusplus
+} // extern "C"
+#endif // defined __cplusplus
+
+#endif // !defined TGSI_DUMP_H
+
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_parse.c b/src/gallium/auxiliary/tgsi/util/tgsi_parse.c
new file mode 100644
index 00000000000..bf6b89ce564
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_parse.c
@@ -0,0 +1,319 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi_parse.h"
+#include "tgsi_build.h"
+
+void
+tgsi_full_token_init(
+   union tgsi_full_token *full_token )
+{
+   full_token->Token.Type = TGSI_TOKEN_TYPE_DECLARATION;
+}
+
+void
+tgsi_full_token_free(
+   union tgsi_full_token *full_token )
+{
+   if( full_token->Token.Type == TGSI_TOKEN_TYPE_IMMEDIATE ) {
+      FREE( full_token->FullImmediate.u.Pointer );
+   }
+}
+
+unsigned
+tgsi_parse_init(
+   struct tgsi_parse_context *ctx,
+   const struct tgsi_token *tokens )
+{
+   ctx->FullVersion.Version = *(struct tgsi_version *) &tokens[0];
+   if( ctx->FullVersion.Version.MajorVersion > 1 ) {
+      return TGSI_PARSE_ERROR;
+   }
+
+   ctx->FullHeader.Header = *(struct tgsi_header *) &tokens[1];
+   if( ctx->FullHeader.Header.HeaderSize >= 2 ) {
+      ctx->FullHeader.Processor = *(struct tgsi_processor *) &tokens[2];
+   }
+   else {
+      ctx->FullHeader.Processor = tgsi_default_processor();
+   }
+
+   ctx->Tokens = tokens;
+   ctx->Position = 1 + ctx->FullHeader.Header.HeaderSize;
+
+   tgsi_full_token_init( &ctx->FullToken );
+
+   return TGSI_PARSE_OK;
+}
+
+void
+tgsi_parse_free(
+   struct tgsi_parse_context *ctx )
+{
+   tgsi_full_token_free( &ctx->FullToken );
+}
+
+boolean
+tgsi_parse_end_of_tokens(
+   struct tgsi_parse_context *ctx )
+{
+   return ctx->Position >=
+      1 + ctx->FullHeader.Header.HeaderSize + ctx->FullHeader.Header.BodySize;
+}
+
+static void
+next_token(
+   struct tgsi_parse_context *ctx,
+   void *token )
+{
+   assert( !tgsi_parse_end_of_tokens( ctx ) );
+
+   *(struct tgsi_token *) token = ctx->Tokens[ctx->Position++];
+}
+
+void
+tgsi_parse_token(
+   struct tgsi_parse_context *ctx )
+{
+   struct tgsi_token token;
+   unsigned i;
+
+   tgsi_full_token_free( &ctx->FullToken );
+   tgsi_full_token_init( &ctx->FullToken );
+
+   next_token( ctx, &token );
+
+   switch( token.Type ) {
+   case TGSI_TOKEN_TYPE_DECLARATION:
+   {
+      struct tgsi_full_declaration *decl = &ctx->FullToken.FullDeclaration;
+
+      *decl = tgsi_default_full_declaration();
+      decl->Declaration = *(struct tgsi_declaration *) &token;
+
+      switch( decl->Declaration.Type ) {
+      case TGSI_DECLARE_RANGE:
+         next_token( ctx, &decl->u.DeclarationRange );
+         break;
+
+      case TGSI_DECLARE_MASK:
+         next_token( ctx, &decl->u.DeclarationMask );
+         break;
+
+      default:
+         assert (0);
+      }
+
+      if( decl->Declaration.Interpolate ) {
+         next_token( ctx, &decl->Interpolation );
+      }
+
+      if( decl->Declaration.Semantic ) {
+         next_token( ctx, &decl->Semantic );
+      }
+
+      break;
+   }
+
+   case TGSI_TOKEN_TYPE_IMMEDIATE:
+   {
+      struct tgsi_full_immediate *imm = &ctx->FullToken.FullImmediate;
+
+      *imm = tgsi_default_full_immediate();
+      imm->Immediate = *(struct tgsi_immediate *) &token;
+
+      assert( !imm->Immediate.Extended );
+
+      switch (imm->Immediate.DataType) {
+      case TGSI_IMM_FLOAT32:
+         imm->u.Pointer = MALLOC(
+            sizeof( struct tgsi_immediate_float32 ) * (imm->Immediate.Size - 1) );
+         for( i = 0; i < imm->Immediate.Size - 1; i++ ) {
+            next_token( ctx, &imm->u.ImmediateFloat32[i] );
+         }
+         break;
+
+      default:
+         assert( 0 );
+      }
+
+      break;
+   }
+
+   case TGSI_TOKEN_TYPE_INSTRUCTION:
+   {
+      struct tgsi_full_instruction *inst = &ctx->FullToken.FullInstruction;
+      unsigned extended;
+
+      *inst = tgsi_default_full_instruction();
+      inst->Instruction = *(struct tgsi_instruction *) &token;
+
+      extended = inst->Instruction.Extended;
+
+      while( extended ) {
+         struct tgsi_src_register_ext token;
+
+         next_token( ctx, &token );
+
+         switch( token.Type ) {
+         case TGSI_INSTRUCTION_EXT_TYPE_NV:
+            inst->InstructionExtNv =
+               *(struct tgsi_instruction_ext_nv *) &token;
+            break;
+
+         case TGSI_INSTRUCTION_EXT_TYPE_LABEL:
+            inst->InstructionExtLabel =
+               *(struct tgsi_instruction_ext_label *) &token;
+            break;
+
+         case TGSI_INSTRUCTION_EXT_TYPE_TEXTURE:
+            inst->InstructionExtTexture =
+               *(struct tgsi_instruction_ext_texture *) &token;
+            break;
+
+         default:
+            assert( 0 );
+         }
+
+         extended = token.Extended;
+      }
+
+      assert( inst->Instruction.NumDstRegs <= TGSI_FULL_MAX_DST_REGISTERS );
+
+      for(  i = 0; i < inst->Instruction.NumDstRegs; i++ ) {
+         unsigned extended;
+
+         next_token( ctx, &inst->FullDstRegisters[i].DstRegister );
+
+         /*
+          * No support for indirect or multi-dimensional addressing.
+          */
+         assert( !inst->FullDstRegisters[i].DstRegister.Indirect );
+         assert( !inst->FullDstRegisters[i].DstRegister.Dimension );
+
+         extended = inst->FullDstRegisters[i].DstRegister.Extended;
+
+         while( extended ) {
+            struct tgsi_src_register_ext token;
+
+            next_token( ctx, &token );
+
+            switch( token.Type ) {
+            case TGSI_DST_REGISTER_EXT_TYPE_CONDCODE:
+               inst->FullDstRegisters[i].DstRegisterExtConcode =
+                  *(struct tgsi_dst_register_ext_concode *) &token;
+               break;
+
+            case TGSI_DST_REGISTER_EXT_TYPE_MODULATE:
+               inst->FullDstRegisters[i].DstRegisterExtModulate =
+                  *(struct tgsi_dst_register_ext_modulate *) &token;
+               break;
+
+            default:
+               assert( 0 );
+            }
+
+            extended = token.Extended;
+         }
+      }
+
+      assert( inst->Instruction.NumSrcRegs <= TGSI_FULL_MAX_SRC_REGISTERS );
+
+      for( i = 0; i < inst->Instruction.NumSrcRegs; i++ ) {
+         unsigned extended;
+
+         next_token( ctx, &inst->FullSrcRegisters[i].SrcRegister );
+
+         extended = inst->FullSrcRegisters[i].SrcRegister.Extended;
+
+         while( extended ) {
+            struct tgsi_src_register_ext token;
+
+            next_token( ctx, &token );
+
+            switch( token.Type ) {
+            case TGSI_SRC_REGISTER_EXT_TYPE_SWZ:
+               inst->FullSrcRegisters[i].SrcRegisterExtSwz =
+                  *(struct tgsi_src_register_ext_swz *) &token;
+               break;
+
+            case TGSI_SRC_REGISTER_EXT_TYPE_MOD:
+               inst->FullSrcRegisters[i].SrcRegisterExtMod =
+                  *(struct tgsi_src_register_ext_mod *) &token;
+               break;
+
+            default:
+               assert( 0 );
+            }
+
+            extended = token.Extended;
+         }
+
+         if( inst->FullSrcRegisters[i].SrcRegister.Indirect ) {
+            next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterInd );
+
+            /*
+             * No support for indirect or multi-dimensional addressing.
+             */
+            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect );
+            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension );
+            assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended );
+         }
+
+         if( inst->FullSrcRegisters[i].SrcRegister.Dimension ) {
+            next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDim );
+
+            /*
+             * No support for multi-dimensional addressing.
+             */
+            assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Dimension );
+            assert( !inst->FullSrcRegisters[i].SrcRegisterDim.Extended );
+
+            if( inst->FullSrcRegisters[i].SrcRegisterDim.Indirect ) {
+               next_token( ctx, &inst->FullSrcRegisters[i].SrcRegisterDimInd );
+
+               /*
+               * No support for indirect or multi-dimensional addressing.
+               */
+               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Indirect );
+               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Dimension );
+               assert( !inst->FullSrcRegisters[i].SrcRegisterInd.Extended );
+            }
+         }
+      }
+
+      break;
+   }
+
+   default:
+      assert( 0 );
+   }
+}
+
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_parse.h b/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
new file mode 100644
index 00000000000..9372da8d5d1
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_parse.h
@@ -0,0 +1,121 @@
+#if !defined TGSI_PARSE_H
+#define TGSI_PARSE_H
+
+#if defined __cplusplus
+extern "C" {
+#endif // defined __cplusplus
+
+struct tgsi_full_version
+{
+   struct tgsi_version  Version;
+};
+
+struct tgsi_full_header
+{
+   struct tgsi_header      Header;
+   struct tgsi_processor   Processor;
+};
+
+struct tgsi_full_dst_register
+{
+   struct tgsi_dst_register               DstRegister;
+   struct tgsi_dst_register_ext_concode   DstRegisterExtConcode;
+   struct tgsi_dst_register_ext_modulate  DstRegisterExtModulate;
+};
+
+struct tgsi_full_src_register
+{
+   struct tgsi_src_register         SrcRegister;
+   struct tgsi_src_register_ext_swz SrcRegisterExtSwz;
+   struct tgsi_src_register_ext_mod SrcRegisterExtMod;
+   struct tgsi_src_register         SrcRegisterInd;
+   struct tgsi_dimension            SrcRegisterDim;
+   struct tgsi_src_register         SrcRegisterDimInd;
+};
+
+struct tgsi_full_declaration
+{
+   struct tgsi_declaration Declaration;
+   union
+   {
+      struct tgsi_declaration_range DeclarationRange;
+      struct tgsi_declaration_mask  DeclarationMask;
+   } u;
+   struct tgsi_declaration_interpolation  Interpolation;
+   struct tgsi_declaration_semantic       Semantic;
+};
+
+struct tgsi_full_immediate
+{
+   struct tgsi_immediate   Immediate;
+   union
+   {
+      void                          *Pointer;
+      struct tgsi_immediate_float32 *ImmediateFloat32;
+   } u;
+};
+
+#define TGSI_FULL_MAX_DST_REGISTERS 2
+#define TGSI_FULL_MAX_SRC_REGISTERS 3
+
+struct tgsi_full_instruction
+{
+   struct tgsi_instruction             Instruction;
+   struct tgsi_instruction_ext_nv      InstructionExtNv;
+   struct tgsi_instruction_ext_label   InstructionExtLabel;
+   struct tgsi_instruction_ext_texture InstructionExtTexture;
+   struct tgsi_full_dst_register       FullDstRegisters[TGSI_FULL_MAX_DST_REGISTERS];
+   struct tgsi_full_src_register       FullSrcRegisters[TGSI_FULL_MAX_SRC_REGISTERS];
+};
+
+union tgsi_full_token
+{
+   struct tgsi_token             Token;
+   struct tgsi_full_declaration  FullDeclaration;
+   struct tgsi_full_immediate    FullImmediate;
+   struct tgsi_full_instruction  FullInstruction;
+};
+
+void
+tgsi_full_token_init(
+   union tgsi_full_token *full_token );
+
+void
+tgsi_full_token_free(
+   union tgsi_full_token *full_token );
+
+struct tgsi_parse_context
+{
+   const struct tgsi_token    *Tokens;
+   unsigned                   Position;
+   struct tgsi_full_version   FullVersion;
+   struct tgsi_full_header    FullHeader;
+   union tgsi_full_token      FullToken;
+};
+
+#define TGSI_PARSE_OK      0
+#define TGSI_PARSE_ERROR   1
+
+unsigned
+tgsi_parse_init(
+   struct tgsi_parse_context *ctx,
+   const struct tgsi_token *tokens );
+
+void
+tgsi_parse_free(
+   struct tgsi_parse_context *ctx );
+
+boolean
+tgsi_parse_end_of_tokens(
+   struct tgsi_parse_context *ctx );
+
+void
+tgsi_parse_token(
+   struct tgsi_parse_context *ctx );
+
+#if defined __cplusplus
+} // extern "C"
+#endif // defined __cplusplus
+
+#endif // !defined TGSI_PARSE_H
+
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_transform.c b/src/gallium/auxiliary/tgsi/util/tgsi_transform.c
new file mode 100644
index 00000000000..357f77b05a6
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_transform.c
@@ -0,0 +1,199 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * TGSI program transformation utility.
+ *
+ * Authors:  Brian Paul
+ */
+
+
+#include "tgsi_transform.h"
+
+
+
+static void
+emit_instruction(struct tgsi_transform_context *ctx,
+                 const struct tgsi_full_instruction *inst)
+{
+   uint ti = ctx->ti;
+
+   ti += tgsi_build_full_instruction(inst,
+                                     ctx->tokens_out + ti,
+                                     ctx->header,
+                                     ctx->max_tokens_out - ti);
+   ctx->ti = ti;
+}
+
+
+static void
+emit_declaration(struct tgsi_transform_context *ctx,
+                 const struct tgsi_full_declaration *decl)
+{
+   uint ti = ctx->ti;
+
+   ti += tgsi_build_full_declaration(decl,
+                                     ctx->tokens_out + ti,
+                                     ctx->header,
+                                     ctx->max_tokens_out - ti);
+   ctx->ti = ti;
+}
+
+
+static void
+emit_immediate(struct tgsi_transform_context *ctx,
+               const struct tgsi_full_immediate *imm)
+{
+   uint ti = ctx->ti;
+
+   ti += tgsi_build_full_immediate(imm,
+                                   ctx->tokens_out + ti,
+                                   ctx->header,
+                                   ctx->max_tokens_out - ti);
+   ctx->ti = ti;
+}
+
+
+
+/**
+ * Apply user-defined transformations to the input shader to produce
+ * the output shader.
+ * For example, a register search-and-replace operation could be applied
+ * by defining a transform_instruction() callback that examined and changed
+ * the instruction src/dest regs.
+ *
+ * \return number of tokens emitted
+ */
+int
+tgsi_transform_shader(const struct tgsi_token *tokens_in,
+                      struct tgsi_token *tokens_out,
+                      uint max_tokens_out,
+                      struct tgsi_transform_context *ctx)
+{
+   uint procType;
+
+   /* input shader */
+   struct tgsi_parse_context parse;
+
+   /* output shader */
+   struct tgsi_processor *processor;
+
+
+   /**
+    ** callback context init
+    **/
+   ctx->emit_instruction = emit_instruction;
+   ctx->emit_declaration = emit_declaration;
+   ctx->emit_immediate = emit_immediate;
+   ctx->tokens_out = tokens_out;
+   ctx->max_tokens_out = max_tokens_out;
+
+
+   /**
+    ** Setup to begin parsing input shader
+    **/
+   if (tgsi_parse_init( &parse, tokens_in ) != TGSI_PARSE_OK) {
+      debug_printf("tgsi_parse_init() failed in tgsi_transform_shader()!\n");
+      return -1;
+   }
+   procType = parse.FullHeader.Processor.Processor;
+   assert(procType == TGSI_PROCESSOR_FRAGMENT ||
+          procType == TGSI_PROCESSOR_VERTEX ||
+          procType == TGSI_PROCESSOR_GEOMETRY);
+
+
+   /**
+    **  Setup output shader
+    **/
+   *(struct tgsi_version *) &tokens_out[0] = tgsi_build_version();
+
+   ctx->header = (struct tgsi_header *) (tokens_out + 1);
+   *ctx->header = tgsi_build_header();
+
+   processor = (struct tgsi_processor *) (tokens_out + 2);
+   *processor = tgsi_build_processor( procType, ctx->header );
+
+   ctx->ti = 3;
+
+
+   /**
+    ** Loop over incoming program tokens/instructions
+    */
+   while( !tgsi_parse_end_of_tokens( &parse ) ) {
+
+      tgsi_parse_token( &parse );
+
+      switch( parse.FullToken.Token.Type ) {
+      case TGSI_TOKEN_TYPE_INSTRUCTION:
+         {
+            struct tgsi_full_instruction *fullinst
+               = &parse.FullToken.FullInstruction;
+
+            if (ctx->transform_instruction)
+               ctx->transform_instruction(ctx, fullinst);
+            else
+               ctx->emit_instruction(ctx, fullinst);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_DECLARATION:
+         {
+            struct tgsi_full_declaration *fulldecl
+               = &parse.FullToken.FullDeclaration;
+
+            if (ctx->transform_declaration)
+               ctx->transform_declaration(ctx, fulldecl);
+            else
+               ctx->emit_declaration(ctx, fulldecl);
+         }
+         break;
+
+      case TGSI_TOKEN_TYPE_IMMEDIATE:
+         {
+            struct tgsi_full_immediate *fullimm
+               = &parse.FullToken.FullImmediate;
+
+            if (ctx->transform_immediate)
+               ctx->transform_immediate(ctx, fullimm);
+            else
+               ctx->emit_immediate(ctx, fullimm);
+         }
+         break;
+
+      default:
+         assert( 0 );
+      }
+   }
+
+   if (ctx->epilog) {
+      ctx->epilog(ctx);
+   }
+
+   tgsi_parse_free (&parse);
+
+   return ctx->ti;
+}
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_transform.h b/src/gallium/auxiliary/tgsi/util/tgsi_transform.h
new file mode 100644
index 00000000000..fcf85d603be
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_transform.h
@@ -0,0 +1,93 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef TGSI_TRANSFORM_H
+#define TGSI_TRANSFORM_H
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_build.h"
+
+
+
+/**
+ * Subclass this to add caller-specific data
+ */
+struct tgsi_transform_context
+{
+/**** PUBLIC ***/
+
+   /**
+    * User-defined callbacks invoked per instruction.
+    */
+   void (*transform_instruction)(struct tgsi_transform_context *ctx,
+                                 struct tgsi_full_instruction *inst);
+
+   void (*transform_declaration)(struct tgsi_transform_context *ctx,
+                                 struct tgsi_full_declaration *decl);
+
+   void (*transform_immediate)(struct tgsi_transform_context *ctx,
+                               struct tgsi_full_immediate *imm);
+
+   /**
+    * Called at end of input program to allow caller to append extra
+    * instructions.  Return number of tokens emitted.
+    */
+   void (*epilog)(struct tgsi_transform_context *ctx);
+
+
+/*** PRIVATE ***/
+
+   /**
+    * These are setup by tgsi_transform_shader() and cannot be overridden.
+    * Meant to be called from in the above user callback functions.
+    */
+   void (*emit_instruction)(struct tgsi_transform_context *ctx,
+                            const struct tgsi_full_instruction *inst);
+   void (*emit_declaration)(struct tgsi_transform_context *ctx,
+                            const struct tgsi_full_declaration *decl);
+   void (*emit_immediate)(struct tgsi_transform_context *ctx,
+                          const struct tgsi_full_immediate *imm);
+
+   struct tgsi_header *header;
+   uint max_tokens_out;
+   struct tgsi_token *tokens_out;
+   uint ti;
+};
+
+
+
+extern int
+tgsi_transform_shader(const struct tgsi_token *tokens_in,
+                      struct tgsi_token *tokens_out,
+                      uint max_tokens_out,
+                      struct tgsi_transform_context *ctx);
+
+
+#endif /* TGSI_TRANSFORM_H */
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_util.c b/src/gallium/auxiliary/tgsi/util/tgsi_util.c
new file mode 100644
index 00000000000..4cdd89182ad
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_util.c
@@ -0,0 +1,274 @@
+#include "pipe/p_debug.h"
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi_parse.h"
+#include "tgsi_build.h"
+#include "tgsi_util.h"
+
+union pointer_hack
+{
+   void *pointer;
+   unsigned long long uint64;
+};
+
+void *
+tgsi_align_128bit(
+   void *unaligned )
+{
+   union pointer_hack ph;
+
+   ph.uint64 = 0;
+   ph.pointer = unaligned;
+   ph.uint64 = (ph.uint64 + 15) & ~15;
+   return ph.pointer;
+}
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->SwizzleX;
+   case 1:
+      return reg->SwizzleY;
+   case 2:
+      return reg->SwizzleZ;
+   case 3:
+      return reg->SwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->ExtSwizzleX;
+   case 1:
+      return reg->ExtSwizzleY;
+   case 2:
+      return reg->ExtSwizzleZ;
+   case 3:
+      return reg->ExtSwizzleW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register  *reg,
+   unsigned component )
+{
+   unsigned swizzle;
+
+   /*
+    * First, calculate  the   extended swizzle for a given channel. This will give
+    * us either a channel index into the simple swizzle or  a constant 1 or   0.
+    */
+   swizzle = tgsi_util_get_src_register_extswizzle(
+      &reg->SrcRegisterExtSwz,
+      component );
+
+   assert (TGSI_SWIZZLE_X == TGSI_EXTSWIZZLE_X);
+   assert (TGSI_SWIZZLE_Y == TGSI_EXTSWIZZLE_Y);
+   assert (TGSI_SWIZZLE_Z == TGSI_EXTSWIZZLE_Z);
+   assert (TGSI_SWIZZLE_W == TGSI_EXTSWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ZERO > TGSI_SWIZZLE_W);
+   assert (TGSI_EXTSWIZZLE_ONE > TGSI_SWIZZLE_W);
+
+   /*
+    * Second, calculate the simple  swizzle  for   the   unswizzled channel index.
+    * Leave the constants intact, they are   not   affected by the   simple swizzle.
+    */
+   if( swizzle <= TGSI_SWIZZLE_W ) {
+      swizzle = tgsi_util_get_src_register_swizzle(
+         &reg->SrcRegister,
+         component );
+   }
+
+   return swizzle;
+}
+
+void
+tgsi_util_set_src_register_swizzle(
+   struct tgsi_src_register *reg,
+   unsigned swizzle,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->SwizzleX = swizzle;
+      break;
+   case 1:
+      reg->SwizzleY = swizzle;
+      break;
+   case 2:
+      reg->SwizzleZ = swizzle;
+      break;
+   case 3:
+      reg->SwizzleW = swizzle;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+void
+tgsi_util_set_src_register_extswizzle(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned swizzle,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->ExtSwizzleX = swizzle;
+      break;
+   case 1:
+      reg->ExtSwizzleY = swizzle;
+      break;
+   case 2:
+      reg->ExtSwizzleZ = swizzle;
+      break;
+   case 3:
+      reg->ExtSwizzleW = swizzle;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const  struct tgsi_src_register_ext_swz *reg,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      return reg->NegateX;
+   case 1:
+      return reg->NegateY;
+   case 2:
+      return reg->NegateZ;
+   case 3:
+      return reg->NegateW;
+   default:
+      assert( 0 );
+   }
+   return 0;
+}
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component )
+{
+   switch( component ) {
+   case 0:
+      reg->NegateX = negate;
+      break;
+   case 1:
+      reg->NegateY = negate;
+      break;
+   case 2:
+      reg->NegateZ = negate;
+      break;
+   case 3:
+      reg->NegateW = negate;
+      break;
+   default:
+      assert( 0 );
+   }
+}
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct  tgsi_full_src_register *reg,
+   unsigned component )
+{
+   unsigned sign_mode;
+
+   if( reg->SrcRegisterExtMod.Absolute ) {
+      /* Consider only the post-abs negation. */
+
+      if( reg->SrcRegisterExtMod.Negate ) {
+         sign_mode = TGSI_UTIL_SIGN_SET;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_CLEAR;
+      }
+   }
+   else {
+      /* Accumulate the three negations. */
+
+      unsigned negate;
+
+      negate = reg->SrcRegister.Negate;
+      if( tgsi_util_get_src_register_extnegate( &reg->SrcRegisterExtSwz, component ) ) {
+         negate = !negate;
+      }
+      if( reg->SrcRegisterExtMod.Negate ) {
+         negate = !negate;
+      }
+
+      if( negate ) {
+         sign_mode = TGSI_UTIL_SIGN_TOGGLE;
+      }
+      else {
+         sign_mode = TGSI_UTIL_SIGN_KEEP;
+      }
+   }
+
+   return sign_mode;
+}
+
+void
+tgsi_util_set_full_src_register_sign_mode(
+   struct tgsi_full_src_register *reg,
+   unsigned sign_mode )
+{
+   reg->SrcRegisterExtSwz.NegateX = 0;
+   reg->SrcRegisterExtSwz.NegateY = 0;
+   reg->SrcRegisterExtSwz.NegateZ = 0;
+   reg->SrcRegisterExtSwz.NegateW = 0;
+
+   switch (sign_mode)
+   {
+   case TGSI_UTIL_SIGN_CLEAR:
+      reg->SrcRegister.Negate = 0;
+      reg->SrcRegisterExtMod.Absolute = 1;
+      reg->SrcRegisterExtMod.Negate = 0;
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      reg->SrcRegister.Negate = 0;
+      reg->SrcRegisterExtMod.Absolute = 1;
+      reg->SrcRegisterExtMod.Negate = 1;
+      break;
+
+   case TGSI_UTIL_SIGN_TOGGLE:
+      reg->SrcRegister.Negate = 1;
+      reg->SrcRegisterExtMod.Absolute = 0;
+      reg->SrcRegisterExtMod.Negate = 0;
+      break;
+
+   case TGSI_UTIL_SIGN_KEEP:
+      reg->SrcRegister.Negate = 0;
+      reg->SrcRegisterExtMod.Absolute = 0;
+      reg->SrcRegisterExtMod.Negate = 0;
+      break;
+
+   default:
+      assert( 0 );
+   }
+}
+
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_util.h b/src/gallium/auxiliary/tgsi/util/tgsi_util.h
new file mode 100644
index 00000000000..ef14446f0e4
--- /dev/null
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_util.h
@@ -0,0 +1,70 @@
+#if !defined TGSI_UTIL_H
+#define TGSI_UTIL_H
+
+#if defined __cplusplus
+extern "C" {
+#endif // defined __cplusplus
+
+void *
+tgsi_align_128bit(
+   void *unaligned );
+
+unsigned
+tgsi_util_get_src_register_swizzle(
+   const struct tgsi_src_register *reg,
+   unsigned component );
+
+unsigned
+tgsi_util_get_src_register_extswizzle(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component);
+
+unsigned
+tgsi_util_get_full_src_register_extswizzle(
+   const struct tgsi_full_src_register *reg,
+   unsigned component );
+
+void
+tgsi_util_set_src_register_swizzle(
+   struct tgsi_src_register *reg,
+   unsigned swizzle,
+   unsigned component );
+
+void
+tgsi_util_set_src_register_extswizzle(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned swizzle,
+   unsigned component );
+
+unsigned
+tgsi_util_get_src_register_extnegate(
+   const struct tgsi_src_register_ext_swz *reg,
+   unsigned component );
+
+void
+tgsi_util_set_src_register_extnegate(
+   struct tgsi_src_register_ext_swz *reg,
+   unsigned negate,
+   unsigned component );
+
+#define TGSI_UTIL_SIGN_CLEAR    0   /* Force positive */
+#define TGSI_UTIL_SIGN_SET      1   /* Force negative */
+#define TGSI_UTIL_SIGN_TOGGLE   2   /* Negate */
+#define TGSI_UTIL_SIGN_KEEP     3   /* No change */
+
+unsigned
+tgsi_util_get_full_src_register_sign_mode(
+   const struct tgsi_full_src_register *reg,
+   unsigned component );
+
+void
+tgsi_util_set_full_src_register_sign_mode(
+   struct tgsi_full_src_register *reg,
+   unsigned sign_mode );
+
+#if defined __cplusplus
+} // extern "C"
+#endif // defined __cplusplus
+
+#endif // !defined TGSI_UTIL_H
+
diff --git a/src/gallium/auxiliary/util/p_debug.c b/src/gallium/auxiliary/util/p_debug.c
new file mode 100644
index 00000000000..b9607a6ba7f
--- /dev/null
+++ b/src/gallium/auxiliary/util/p_debug.c
@@ -0,0 +1,76 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include <stdarg.h>
+
+#ifdef WIN32
+#include <windows.h>
+#include <winddi.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#endif
+
+#include "pipe/p_debug.h" 
+#include "pipe/p_compiler.h" 
+
+
+void debug_vprintf(const char *format, va_list ap)
+{
+#ifdef WIN32
+   EngDebugPrint("Gallium3D: ", (PCHAR)format, ap);
+#else
+   vfprintf(stderr, format, ap);
+#endif
+}
+
+
+void debug_printf(const char *format, ...)
+{
+   va_list ap;
+   va_start(ap, format);
+   debug_vprintf(format, ap);
+   va_end(ap);
+}
+
+
+static INLINE void debug_abort(void) 
+{
+#ifdef WIN32
+   EngDebugBreak();
+#else
+   abort();
+#endif
+}
+
+
+void debug_assert_fail(const char *expr, const char *file, unsigned line) 
+{
+   debug_printf("%s:%i: Assertion `%s' failed.\n", file, line, expr);
+   debug_abort();
+}
diff --git a/src/gallium/auxiliary/util/p_tile.c b/src/gallium/auxiliary/util/p_tile.c
new file mode 100644
index 00000000000..3f795a38989
--- /dev/null
+++ b/src/gallium/auxiliary/util/p_tile.c
@@ -0,0 +1,699 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * RGBA/float tile get/put functions.
+ * Usable both by drivers and state trackers.
+ * Surfaces should already be in a mapped state.
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+#include "pipe/p_inlines.h"
+
+#include "p_tile.h"
+
+
+
+/**
+ * Move raw block of pixels from surface to user memory.
+ * This should be usable by any hw driver that has mappable surfaces.
+ */
+void
+pipe_get_tile_raw(struct pipe_context *pipe,
+                  struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  void *p, int dst_stride)
+{
+   const uint cpp = ps->cpp;
+   const ubyte *pSrc;
+   const uint src_stride = ps->pitch * cpp;
+   ubyte *pDest;
+   uint i;
+
+   if (dst_stride == 0) {
+      dst_stride = w * cpp;
+   }
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   pSrc = (const ubyte *) pipe_surface_map(ps) + (y * ps->pitch + x) * cpp;
+   pDest = (ubyte *) p;
+
+   for (i = 0; i < h; i++) {
+      memcpy(pDest, pSrc, w * cpp);
+      pDest += dst_stride;
+      pSrc += src_stride;
+   }
+
+   pipe_surface_unmap(ps);
+}
+
+
+/**
+ * Move raw block of pixels from user memory to surface.
+ * This should be usable by any hw driver that has mappable surfaces.
+ */
+void
+pipe_put_tile_raw(struct pipe_context *pipe,
+                  struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  const void *p, int src_stride)
+{
+   const uint cpp = ps->cpp;
+   const ubyte *pSrc;
+   const uint dst_stride = ps->pitch * cpp;
+   ubyte *pDest;
+   uint i;
+
+   if (src_stride == 0) {
+      src_stride = w * cpp;
+   }
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   pSrc = (const ubyte *) p;
+   pDest = (ubyte *) pipe_surface_map(ps) + (y * ps->pitch + x) * cpp;
+
+   for (i = 0; i < h; i++) {
+      memcpy(pDest, pSrc, w * cpp);
+      pDest += dst_stride;
+      pSrc += src_stride;
+   }
+
+   pipe_surface_unmap(ps);
+}
+
+
+
+
+/** Convert short in [-32768,32767] to GLfloat in [-1.0,1.0] */
+#define SHORT_TO_FLOAT(S)   ((2.0F * (S) + 1.0F) * (1.0F/65535.0F))
+
+#define UNCLAMPED_FLOAT_TO_SHORT(us, f)  \
+   us = ( (short) ( CLAMP((f), -1.0, 1.0) * 32767.0F) )
+
+
+
+/*** PIPE_FORMAT_A8R8G8B8_UNORM ***/
+
+static void
+a8r8g8b8_get_tile_rgba(unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = UBYTE_TO_FLOAT((pixel >> 16) & 0xff);
+         pRow[1] = UBYTE_TO_FLOAT((pixel >>  8) & 0xff);
+         pRow[2] = UBYTE_TO_FLOAT((pixel >>  0) & 0xff);
+         pRow[3] = UBYTE_TO_FLOAT((pixel >> 24) & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+a8r8g8b8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         UNCLAMPED_FLOAT_TO_UBYTE(r, pRow[0]);
+         UNCLAMPED_FLOAT_TO_UBYTE(g, pRow[1]);
+         UNCLAMPED_FLOAT_TO_UBYTE(b, pRow[2]);
+         UNCLAMPED_FLOAT_TO_UBYTE(a, pRow[3]);
+         *dst++ = (a << 24) | (r << 16) | (g << 8) | b;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_B8G8R8A8_UNORM ***/
+
+static void
+b8g8r8a8_get_tile_rgba(unsigned *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const unsigned pixel = *src++;
+         pRow[0] = UBYTE_TO_FLOAT((pixel >>  8) & 0xff);
+         pRow[1] = UBYTE_TO_FLOAT((pixel >> 16) & 0xff);
+         pRow[2] = UBYTE_TO_FLOAT((pixel >> 24) & 0xff);
+         pRow[3] = UBYTE_TO_FLOAT((pixel >>  0) & 0xff);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+b8g8r8a8_put_tile_rgba(unsigned *dst,
+                       unsigned w, unsigned h,
+                       const float *p,
+                       unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         unsigned r, g, b, a;
+         UNCLAMPED_FLOAT_TO_UBYTE(r, pRow[0]);
+         UNCLAMPED_FLOAT_TO_UBYTE(g, pRow[1]);
+         UNCLAMPED_FLOAT_TO_UBYTE(b, pRow[2]);
+         UNCLAMPED_FLOAT_TO_UBYTE(a, pRow[3]);
+         *dst++ = (b << 24) | (g << 16) | (r << 8) | a;
+      }
+      p += src_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A1R5G5B5_UNORM ***/
+
+static void
+a1r5g5b5_get_tile_rgba(ushort *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >> 10) & 0x1f) * (1.0f / 31.0f);
+         pRow[1] = ((pixel >>  5) & 0x1f) * (1.0f / 31.0f);
+         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
+         pRow[3] = ((pixel >> 15)       ) * 1.0f;
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_A4R4G4B4_UNORM ***/
+
+static void
+a4r4g4b4_get_tile_rgba(ushort *src,
+                       unsigned w, unsigned h,
+                       float *p,
+                       unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >>  8) & 0xf) * (1.0f / 15.0f);
+         pRow[1] = ((pixel >>  4) & 0xf) * (1.0f / 15.0f);
+         pRow[2] = ((pixel      ) & 0xf) * (1.0f / 15.0f);
+         pRow[3] = ((pixel >> 12)      ) * (1.0f / 15.0f);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_R5G6B5_UNORM ***/
+
+static void
+r5g6b5_get_tile_rgba(ushort *src,
+                     unsigned w, unsigned h,
+                     float *p,
+                     unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         const ushort pixel = *src++;
+         pRow[0] = ((pixel >> 11) & 0x1f) * (1.0f / 31.0f);
+         pRow[1] = ((pixel >>  5) & 0x3f) * (1.0f / 63.0f);
+         pRow[2] = ((pixel      ) & 0x1f) * (1.0f / 31.0f);
+         pRow[3] = 1.0f;
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r5g5b5_put_tile_rgba(ushort *dst,
+                     unsigned w, unsigned h,
+                     const float *p,
+                     unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         uint r = (uint) (CLAMP(pRow[0], 0.0, 1.0) * 31.0);
+         uint g = (uint) (CLAMP(pRow[1], 0.0, 1.0) * 63.0);
+         uint b = (uint) (CLAMP(pRow[2], 0.0, 1.0) * 31.0);
+         *dst++ = (r << 11) | (g << 5) | (b);
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_Z16_UNORM ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z16_get_tile_rgba(ushort *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   const float scale = 1.0f / 65535.0f;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = *src++ * scale;
+      }
+      p += dst_stride;
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_U_L8 ***/
+
+static void
+l8_get_tile_rgba(ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = UBYTE_TO_FLOAT(*src);
+         pRow[3] = 1.0;
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_U_A8 ***/
+
+static void
+a8_get_tile_rgba(ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = 0.0;
+         pRow[3] = UBYTE_TO_FLOAT(*src);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_R16G16B16A16_SNORM ***/
+
+static void
+r16g16b16a16_get_tile_rgba(short *src,
+                           unsigned w, unsigned h,
+                           float *p,
+                           unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src += 4, pRow += 4) {
+         pRow[0] = SHORT_TO_FLOAT(src[0]);
+         pRow[1] = SHORT_TO_FLOAT(src[1]);
+         pRow[2] = SHORT_TO_FLOAT(src[2]);
+         pRow[3] = SHORT_TO_FLOAT(src[3]);
+      }
+      p += dst_stride;
+   }
+}
+
+
+static void
+r16g16b16a16_put_tile_rgba(short *dst,
+                           unsigned w, unsigned h,
+                           const float *p,
+                           unsigned src_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      const float *pRow = p;
+      for (j = 0; j < w; j++, dst += 4, pRow += 4) {
+         UNCLAMPED_FLOAT_TO_SHORT(dst[0], pRow[0]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[1], pRow[1]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[2], pRow[2]);
+         UNCLAMPED_FLOAT_TO_SHORT(dst[3], pRow[3]);
+      }
+      p += src_stride;
+   }
+}
+
+
+
+/*** PIPE_FORMAT_U_I8 ***/
+
+static void
+i8_get_tile_rgba(ubyte *src,
+                 unsigned w, unsigned h,
+                 float *p,
+                 unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, src++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = UBYTE_TO_FLOAT(*src);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_U_A8_L8 ***/
+
+static void
+a8_l8_get_tile_rgba(ushort *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         ushort p = *src++;
+         pRow[0] =
+         pRow[1] =
+         pRow[2] = UBYTE_TO_FLOAT(p & 0xff);
+         pRow[3] = UBYTE_TO_FLOAT(p >> 8);
+      }
+      p += dst_stride;
+   }
+}
+
+
+
+
+/*** PIPE_FORMAT_Z32_UNORM ***/
+
+/**
+ * Return each Z value as four floats in [0,1].
+ */
+static void
+z32_get_tile_rgba(unsigned *src,
+                  unsigned w, unsigned h,
+                  float *p,
+                  unsigned dst_stride)
+{
+   const double scale = 1.0 / (double) 0xffffffff;
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (*src++ * scale);
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_S8Z24_UNORM ***/
+
+/**
+ * Return Z component as four float in [0,1].  Stencil part ignored.
+ */
+static void
+s8z24_get_tile_rgba(unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (scale * (*src++ & 0xffffff));
+      }
+      p += dst_stride;
+   }
+}
+
+
+/*** PIPE_FORMAT_Z24S8_UNORM ***/
+
+/**
+ * Return Z component as four float in [0,1].  Stencil part ignored.
+ */
+static void
+z24s8_get_tile_rgba(unsigned *src,
+                    unsigned w, unsigned h,
+                    float *p,
+                    unsigned dst_stride)
+{
+   const double scale = 1.0 / ((1 << 24) - 1);
+   unsigned i, j;
+
+   for (i = 0; i < h; i++) {
+      float *pRow = p;
+      for (j = 0; j < w; j++, pRow += 4) {
+         pRow[0] =
+         pRow[1] =
+         pRow[2] =
+         pRow[3] = (float) (scale * (*src++ >> 8));
+      }
+      p += dst_stride;
+   }
+}
+
+
+void
+pipe_get_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   float *p)
+{
+   unsigned dst_stride = w * 4;
+   void *packed;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   packed = MALLOC(h * w * ps->cpp);
+
+   if (!packed)
+      return;
+
+   pipe_get_tile_raw(pipe, ps, x, y, w, h, packed, w * ps->cpp);
+
+   switch (ps->format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_get_tile_rgba((unsigned *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_get_tile_rgba((unsigned *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      a1r5g5b5_get_tile_rgba((ushort *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_A4R4G4B4_UNORM:
+      a4r4g4b4_get_tile_rgba((ushort *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g6b5_get_tile_rgba((ushort *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_U_L8:
+      l8_get_tile_rgba((ubyte *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_U_A8:
+      a8_get_tile_rgba((ubyte *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_U_I8:
+      i8_get_tile_rgba((ubyte *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_U_A8_L8:
+      a8_l8_get_tile_rgba((ushort *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_get_tile_rgba((short *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      z16_get_tile_rgba((ushort *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      z32_get_tile_rgba((unsigned *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+      s8z24_get_tile_rgba((unsigned *) packed, w, h, p, dst_stride);
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      z24s8_get_tile_rgba((unsigned *) packed, w, h, p, dst_stride);
+      break;
+   default:
+      assert(0);
+   }
+
+   FREE(packed);
+}
+
+
+void
+pipe_put_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   const float *p)
+{
+   unsigned src_stride = w * 4;
+   void *packed;
+
+   if (pipe_clip_tile(x, y, &w, &h, ps))
+      return;
+
+   packed = MALLOC(h * w * ps->cpp);
+
+   if (!packed)
+      return;
+
+   switch (ps->format) {
+   case PIPE_FORMAT_A8R8G8B8_UNORM:
+      a8r8g8b8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      b8g8r8a8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_A1R5G5B5_UNORM:
+      /*a1r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_R5G6B5_UNORM:
+      r5g5b5_put_tile_rgba((ushort *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      break;
+   case PIPE_FORMAT_U_L8:
+      /*l8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_U_A8:
+      /*a8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_U_I8:
+      /*i8_put_tile_rgba((ubyte *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_U_A8_L8:
+      /*a8_l8_put_tile_rgba((ushort *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_R16G16B16A16_SNORM:
+      r16g16b16a16_put_tile_rgba((short *) packed, w, h, p, src_stride);
+      break;
+   case PIPE_FORMAT_Z16_UNORM:
+      /*z16_put_tile_rgba((ushort *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_Z32_UNORM:
+      /*z32_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_S8Z24_UNORM:
+      /*s8z24_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
+      break;
+   case PIPE_FORMAT_Z24S8_UNORM:
+      /*z24s8_put_tile_rgba((unsigned *) packed, w, h, p, src_stride);*/
+      break;
+   default:
+      assert(0);
+   }
+
+   pipe_put_tile_raw(pipe, ps, x, y, w, h, packed, w * ps->cpp);
+
+   FREE(packed);
+}
diff --git a/src/gallium/auxiliary/util/p_tile.h b/src/gallium/auxiliary/util/p_tile.h
new file mode 100644
index 00000000000..cd8124bf11f
--- /dev/null
+++ b/src/gallium/auxiliary/util/p_tile.h
@@ -0,0 +1,81 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+#ifndef P_TILE_H
+#define P_TILE_H
+
+#include "pipe/p_compiler.h"
+
+struct pipe_context;
+struct pipe_surface;
+
+
+/**
+ * Clip tile against surface dims.
+ * \return TRUE if tile is totally clipped, FALSE otherwise
+ */
+static INLINE boolean
+pipe_clip_tile(uint x, uint y, uint *w, uint *h, const struct pipe_surface *ps)
+{
+   if (x >= ps->width)
+      return TRUE;
+   if (y >= ps->height)
+      return TRUE;
+   if (x + *w > ps->width)
+      *w = ps->width - x;
+   if (y + *h > ps->height)
+      *h = ps->height - y;
+   return FALSE;
+}
+
+
+extern void
+pipe_get_tile_raw(struct pipe_context *pipe,
+                  struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  void *p, int dst_stride);
+
+extern void
+pipe_put_tile_raw(struct pipe_context *pipe,
+                  struct pipe_surface *ps,
+                  uint x, uint y, uint w, uint h,
+                  const void *p, int src_stride);
+
+
+extern void
+pipe_get_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   float *p);
+
+extern void
+pipe_put_tile_rgba(struct pipe_context *pipe,
+                   struct pipe_surface *ps,
+                   uint x, uint y, uint w, uint h,
+                   const float *p);
+
+#endif
diff --git a/src/gallium/auxiliary/util/p_util.c b/src/gallium/auxiliary/util/p_util.c
new file mode 100644
index 00000000000..2a92f8e408b
--- /dev/null
+++ b/src/gallium/auxiliary/util/p_util.c
@@ -0,0 +1,73 @@
+/**************************************************************************
+ * 
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/**
+ * Miscellaneous utility functions.
+ */
+
+
+#include "pipe/p_defines.h"
+#include "pipe/p_util.h"
+
+
+/**
+ * Copy 2D rect from one place to another.
+ * Position and sizes are in pixels.
+ */
+void
+pipe_copy_rect(ubyte * dst,
+               unsigned cpp,
+               unsigned dst_pitch,
+               unsigned dst_x,
+               unsigned dst_y,
+               unsigned width,
+               unsigned height,
+               const ubyte * src,
+               int src_pitch,
+               unsigned src_x, 
+               int src_y)
+{
+   unsigned i;
+
+   dst_pitch *= cpp;
+   src_pitch *= cpp;
+   dst += dst_x * cpp;
+   src += src_x * cpp;
+   dst += dst_y * dst_pitch;
+   src += src_y * src_pitch;
+   width *= cpp;
+
+   if (width == dst_pitch && width == src_pitch)
+      memcpy(dst, src, height * width);
+   else {
+      for (i = 0; i < height; i++) {
+         memcpy(dst, src, width);
+         dst += dst_pitch;
+         src += src_pitch;
+      }
+   }
+}