Merge commit 'origin/gallium-master-merge'

This is the big merge of the gallium-0.2 branch into master. gallium-master-merge was just the staging area for it. Both gallium-0.2 and gallium-master-merge are considered closed now. Conflicts: progs/demos/Makefile src/mesa/main/state.c src/mesa/main/texenvprogram.c
author: Brian Paul <[email protected]> 2009-02-10 16:44:02 -0700
committer: Brian Paul <[email protected]> 2009-02-10 16:44:02 -0700
commit: 5340b6dff73a0a23531ce2a5f28fba8303adab6e (patch)
tree: b141fc3648568dd8b941c966059e6ed32a8bd0ad /src/gallium/auxiliary/translate
parent: 9fd26daec24f21dbe17afcb2e2ab272667ee9a69 (diff)
parent: ee4c921b65fb76998711f3c40330505cbc49a0e0 (diff)
8 files changed, 1764 insertions, 0 deletions
diff --git a/src/gallium/auxiliary/translate/Makefile b/src/gallium/auxiliary/translate/Makefile
new file mode 100644
index 00000000000..ad2a5b705e4
--- /dev/null
+++ b/src/gallium/auxiliary/translate/Makefile
@@ -0,0 +1,15 @@
+# Build description for the 'translate' auxiliary library.
+TOP = ../../../..
+include $(TOP)/configs/current
+
+LIBNAME = translate
+
+# Every continuation line is tab-indented, matching its siblings.
+C_SOURCES = \
+	translate_generic.c \
+	translate_sse.c \
+	translate.c \
+	translate_cache.c
+
+include ../../Makefile.template
+
+# Target with no prerequisites and no recipe in this file.
+symlinks:
+
diff --git a/src/gallium/auxiliary/translate/SConscript b/src/gallium/auxiliary/translate/SConscript
new file mode 100644
index 00000000000..9553a675372
--- /dev/null
+++ b/src/gallium/auxiliary/translate/SConscript
@@ -0,0 +1,12 @@
+# Builds the 'translate' sources as a convenience library and prepends
+# it to 'auxiliaries' (presumably brought into scope by Import('*')).
+Import('*')
+
+translate_sources = [
+	'translate_generic.c',
+	'translate_sse.c',
+	'translate.c',
+	'translate_cache.c',
+]
+
+translate = env.ConvenienceLibrary(
+	target = 'translate',
+	source = translate_sources)
+
+auxiliaries.insert(0, translate)
diff --git a/src/gallium/auxiliary/translate/translate.c b/src/gallium/auxiliary/translate/translate.c
new file mode 100644
index 00000000000..7678903f75c
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate.c
@@ -0,0 +1,48 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "pipe/p_config.h"
+#include "pipe/p_state.h"
+#include "translate.h"
+
+/**
+ * Create a translate object for the given key.
+ *
+ * When PIPE_ARCH_X86 is defined the SSE2 implementation is tried first;
+ * if that path is not compiled in, or it returns NULL, the generic
+ * implementation is used instead.
+ */
+struct translate *translate_create( const struct translate_key *key )
+{
+#if defined(PIPE_ARCH_X86)
+   {
+      struct translate *sse2 = translate_sse2_create( key );
+      if (sse2 != NULL)
+         return sse2;
+   }
+#endif
+
+   return translate_generic_create( key );
+}
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
new file mode 100644
index 00000000000..34526eb0617
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -0,0 +1,127 @@
+/*
+ * Copyright 2008 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+
+/**
+ * Vertex fetch/store/convert code.  This functionality is used in two places:
+ * 1. Vertex fetch/convert - to grab vertex data from incoming vertex
+ *    arrays and convert to format needed by vertex shaders.
+ * 2. Vertex store/emit - to convert simple float[][4] vertex attributes
+ *    (which is the organization used throughout the draw/prim pipeline) to
+ *    hardware-specific formats and emit into hardware vertex buffers.
+ *
+ *
+ * Authors:
+ *    Keith Whitwell <[email protected]>
+ */
+
+#ifndef _TRANSLATE_H
+#define _TRANSLATE_H
+
+
+#include "pipe/p_compiler.h"
+#include "pipe/p_format.h"
+#include "pipe/p_state.h"
+
+/**
+ * One attribute conversion: the source and destination pipe formats,
+ * which input buffer the attribute comes from and at what offset, and
+ * the offset it is written to in the output.
+ *
+ * NOTE(review): offsets are presumably in bytes -- confirm against the
+ * implementations.
+ */
+struct translate_element 
+{
+   enum pipe_format input_format;   /* format of the source attribute */
+   enum pipe_format output_format;  /* format of the converted attribute */
+   unsigned input_buffer:8;         /* input buffer index (8-bit field) */
+   unsigned input_offset:24;        /* offset within the input (24-bit field) */
+   unsigned output_offset;          /* offset within the output */
+};
+
+
+/**
+ * Description of a complete translation: output stride plus the list of
+ * per-attribute elements.  Only the first nr_elements entries of
+ * element[] are counted by translate_keysize(); keys are compared as raw
+ * bytes by translate_key_compare(), so the field layout matters.
+ */
+struct translate_key {
+   unsigned output_stride;   /* stride of the output (presumably bytes per vertex) */
+   unsigned nr_elements;     /* number of valid entries in element[] */
+   struct translate_element element[PIPE_MAX_ATTRIBS + 1];
+};
+
+
+/**
+ * A translate object: the key it is associated with plus the function
+ * pointers that implement it.  Implementations embed this struct as
+ * their first member (see struct translate_generic).
+ */
+struct translate {
+   /* Key stored by value as the first member. */
+   struct translate_key key;
+
+   /* Called by the translate cache on each cached object at teardown;
+    * presumably destroys the object -- confirm in the implementations. */
+   void (*release)( struct translate * );
+
+   /* Supplies pointer and stride for buffer index i (presumably the
+    * index referred to by translate_element::input_buffer). */
+   void (*set_buffer)( struct translate *,
+		       unsigned i,
+		       const void *ptr,
+		       unsigned stride );
+
+   /* Runs over 'count' entries of the 'elts' array, writing to
+    * output_buffer. */
+   void (PIPE_CDECL *run_elts)( struct translate *,
+                                const unsigned *elts,
+                                unsigned count,
+                                void *output_buffer);
+
+   /* Same, but takes 'start' and 'count' instead of an elts array
+    * (presumably a consecutive range -- confirm). */
+   void (PIPE_CDECL *run)( struct translate *,
+                           unsigned start,
+                           unsigned count,
+                           void *output_buffer);
+};
+
+
+
+#if 0
+struct translate_context *translate_context_create( void );
+void translate_context_destroy( struct translate_context * );
+
+struct translate *translate_lookup_or_create( struct translate_context *tctx,
+					      const struct translate_key *key );
+#endif
+
+
+struct translate *translate_create( const struct translate_key *key );
+
+/**
+ * Size in bytes of the used part of a key: the two leading integer
+ * fields plus the first nr_elements entries of element[].
+ */
+static INLINE int translate_keysize( const struct translate_key *key )
+{
+   return key->nr_elements * sizeof(key->element[0]) + 2 * sizeof(int);
+}
+
+/**
+ * Byte-wise comparison of two keys over the used size of 'a'.
+ * Returns the memcmp() result (zero when the compared bytes are equal).
+ */
+static INLINE int translate_key_compare( const struct translate_key *a,
+                                         const struct translate_key *b )
+{
+   return memcmp(a, b, translate_keysize(a));
+}
+
+
+/**
+ * Zero every byte of the key that lies beyond its used size, i.e. the
+ * element[] entries past nr_elements.
+ */
+static INLINE void translate_key_sanitize( struct translate_key *a )
+{
+   const int used = translate_keysize(a);
+
+   memset((char *)a + used, 0, sizeof(*a) - used);
+}
+
+
+/*******************************************************************************
+ *  Private:
+ */
+struct translate *translate_sse2_create( const struct translate_key *key );
+
+struct translate *translate_generic_create( const struct translate_key *key );
+
+
+#endif
diff --git a/src/gallium/auxiliary/translate/translate_cache.c b/src/gallium/auxiliary/translate/translate_cache.c
new file mode 100644
index 00000000000..d8069a149cf
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate_cache.c
@@ -0,0 +1,102 @@
+/**************************************************************************
+ *
+ * Copyright 2008 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+#include "util/u_memory.h"
+#include "pipe/p_state.h"
+#include "translate.h"
+#include "translate_cache.h"
+
+#include "cso_cache/cso_cache.h"
+#include "cso_cache/cso_hash.h"
+
+/** Cache of translate objects, backed by a single cso_hash. */
+struct translate_cache {
+   struct cso_hash *hash;   /* hash key (see create_key) -> struct translate * */
+};
+
+/**
+ * Allocate an empty translate cache.
+ *
+ * Returns the new cache, or NULL if either the cache structure or its
+ * hash could not be allocated (nothing is leaked in that case).
+ */
+struct translate_cache * translate_cache_create( void )
+{
+   struct translate_cache *cache = MALLOC_STRUCT(translate_cache);
+   if (cache == NULL)
+      return NULL;
+
+   cache->hash = cso_hash_create();
+   if (cache->hash == NULL) {
+      /* Do not hand out a cache that would crash on first use. */
+      FREE(cache);
+      return NULL;
+   }
+
+   return cache;
+}
+
+
+/**
+ * Call release() on every non-NULL translate stored in the cache's hash.
+ * The hash nodes themselves are left in place.
+ */
+static INLINE void delete_translates(struct translate_cache *cache)
+{
+   struct cso_hash_iter it;
+
+   for (it = cso_hash_first_node(cache->hash); !cso_hash_iter_is_null(it); ) {
+      struct translate *entry = (struct translate *) cso_hash_iter_data(it);
+
+      /* Step the iterator before the entry is released. */
+      it = cso_hash_iter_next(it);
+
+      if (!entry)
+         continue;
+
+      entry->release(entry);
+   }
+}
+
+/**
+ * Release every cached translate, delete the hash, then free the cache
+ * structure itself.
+ */
+void translate_cache_destroy(struct translate_cache *cache)
+{
+   struct cso_hash *hash = cache->hash;
+
+   delete_translates(cache);
+   cso_hash_delete(hash);
+   FREE(cache);
+}
+
+
+/**
+ * Number of key bytes fed to the hash function.
+ *
+ * This is the used size of the key as defined by translate_keysize():
+ * the header fields plus exactly nr_elements elements.  The previous
+ * formula subtracted (PIPE_MAX_ATTRIBS - nr_elements) elements from a
+ * key that holds PIPE_MAX_ATTRIBS + 1 of them, so it also hashed the
+ * first element beyond nr_elements, which is not part of the key.
+ */
+static INLINE unsigned translate_hash_key_size(struct translate_key *key)
+{
+   return (unsigned) translate_keysize(key);
+}
+
+/**
+ * Compute the hash value for a key by passing translate_hash_key_size()
+ * bytes of it to cso_construct_key().
+ */
+static INLINE unsigned create_key(struct translate_key *key)
+{
+   return cso_construct_key(key, translate_hash_key_size(key));
+}
+
+/**
+ * Return the cached translate matching 'key', creating and caching a
+ * new one on a miss.
+ *
+ * Candidates are matched against the whole key (sizeof(*key) bytes).
+ * NOTE(review): that presumably requires callers to have zeroed the
+ * unused tail of the key (cf. translate_key_sanitize) -- confirm.
+ *
+ * Returns the translate, or NULL if a new one could not be created.
+ */
+struct translate * translate_cache_find(struct translate_cache *cache,
+                                        struct translate_key *key)
+{
+   unsigned hash_key = create_key(key);
+   struct translate *translate = (struct translate*)
+      cso_hash_find_data_from_template(cache->hash,
+                                       hash_key,
+                                       key, sizeof(*key));
+
+   if (translate)
+      return translate;
+
+   /* Miss: build a new translate.  Only cache it when creation
+    * succeeded, so a failure never leaves a NULL entry in the hash. */
+   translate = translate_create(key);
+   if (translate)
+      cso_hash_insert(cache->hash, hash_key, translate);
+
+   return translate;
+}
diff --git a/src/gallium/auxiliary/translate/translate_cache.h b/src/gallium/auxiliary/translate/translate_cache.h
new file mode 100644
index 00000000000..7dba871e579
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate_cache.h
@@ -0,0 +1,54 @@
+/*
+ * Copyright 2008 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+#ifndef _TRANSLATE_CACHE_H
+#define _TRANSLATE_CACHE_H
+
+
+/*******************************************************************************
+ * Translate cache.
+ * Simply used to cache created translates. Avoids unnecessary creation of
+ * translate's if one suitable for a given translate_key has already been
+ * created.
+ *
+ * Note: this functionality depends on and requires the CSO module.
+ */
+struct translate_cache;
+
+struct translate_key;
+struct translate;
+
+struct translate_cache *translate_cache_create( void );
+void translate_cache_destroy(struct translate_cache *cache);
+
+/**
+ * Will try to find a translate structure matched by the given key.
+ * If such a structure doesn't exist in the cache the function
+ * will automatically create it, insert it in the cache and
+ * return the created version.
+ *
+ */
+struct translate *translate_cache_find(struct translate_cache *cache,
+                                       struct translate_key *key);
+
+#endif
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
new file mode 100644
index 00000000000..8d39b64c6c1
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -0,0 +1,700 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <[email protected]>
+  */
+
+#include "util/u_memory.h"
+#include "pipe/p_state.h"
+#include "translate.h"
+
+
+#define DRAW_DBG 0
+
+typedef void (*fetch_func)(const void *ptr, float *attrib);
+typedef void (*emit_func)(const float *attrib, void *ptr);
+
+
+
+/**
+ * Generic (plain C) translate implementation.  The base struct comes
+ * first so a struct translate pointer can be cast to this type.
+ *
+ * NOTE(review): attrib[] holds PIPE_MAX_ATTRIBS entries while
+ * translate_key::element[] holds PIPE_MAX_ATTRIBS + 1 -- confirm the
+ * creation code bounds nr_attrib accordingly.
+ */
+struct translate_generic {
+   struct translate translate;
+
+   /* Per-attribute state. */
+   struct {
+      fetch_func fetch;          /* source data -> float attribute */
+      unsigned buffer;           /* presumably the input buffer index */
+      unsigned input_offset;
+
+      emit_func emit;            /* float attribute -> destination data */
+      unsigned output_offset;
+      
+      char *input_ptr;           /* presumably set through set_buffer() */
+      unsigned input_stride;
+
+   } attrib[PIPE_MAX_ATTRIBS];
+
+   unsigned nr_attrib;           /* number of attrib[] entries in use */
+};
+
+
+/** Downcast a base translate pointer to the generic implementation. */
+static struct translate_generic *translate_generic( struct translate *translate )
+{
+   struct translate_generic *tg = (struct translate_generic *) translate;
+
+   return tg;
+}
+
+/**
+ * ATTRIB( NAME, SZ, TYPE, FROM, TO ) defines a fetch/emit function pair.
+ *
+ * fetch_NAME reads SZ components through FROM() into a float[4] and
+ * fills the remaining components from (0, 0, 0, 1).  emit_NAME converts
+ * the first SZ floats through TO() and stores them as TYPE.
+ *
+ * This is probably needed/duplicated elsewhere, eg format
+ * conversion, texture sampling etc.
+ */
+#define ATTRIB( NAME, SZ, TYPE, FROM, TO )		\
+static void						\
+fetch_##NAME(const void *ptr, float *attrib)		\
+{							\
+   const float fallback[4] = { 0.0f,0.0f,0.0f,1.0f };	\
+   unsigned c;						\
+							\
+   for (c = 0; c < 4; c++) {				\
+      if (c < SZ)					\
+         attrib[c] = FROM(c);				\
+      else						\
+         attrib[c] = fallback[c];			\
+   }							\
+}							\
+							\
+static void						\
+emit_##NAME(const float *attrib, void *ptr)		\
+{							\
+   TYPE *dst = (TYPE *)ptr;				\
+   unsigned c;						\
+							\
+   for (c = 0; c < SZ; c++) {				\
+      dst[c] = TO(attrib[c]);				\
+   }							\
+}
+
+
+/*
+ * Per-component conversion helpers used by the ATTRIB() expansions.
+ * FROM_*(i) reads component i through the enclosing function's 'ptr'
+ * argument and yields a float; TO_*(x) converts the float x to the
+ * storage type.
+ *
+ * - The 8-bit signed variants use 'signed char', because the signedness
+ *   of plain 'char' is implementation-defined.
+ * - TO_32_UNORM / TO_32_SNORM scale in double: as floats the scale
+ *   factors round up to 2^32 / 2^31, so x == 1.0f produced a value
+ *   outside the range of the integer type.
+ * - Macro arguments are parenthesized.
+ *
+ * Out-of-range inputs are not clamped.
+ */
+#define FROM_64_FLOAT(i)   ((float) ((const double *) ptr)[i])
+#define FROM_32_FLOAT(i)   (((const float *) ptr)[i])
+
+#define FROM_8_USCALED(i)  ((float) ((const unsigned char *) ptr)[i])
+#define FROM_16_USCALED(i) ((float) ((const unsigned short *) ptr)[i])
+#define FROM_32_USCALED(i) ((float) ((const unsigned int *) ptr)[i])
+
+#define FROM_8_SSCALED(i)  ((float) ((const signed char *) ptr)[i])
+#define FROM_16_SSCALED(i) ((float) ((const short *) ptr)[i])
+#define FROM_32_SSCALED(i) ((float) ((const int *) ptr)[i])
+
+#define FROM_8_UNORM(i)    ((float) ((const unsigned char *) ptr)[i] / 255.0f)
+#define FROM_16_UNORM(i)   ((float) ((const unsigned short *) ptr)[i] / 65535.0f)
+#define FROM_32_UNORM(i)   ((float) ((const unsigned int *) ptr)[i] / 4294967295.0f)
+
+#define FROM_8_SNORM(i)    ((float) ((const signed char *) ptr)[i] / 127.0f)
+#define FROM_16_SNORM(i)   ((float) ((const short *) ptr)[i] / 32767.0f)
+#define FROM_32_SNORM(i)   ((float) ((const int *) ptr)[i] / 2147483647.0f)
+
+#define FROM_32_FIXED(i)   (((const int *) ptr)[i] / 65536.0f)
+
+#define TO_64_FLOAT(x)   ((double) (x))
+#define TO_32_FLOAT(x)   (x)
+
+#define TO_8_USCALED(x)  ((unsigned char) (x))
+#define TO_16_USCALED(x) ((unsigned short) (x))
+#define TO_32_USCALED(x) ((unsigned int) (x))
+
+#define TO_8_SSCALED(x)  ((signed char) (x))
+#define TO_16_SSCALED(x) ((short) (x))
+#define TO_32_SSCALED(x) ((int) (x))
+
+#define TO_8_UNORM(x)    ((unsigned char) ((x) * 255.0f))
+#define TO_16_UNORM(x)   ((unsigned short) ((x) * 65535.0f))
+#define TO_32_UNORM(x)   ((unsigned int) ((double) (x) * 4294967295.0))
+
+#define TO_8_SNORM(x)    ((signed char) ((x) * 127.0f))
+#define TO_16_SNORM(x)   ((short) ((x) * 32767.0f))
+#define TO_32_SNORM(x)   ((int) ((double) (x) * 2147483647.0))
+
+#define TO_32_FIXED(x)   ((int) ((x) * 65536.0f))
+
+
+
+/*
+ * Instantiate a fetch_<NAME>/emit_<NAME> pair for every format handled
+ * through the generic ATTRIB() expansion.  Arguments are: format name,
+ * component count, storage type, and the FROM_ / TO_ conversion macros.
+ */
+ATTRIB( R64G64B64A64_FLOAT,   4, double, FROM_64_FLOAT, TO_64_FLOAT )
+ATTRIB( R64G64B64_FLOAT,      3, double, FROM_64_FLOAT, TO_64_FLOAT )
+ATTRIB( R64G64_FLOAT,         2, double, FROM_64_FLOAT, TO_64_FLOAT )
+ATTRIB( R64_FLOAT,            1, double, FROM_64_FLOAT, TO_64_FLOAT )
+
+ATTRIB( R32G32B32A32_FLOAT,   4, float, FROM_32_FLOAT, TO_32_FLOAT )
+ATTRIB( R32G32B32_FLOAT,      3, float, FROM_32_FLOAT, TO_32_FLOAT )
+ATTRIB( R32G32_FLOAT,         2, float, FROM_32_FLOAT, TO_32_FLOAT )
+ATTRIB( R32_FLOAT,            1, float, FROM_32_FLOAT, TO_32_FLOAT )
+
+ATTRIB( R32G32B32A32_USCALED, 4, unsigned, FROM_32_USCALED, TO_32_USCALED )
+ATTRIB( R32G32B32_USCALED,    3, unsigned, FROM_32_USCALED, TO_32_USCALED )
+ATTRIB( R32G32_USCALED,       2, unsigned, FROM_32_USCALED, TO_32_USCALED )
+ATTRIB( R32_USCALED,          1, unsigned, FROM_32_USCALED, TO_32_USCALED )
+
+ATTRIB( R32G32B32A32_SSCALED, 4, int, FROM_32_SSCALED, TO_32_SSCALED )
+ATTRIB( R32G32B32_SSCALED,    3, int, FROM_32_SSCALED, TO_32_SSCALED )
+ATTRIB( R32G32_SSCALED,       2, int, FROM_32_SSCALED, TO_32_SSCALED )
+ATTRIB( R32_SSCALED,          1, int, FROM_32_SSCALED, TO_32_SSCALED )
+
+ATTRIB( R32G32B32A32_UNORM, 4, unsigned, FROM_32_UNORM, TO_32_UNORM )
+ATTRIB( R32G32B32_UNORM,    3, unsigned, FROM_32_UNORM, TO_32_UNORM )
+ATTRIB( R32G32_UNORM,       2, unsigned, FROM_32_UNORM, TO_32_UNORM )
+ATTRIB( R32_UNORM,          1, unsigned, FROM_32_UNORM, TO_32_UNORM )
+
+ATTRIB( R32G32B32A32_SNORM, 4, int, FROM_32_SNORM, TO_32_SNORM )
+ATTRIB( R32G32B32_SNORM,    3, int, FROM_32_SNORM, TO_32_SNORM )
+ATTRIB( R32G32_SNORM,       2, int, FROM_32_SNORM, TO_32_SNORM )
+ATTRIB( R32_SNORM,          1, int, FROM_32_SNORM, TO_32_SNORM )
+
+ATTRIB( R16G16B16A16_USCALED, 4, ushort, FROM_16_USCALED, TO_16_USCALED )
+ATTRIB( R16G16B16_USCALED,    3, ushort, FROM_16_USCALED, TO_16_USCALED )
+ATTRIB( R16G16_USCALED,       2, ushort, FROM_16_USCALED, TO_16_USCALED )
+ATTRIB( R16_USCALED,          1, ushort, FROM_16_USCALED, TO_16_USCALED )
+
+ATTRIB( R16G16B16A16_SSCALED, 4, short, FROM_16_SSCALED, TO_16_SSCALED )
+ATTRIB( R16G16B16_SSCALED,    3, short, FROM_16_SSCALED, TO_16_SSCALED )
+ATTRIB( R16G16_SSCALED,       2, short, FROM_16_SSCALED, TO_16_SSCALED )
+ATTRIB( R16_SSCALED,          1, short, FROM_16_SSCALED, TO_16_SSCALED )
+
+ATTRIB( R16G16B16A16_UNORM, 4, ushort, FROM_16_UNORM, TO_16_UNORM )
+ATTRIB( R16G16B16_UNORM,    3, ushort, FROM_16_UNORM, TO_16_UNORM )
+ATTRIB( R16G16_UNORM,       2, ushort, FROM_16_UNORM, TO_16_UNORM )
+ATTRIB( R16_UNORM,          1, ushort, FROM_16_UNORM, TO_16_UNORM )
+
+ATTRIB( R16G16B16A16_SNORM, 4, short, FROM_16_SNORM, TO_16_SNORM )
+ATTRIB( R16G16B16_SNORM,    3, short, FROM_16_SNORM, TO_16_SNORM )
+ATTRIB( R16G16_SNORM,       2, short, FROM_16_SNORM, TO_16_SNORM )
+ATTRIB( R16_SNORM,          1, short, FROM_16_SNORM, TO_16_SNORM )
+
+ATTRIB( R8G8B8A8_USCALED,   4, ubyte, FROM_8_USCALED, TO_8_USCALED )
+ATTRIB( R8G8B8_USCALED,     3, ubyte, FROM_8_USCALED, TO_8_USCALED )
+ATTRIB( R8G8_USCALED,       2, ubyte, FROM_8_USCALED, TO_8_USCALED )
+ATTRIB( R8_USCALED,         1, ubyte, FROM_8_USCALED, TO_8_USCALED )
+
+ATTRIB( R8G8B8A8_SSCALED,  4, char, FROM_8_SSCALED, TO_8_SSCALED )
+ATTRIB( R8G8B8_SSCALED,    3, char, FROM_8_SSCALED, TO_8_SSCALED )
+ATTRIB( R8G8_SSCALED,      2, char, FROM_8_SSCALED, TO_8_SSCALED )
+ATTRIB( R8_SSCALED,        1, char, FROM_8_SSCALED, TO_8_SSCALED )
+
+ATTRIB( R8G8B8A8_UNORM,  4, ubyte, FROM_8_UNORM, TO_8_UNORM )
+ATTRIB( R8G8B8_UNORM,    3, ubyte, FROM_8_UNORM, TO_8_UNORM )
+ATTRIB( R8G8_UNORM,      2, ubyte, FROM_8_UNORM, TO_8_UNORM )
+ATTRIB( R8_UNORM,        1, ubyte, FROM_8_UNORM, TO_8_UNORM )
+
+ATTRIB( R8G8B8A8_SNORM,  4, char, FROM_8_SNORM, TO_8_SNORM )
+ATTRIB( R8G8B8_SNORM,    3, char, FROM_8_SNORM, TO_8_SNORM )
+ATTRIB( R8G8_SNORM,      2, char, FROM_8_SNORM, TO_8_SNORM )
+ATTRIB( R8_SNORM,        1, char, FROM_8_SNORM, TO_8_SNORM )
+
+/* NOTE(review): A8R8G8B8_UNORM uses the plain in-order expansion (no
+ * component reordering), unlike the hand-written B8G8R8A8_UNORM
+ * functions further down -- confirm this is intended. */
+ATTRIB( A8R8G8B8_UNORM,       4, ubyte, FROM_8_UNORM, TO_8_UNORM )
+//ATTRIB( R8G8B8A8_UNORM,       4, ubyte, FROM_8_UNORM, TO_8_UNORM )
+
+/* The 16.16 fixed-point variants use the FROM_32_FIXED / TO_32_FIXED scale of 65536. */
+ATTRIB( R32G32B32A32_FIXED,   4, int, FROM_32_FIXED, TO_32_FIXED )
+ATTRIB( R32G32B32_FIXED,      3, int, FROM_32_FIXED, TO_32_FIXED )
+ATTRIB( R32G32_FIXED,         2, int, FROM_32_FIXED, TO_32_FIXED )
+ATTRIB( R32_FIXED,            1, int, FROM_32_FIXED, TO_32_FIXED )
+
+
+
+/**
+ * Fetch four 8-bit UNORM components, storing source bytes 0 and 2
+ * swapped (byte 0 -> attrib[2], byte 2 -> attrib[0]); bytes 1 and 3
+ * keep their positions.
+ */
+static void
+fetch_B8G8R8A8_UNORM(const void *ptr, float *attrib)
+{
+   static const unsigned dst_index[4] = { 2, 1, 0, 3 };
+   unsigned src;
+
+   for (src = 0; src < 4; src++) {
+      attrib[dst_index[src]] = FROM_8_UNORM(src);
+   }
+}
+
+/**
+ * Emit four floats as 8-bit UNORM bytes, with attrib[0] and attrib[2]
+ * written to output bytes 2 and 0 respectively; components 1 and 3
+ * keep their positions.
+ */
+static void
+emit_B8G8R8A8_UNORM( const float *attrib, void *ptr)
+{
+   static const unsigned dst_index[4] = { 2, 1, 0, 3 };
+   ubyte *bytes = (ubyte *)ptr;
+   unsigned src;
+
+   for (src = 0; src < 4; src++) {
+      bytes[dst_index[src]] = TO_8_UNORM(attrib[src]);
+   }
+}
+
+/**
+ * Fallback fetch: ignores the source pointer and writes (0, 0, 0, 1).
+ */
+static void 
+fetch_NULL( const void *ptr, float *attrib )
+{
+   unsigned c;
+
+   for (c = 0; c < 3; c++) {
+      attrib[c] = 0;
+   }
+   attrib[3] = 1;
+}
+
+/**
+ * Fallback emit: writes nothing, which is the only sensible option.
+ */
+static void 
+emit_NULL( const float *attrib, void *ptr )
+{
+   (void) attrib;
+   (void) ptr;
+}
+
+/**
+ * Map a pipe format to the function that fetches it into a float[4].
+ * Unhandled formats trip an assert and fall back to fetch_NULL.
+ */
+static fetch_func get_fetch_func( enum pipe_format format )
+{
+#define FETCH_CASE( NAME ) case PIPE_FORMAT_##NAME: return &fetch_##NAME
+
+   switch (format) {
+   FETCH_CASE( R64_FLOAT );
+   FETCH_CASE( R64G64_FLOAT );
+   FETCH_CASE( R64G64B64_FLOAT );
+   FETCH_CASE( R64G64B64A64_FLOAT );
+
+   FETCH_CASE( R32_FLOAT );
+   FETCH_CASE( R32G32_FLOAT );
+   FETCH_CASE( R32G32B32_FLOAT );
+   FETCH_CASE( R32G32B32A32_FLOAT );
+
+   FETCH_CASE( R32_UNORM );
+   FETCH_CASE( R32G32_UNORM );
+   FETCH_CASE( R32G32B32_UNORM );
+   FETCH_CASE( R32G32B32A32_UNORM );
+
+   FETCH_CASE( R32_USCALED );
+   FETCH_CASE( R32G32_USCALED );
+   FETCH_CASE( R32G32B32_USCALED );
+   FETCH_CASE( R32G32B32A32_USCALED );
+
+   FETCH_CASE( R32_SNORM );
+   FETCH_CASE( R32G32_SNORM );
+   FETCH_CASE( R32G32B32_SNORM );
+   FETCH_CASE( R32G32B32A32_SNORM );
+
+   FETCH_CASE( R32_SSCALED );
+   FETCH_CASE( R32G32_SSCALED );
+   FETCH_CASE( R32G32B32_SSCALED );
+   FETCH_CASE( R32G32B32A32_SSCALED );
+
+   FETCH_CASE( R16_UNORM );
+   FETCH_CASE( R16G16_UNORM );
+   FETCH_CASE( R16G16B16_UNORM );
+   FETCH_CASE( R16G16B16A16_UNORM );
+
+   FETCH_CASE( R16_USCALED );
+   FETCH_CASE( R16G16_USCALED );
+   FETCH_CASE( R16G16B16_USCALED );
+   FETCH_CASE( R16G16B16A16_USCALED );
+
+   FETCH_CASE( R16_SNORM );
+   FETCH_CASE( R16G16_SNORM );
+   FETCH_CASE( R16G16B16_SNORM );
+   FETCH_CASE( R16G16B16A16_SNORM );
+
+   FETCH_CASE( R16_SSCALED );
+   FETCH_CASE( R16G16_SSCALED );
+   FETCH_CASE( R16G16B16_SSCALED );
+   FETCH_CASE( R16G16B16A16_SSCALED );
+
+   FETCH_CASE( R8_UNORM );
+   FETCH_CASE( R8G8_UNORM );
+   FETCH_CASE( R8G8B8_UNORM );
+   FETCH_CASE( R8G8B8A8_UNORM );
+
+   FETCH_CASE( R8_USCALED );
+   FETCH_CASE( R8G8_USCALED );
+   FETCH_CASE( R8G8B8_USCALED );
+   FETCH_CASE( R8G8B8A8_USCALED );
+
+   FETCH_CASE( R8_SNORM );
+   FETCH_CASE( R8G8_SNORM );
+   FETCH_CASE( R8G8B8_SNORM );
+   FETCH_CASE( R8G8B8A8_SNORM );
+
+   FETCH_CASE( R8_SSCALED );
+   FETCH_CASE( R8G8_SSCALED );
+   FETCH_CASE( R8G8B8_SSCALED );
+   FETCH_CASE( R8G8B8A8_SSCALED );
+
+   FETCH_CASE( A8R8G8B8_UNORM );
+
+   FETCH_CASE( B8G8R8A8_UNORM );
+
+   FETCH_CASE( R32_FIXED );
+   FETCH_CASE( R32G32_FIXED );
+   FETCH_CASE( R32G32B32_FIXED );
+   FETCH_CASE( R32G32B32A32_FIXED );
+
+   default:
+      assert(0);
+      return &fetch_NULL;
+   }
+
+#undef FETCH_CASE
+}
+
+
+
+
+/**
+ * Map a pipe format to the function that emits a float[4] attribute in
+ * that format.  Unhandled formats trip an assert and fall back to
+ * emit_NULL.
+ *
+ * The R32*_FIXED formats are dispatched to their generated emit
+ * functions, mirroring get_fetch_func(); previously those functions
+ * were only referenced through (void) casts to silence unused-function
+ * warnings.
+ */
+static emit_func get_emit_func( enum pipe_format format )
+{
+#define EMIT_CASE( NAME ) case PIPE_FORMAT_##NAME: return &emit_##NAME
+
+   switch (format) {
+   EMIT_CASE( R64_FLOAT );
+   EMIT_CASE( R64G64_FLOAT );
+   EMIT_CASE( R64G64B64_FLOAT );
+   EMIT_CASE( R64G64B64A64_FLOAT );
+
+   EMIT_CASE( R32_FLOAT );
+   EMIT_CASE( R32G32_FLOAT );
+   EMIT_CASE( R32G32B32_FLOAT );
+   EMIT_CASE( R32G32B32A32_FLOAT );
+
+   EMIT_CASE( R32_UNORM );
+   EMIT_CASE( R32G32_UNORM );
+   EMIT_CASE( R32G32B32_UNORM );
+   EMIT_CASE( R32G32B32A32_UNORM );
+
+   EMIT_CASE( R32_USCALED );
+   EMIT_CASE( R32G32_USCALED );
+   EMIT_CASE( R32G32B32_USCALED );
+   EMIT_CASE( R32G32B32A32_USCALED );
+
+   EMIT_CASE( R32_SNORM );
+   EMIT_CASE( R32G32_SNORM );
+   EMIT_CASE( R32G32B32_SNORM );
+   EMIT_CASE( R32G32B32A32_SNORM );
+
+   EMIT_CASE( R32_SSCALED );
+   EMIT_CASE( R32G32_SSCALED );
+   EMIT_CASE( R32G32B32_SSCALED );
+   EMIT_CASE( R32G32B32A32_SSCALED );
+
+   EMIT_CASE( R16_UNORM );
+   EMIT_CASE( R16G16_UNORM );
+   EMIT_CASE( R16G16B16_UNORM );
+   EMIT_CASE( R16G16B16A16_UNORM );
+
+   EMIT_CASE( R16_USCALED );
+   EMIT_CASE( R16G16_USCALED );
+   EMIT_CASE( R16G16B16_USCALED );
+   EMIT_CASE( R16G16B16A16_USCALED );
+
+   EMIT_CASE( R16_SNORM );
+   EMIT_CASE( R16G16_SNORM );
+   EMIT_CASE( R16G16B16_SNORM );
+   EMIT_CASE( R16G16B16A16_SNORM );
+
+   EMIT_CASE( R16_SSCALED );
+   EMIT_CASE( R16G16_SSCALED );
+   EMIT_CASE( R16G16B16_SSCALED );
+   EMIT_CASE( R16G16B16A16_SSCALED );
+
+   EMIT_CASE( R8_UNORM );
+   EMIT_CASE( R8G8_UNORM );
+   EMIT_CASE( R8G8B8_UNORM );
+   EMIT_CASE( R8G8B8A8_UNORM );
+
+   EMIT_CASE( R8_USCALED );
+   EMIT_CASE( R8G8_USCALED );
+   EMIT_CASE( R8G8B8_USCALED );
+   EMIT_CASE( R8G8B8A8_USCALED );
+
+   EMIT_CASE( R8_SNORM );
+   EMIT_CASE( R8G8_SNORM );
+   EMIT_CASE( R8G8B8_SNORM );
+   EMIT_CASE( R8G8B8A8_SNORM );
+
+   EMIT_CASE( R8_SSCALED );
+   EMIT_CASE( R8G8_SSCALED );
+   EMIT_CASE( R8G8B8_SSCALED );
+   EMIT_CASE( R8G8B8A8_SSCALED );
+
+   EMIT_CASE( A8R8G8B8_UNORM );
+
+   EMIT_CASE( B8G8R8A8_UNORM );
+
+   EMIT_CASE( R32_FIXED );
+   EMIT_CASE( R32G32_FIXED );
+   EMIT_CASE( R32G32B32_FIXED );
+   EMIT_CASE( R32G32B32A32_FIXED );
+
+   default:
+      assert(0);
+      return &emit_NULL;
+   }
+
+#undef EMIT_CASE
+}
+
+
+
+/**
+ * Translate 'count' vertices picked through the index list 'elts'.
+ *
+ * For each index, every attribute is fetched from its input buffer into
+ * a float[4] temporary and then emitted at the attribute's offset within
+ * the current output vertex.  Successive output vertices are
+ * key.output_stride bytes apart in 'output_buffer'.
+ */
+static void PIPE_CDECL generic_run_elts( struct translate *translate,
+                                         const unsigned *elts,
+                                         unsigned count,
+                                         void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   const unsigned nr_attrs = tg->nr_attrib;
+   char *vert = output_buffer;
+   unsigned i, attr;
+
+   /* one pass per output vertex */
+   for (i = 0; i < count; i++) {
+      const unsigned elt = elts[i];
+
+      /* one pass per vertex attribute (vertex shader input) */
+      for (attr = 0; attr < nr_attrs; attr++) {
+         const char *src = (tg->attrib[attr].input_ptr +
+                            tg->attrib[attr].input_stride * elt);
+         char *dst = vert + tg->attrib[attr].output_offset;
+         float data[4];
+
+         tg->attrib[attr].fetch( src, data );
+
+         if (0) debug_printf("vert %d/%d attr %d: %f %f %f %f\n",
+                             i, elt, attr, data[0], data[1], data[2], data[3]);
+
+         tg->attrib[attr].emit( data, dst );
+      }
+
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+
+
+/**
+ * Translate 'count' consecutive vertices, beginning at index 'start'.
+ *
+ * Same per-attribute fetch/emit sequence as generic_run_elts(), except
+ * that the source vertex index is simply start + i.
+ */
+static void PIPE_CDECL generic_run( struct translate *translate,
+                                    unsigned start,
+                                    unsigned count,
+                                    void *output_buffer )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   const unsigned nr_attrs = tg->nr_attrib;
+   char *vert = output_buffer;
+   unsigned i, attr;
+
+   /* one pass per output vertex */
+   for (i = 0; i < count; i++) {
+      const unsigned elt = start + i;
+
+      /* one pass per vertex attribute (vertex shader input) */
+      for (attr = 0; attr < nr_attrs; attr++) {
+         const char *src = (tg->attrib[attr].input_ptr +
+                            tg->attrib[attr].input_stride * elt);
+         char *dst = vert + tg->attrib[attr].output_offset;
+         float data[4];
+
+         tg->attrib[attr].fetch( src, data );
+
+         if (0) debug_printf("vert %d attr %d: %f %f %f %f\n",
+                             i, attr, data[0], data[1], data[2], data[3]);
+
+         tg->attrib[attr].emit( data, dst );
+      }
+
+      vert += tg->translate.key.output_stride;
+   }
+}
+
+
+			       
+/**
+ * Bind a vertex buffer to every attribute that sources from slot 'buf'.
+ *
+ * Each matching attribute gets its input pointer set to 'ptr' advanced
+ * by the attribute's own input_offset, and records 'stride' as its
+ * input stride.  Attributes using other buffer slots are left untouched.
+ */
+static void generic_set_buffer( struct translate *translate,
+                                unsigned buf,
+                                const void *ptr,
+                                unsigned stride )
+{
+   struct translate_generic *tg = translate_generic(translate);
+   unsigned attr;
+
+   for (attr = 0; attr < tg->nr_attrib; attr++) {
+      if (tg->attrib[attr].buffer != buf)
+         continue;
+
+      tg->attrib[attr].input_ptr = ((char *)ptr +
+                                    tg->attrib[attr].input_offset);
+      tg->attrib[attr].input_stride = stride;
+   }
+}
+
+
+static void generic_release( struct translate *translate )
+{
+   /* Frees the object immediately.  No reference counting is done
+    * here; the original author left that as an open question
+    * ("Refcount?").
+    */
+   FREE(translate);
+}
+
+/**
+ * Create the generic translate object for 'key'.
+ *
+ * Copies the key, installs the generic callbacks and, for each key
+ * element, looks up the fetch and emit functions matching the element's
+ * input and output formats.
+ *
+ * \return the new translate object, or NULL if allocation fails.
+ */
+struct translate *translate_generic_create( const struct translate_key *key )
+{
+   struct translate_generic *tg = CALLOC_STRUCT(translate_generic);
+   unsigned i;
+
+   if (!tg)
+      return NULL;
+
+   tg->translate.key = *key;
+
+   /* callbacks */
+   tg->translate.run = generic_run;
+   tg->translate.run_elts = generic_run_elts;
+   tg->translate.set_buffer = generic_set_buffer;
+   tg->translate.release = generic_release;
+
+   for (i = 0; i < key->nr_elements; i++) {
+      const struct translate_element *el = &key->element[i];
+
+      /* input side */
+      tg->attrib[i].fetch = get_fetch_func(el->input_format);
+      tg->attrib[i].buffer = el->input_buffer;
+      tg->attrib[i].input_offset = el->input_offset;
+
+      /* output side */
+      tg->attrib[i].emit = get_emit_func(el->output_format);
+      tg->attrib[i].output_offset = el->output_offset;
+   }
+
+   tg->nr_attrib = key->nr_elements;
+
+   return &tg->translate;
+}
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
new file mode 100644
index 00000000000..b62db8d8f33
--- /dev/null
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -0,0 +1,706 @@
+/*
+ * Copyright 2003 Tungsten Graphics, inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * on the rights to use, copy, modify, merge, publish, distribute, sub
+ * license, and/or sell copies of the Software, and to permit persons to whom
+ * the Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
+ * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
+ * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
+ * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
+ * USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ * Authors:
+ *    Keith Whitwell <[email protected]>
+ */
+
+
+#include "pipe/p_config.h"
+#include "pipe/p_compiler.h"
+#include "util/u_memory.h"
+#include "util/u_math.h"
+
+#include "translate.h"
+
+
+#if defined(PIPE_ARCH_X86)
+
+#include "rtasm/rtasm_cpu.h"
+#include "rtasm/rtasm_x86sse.h"
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+
+typedef void (PIPE_CDECL *run_func)( struct translate *translate,
+                                     unsigned start,
+                                     unsigned count,
+                                     void *output_buffer ); /* type of translate_sse::gen_run,
+                                                             * the generated linear entry point */
+
+typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
+                                          const unsigned *elts,
+                                          unsigned count,
+                                          void *output_buffer ); /* type of translate_sse::gen_run_elts,
+                                                                  * the generated indexed entry point */
+
+struct translate_buffer {
+   const void *base_ptr;        /* buffer start, as passed to set_buffer */
+   unsigned stride;             /* bytes between consecutive vertices */
+   void *ptr;                   /* updated per vertex; only written by generated
+                                 * linear code when more than one buffer is in use */
+};
+
+
+struct translate_sse {
+   struct translate translate;  /* base object; the callbacks cast a
+                                 * struct translate * back to this type */
+
+   struct x86_function linear_func;   /* code generated for run() */
+   struct x86_function elt_func;      /* code generated for run_elts() */
+   struct x86_function *func;         /* function currently being generated */
+
+   boolean loaded_identity;     /* the loaded_* flags record whether the load of */
+   boolean loaded_255;          /* the matching constant below has already been  */
+   boolean loaded_inv_255;      /* emitted into the function being generated     */
+
+   float identity[4];           /* {0, 0, 0, 1} */
+   float float_255[4];          /* 255.0 in every lane */
+   float inv_255[4];            /* 1/255 in every lane */
+
+   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
+   unsigned nr_buffers;         /* highest input_buffer index in the key, plus one */
+
+   run_func      gen_run;       /* entry point of linear_func */
+   run_elts_func gen_run_elts;  /* entry point of elt_func */
+
+   /* these are actually known values, but putting them in a struct
+    * like this is helpful to keep them in sync across the file.
+    */
+   struct x86_reg tmp_EAX;      /* scratch / computed source pointer */
+   struct x86_reg idx_EBX;     /* linear: start index, replaced by the source
+                                * pointer when there is a single buffer;
+                                * elts: pointer to the current index, &elt[i] */
+   struct x86_reg outbuf_ECX;   /* current output vertex */
+   struct x86_reg machine_EDX;  /* this translate_sse object */
+   struct x86_reg count_ESI;    /* decrements to zero */
+};
+
+/* Byte distance from 'a' to 'b', i.e. b - a, truncated to int. */
+static int get_offset( const void *a, const void *b )
+{
+   const char *from = a;
+   const char *to = b;
+
+   return to - from;
+}
+
+
+
+/* Return the XMM register (number 6) reserved for the {0, 0, 0, 1}
+ * constant.
+ *
+ * The first call after the loaded flag was cleared stores the constant
+ * in the translate_sse object and emits a movups that loads it,
+ * addressed relative to machine_EDX.  Later calls emit nothing.
+ */
+static struct x86_reg get_identity( struct translate_sse *p )
+{
+   struct x86_reg reg = x86_make_reg(file_XMM, 6);
+   struct x86_reg mem;
+
+   if (p->loaded_identity)
+      return reg;
+
+   p->loaded_identity = TRUE;
+
+   p->identity[X] = 0;
+   p->identity[Y] = 0;
+   p->identity[Z] = 0;
+   p->identity[W] = 1;
+
+   mem = x86_make_disp(p->machine_EDX, get_offset(p, &p->identity[0]));
+   sse_movups(p->func, reg, mem);
+
+   return reg;
+}
+
+/* Return the XMM register (number 7) reserved for the constant 255.0
+ * replicated in all four lanes.
+ *
+ * As with get_identity(), the constant is written to the object and
+ * its load is emitted only on the first call after the loaded flag was
+ * cleared.
+ */
+static struct x86_reg get_255( struct translate_sse *p )
+{
+   struct x86_reg reg = x86_make_reg(file_XMM, 7);
+   struct x86_reg mem;
+   unsigned lane;
+
+   if (p->loaded_255)
+      return reg;
+
+   p->loaded_255 = TRUE;
+
+   for (lane = 0; lane < 4; lane++)
+      p->float_255[lane] = 255.0f;
+
+   mem = x86_make_disp(p->machine_EDX, get_offset(p, &p->float_255[0]));
+   sse_movups(p->func, reg, mem);
+
+   return reg;
+}
+
+/* Return the XMM register (number 5) reserved for the constant 1/255
+ * replicated in all four lanes.
+ *
+ * As with get_identity(), the constant is written to the object and
+ * its load is emitted only on the first call after the loaded flag was
+ * cleared.
+ */
+static struct x86_reg get_inv_255( struct translate_sse *p )
+{
+   struct x86_reg reg = x86_make_reg(file_XMM, 5);
+   struct x86_reg mem;
+   unsigned lane;
+
+   if (p->loaded_inv_255)
+      return reg;
+
+   p->loaded_inv_255 = TRUE;
+
+   for (lane = 0; lane < 4; lane++)
+      p->inv_255[lane] = 1.0f / 255.0f;
+
+   mem = x86_make_disp(p->machine_EDX, get_offset(p, &p->inv_255[0]));
+   sse_movups(p->func, reg, mem);
+
+   return reg;
+}
+
+
+/* Emit a movups that loads all four floats at 'arg0' into 'data'. */
+static void emit_load_R32G32B32A32( struct translate_sse *p,
+                                    struct x86_reg data,
+                                    struct x86_reg arg0 )
+{
+   struct x86_function *func = p->func;
+
+   sse_movups(func, data, arg0);
+}
+
+/* Emit code loading three floats (a, b, c) at 'arg0' into 'data' as
+ * (a, b, c, 1).  The third float is loaded on its own so that nothing
+ * beyond the 12 bytes of the attribute is read.
+ */
+static void emit_load_R32G32B32( struct translate_sse *p,
+                                 struct x86_reg data,
+                                 struct x86_reg arg0 )
+{
+   struct x86_function *func = p->func;
+   struct x86_reg identity;
+
+   /* c 0 0 0 */
+   sse_movss(func, data, x86_make_disp(arg0, 8));
+
+   /* c 0 0 1  (the identity load, if still needed, is emitted here) */
+   identity = get_identity(p);
+   sse_shufps(func, data, identity, SHUF(X,Y,Z,W) );
+
+   /* 0 0 c 1 */
+   sse_shufps(func, data, data, SHUF(Y,Z,X,W) );
+
+   /* a b c 1 */
+   sse_movlps(func, data, arg0);
+}
+
+/* Emit code loading two floats (a, b) at 'arg0' into 'data' as
+ * (a, b, 0, 1).
+ */
+static void emit_load_R32G32( struct translate_sse *p,
+                              struct x86_reg data,
+                              struct x86_reg arg0 )
+{
+   struct x86_function *func = p->func;
+   struct x86_reg identity = get_identity(p);
+
+   /* 0 0 0 1 */
+   sse_movups(func, data, identity);
+
+   /* a b 0 1 */
+   sse_movlps(func, data, arg0);
+}
+
+
+/* Emit code loading a single float 'a' at 'arg0' into 'data' as
+ * (a, 0, 0, 1).
+ */
+static void emit_load_R32( struct translate_sse *p,
+                           struct x86_reg data,
+                           struct x86_reg arg0 )
+{
+   struct x86_function *func = p->func;
+   struct x86_reg identity;
+
+   /* a 0 0 0 */
+   sse_movss(func, data, arg0);
+
+   /* a 0 0 1  -- OR in the 1.0 from the identity constant */
+   identity = get_identity(p);
+   sse_orps(func, data, identity);
+}
+
+
+/* Emit code loading four unsigned bytes at 'src' into 'data' as four
+ * floats scaled by 1/255.
+ *
+ * The identity register is used as the second operand of the byte
+ * unpacks: its low two floats are 0.0, so its low eight bytes are all
+ * zero, which is what interleaving needs to widen bytes to words and
+ * then to dwords.
+ */
+static void emit_load_R8G8B8A8_UNORM( struct translate_sse *p,
+                                      struct x86_reg data,
+                                      struct x86_reg src )
+{
+   struct x86_function *func = p->func;
+   struct x86_reg zeros;
+   struct x86_reg scale;
+
+   /* Load the four bytes, then unpack twice */
+   sse_movss(func, data, src);
+   zeros = get_identity(p);
+   sse2_punpcklbw(func, data, zeros);
+   sse2_punpcklbw(func, data, zeros);
+
+   /* Integer -> float */
+   sse2_cvtdq2ps(func, data, data);
+
+   /* Multiply by 1/255.0 */
+   scale = get_inv_255(p);
+   sse_mulps(func, data, scale);
+}
+
+
+
+
+/* Emit a movups that stores all four floats of 'dataXMM' to 'dest'. */
+static void emit_store_R32G32B32A32( struct translate_sse *p,
+                                     struct x86_reg dest,
+                                     struct x86_reg dataXMM )
+{
+   struct x86_function *func = p->func;
+
+   sse_movups(func, dest, dataXMM);
+}
+
+/* Emit code storing the first three floats of 'dataXMM' to 'dest'.
+ *
+ * NOTE: destructive -- the emitted shuffle replaces every lane of
+ * dataXMM with its Z component.
+ */
+static void emit_store_R32G32B32( struct translate_sse *p,
+                                  struct x86_reg dest,
+                                  struct x86_reg dataXMM )
+{
+   struct x86_function *func = p->func;
+   struct x86_reg dest_z = x86_make_disp(dest, 8);
+
+   /* X and Y in one go */
+   sse_movlps(func, dest, dataXMM);
+
+   /* bring Z into the low lane, then store it at byte offset 8 */
+   sse_shufps(func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) );
+   sse_movss(func, dest_z, dataXMM);
+}
+
+/* Emit a movlps that stores the low two floats of 'dataXMM' to 'dest'. */
+static void emit_store_R32G32( struct translate_sse *p,
+                               struct x86_reg dest,
+                               struct x86_reg dataXMM )
+{
+   struct x86_function *func = p->func;
+
+   sse_movlps(func, dest, dataXMM);
+}
+
+/* Emit a movss that stores the lowest float of 'dataXMM' to 'dest'. */
+static void emit_store_R32( struct translate_sse *p,
+                            struct x86_reg dest,
+                            struct x86_reg dataXMM )
+{
+   struct x86_function *func = p->func;
+
+   sse_movss(func, dest, dataXMM);
+}
+
+
+
+/* Emit code storing the four floats of 'dataXMM' to 'dest' as four
+ * bytes: scale by 255, convert to integers, pack down to bytes and
+ * write the resulting 32 bits.  dataXMM is overwritten in the process.
+ */
+static void emit_store_R8G8B8A8_UNORM( struct translate_sse *p,
+                                       struct x86_reg dest,
+                                       struct x86_reg dataXMM )
+{
+   struct x86_function *func = p->func;
+   struct x86_reg scale = get_255(p);
+
+   /* Multiply by 255.0 */
+   sse_mulps(func, dataXMM, scale);
+
+   /* float -> dword -> word -> byte */
+   sse2_cvtps2dq(func, dataXMM, dataXMM);
+   sse2_packssdw(func, dataXMM, dataXMM);
+   sse2_packuswb(func, dataXMM, dataXMM);
+
+   /* the packed bytes now sit in the low 32 bits */
+   sse_movss(func, dest, dataXMM);
+}
+
+
+
+
+
+/* Emit a shufps of 'dest' with 'src' using the immediate 'shuffle'.
+ *
+ * Only plain component shuffles are supported; extended swizzles
+ * were left for later by the original author.
+ */
+static void emit_swizzle( struct translate_sse *p,
+                          struct x86_reg dest,
+                          struct x86_reg src,
+                          unsigned char shuffle )
+{
+   struct x86_function *func = p->func;
+
+   sse_shufps(func, dest, src, shuffle);
+}
+
+
+/* Emit the load half of one attribute: read a->input_format data from
+ * 'src' into 'data'.  Returns FALSE if the input format is unsupported.
+ */
+static boolean emit_load_attr( struct translate_sse *p,
+                               const struct translate_element *a,
+                               struct x86_reg data,
+                               struct x86_reg src )
+{
+   switch (a->input_format) {
+   case PIPE_FORMAT_R32_FLOAT:
+      emit_load_R32(p, data, src);
+      return TRUE;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      emit_load_R32G32(p, data, src);
+      return TRUE;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      emit_load_R32G32B32(p, data, src);
+      return TRUE;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      emit_load_R32G32B32A32(p, data, src);
+      return TRUE;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      /* load as RGBA, then swap the X and Z components */
+      emit_load_R8G8B8A8_UNORM(p, data, src);
+      emit_swizzle(p, data, data, SHUF(Z,Y,X,W));
+      return TRUE;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      emit_load_R8G8B8A8_UNORM(p, data, src);
+      return TRUE;
+   default:
+      return FALSE;
+   }
+}
+
+/* Emit the store half of one attribute: write 'data' to 'dst' in
+ * a->output_format.  Returns FALSE if the output format is unsupported.
+ */
+static boolean emit_store_attr( struct translate_sse *p,
+                                const struct translate_element *a,
+                                struct x86_reg dst,
+                                struct x86_reg data )
+{
+   switch (a->output_format) {
+   case PIPE_FORMAT_R32_FLOAT:
+      emit_store_R32(p, dst, data);
+      return TRUE;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      emit_store_R32G32(p, dst, data);
+      return TRUE;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      emit_store_R32G32B32(p, dst, data);
+      return TRUE;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      emit_store_R32G32B32A32(p, dst, data);
+      return TRUE;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      /* swap the X and Z components, then store as RGBA */
+      emit_swizzle(p, data, data, SHUF(Z,Y,X,W));
+      emit_store_R8G8B8A8_UNORM(p, dst, data);
+      return TRUE;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      emit_store_R8G8B8A8_UNORM(p, dst, data);
+      return TRUE;
+   default:
+      return FALSE;
+   }
+}
+
+/* Emit code converting one vertex element: load it from the source
+ * operand into XMM0, then store it to the destination operand.
+ *
+ * The parameter names srcECX/dstEAX are historical; the operands are
+ * whatever the caller passes in.
+ *
+ * Returns FALSE if either format is unsupported.  Load instructions may
+ * already have been emitted when the output format turns out to be the
+ * unsupported one.
+ */
+static boolean translate_attr( struct translate_sse *p,
+                               const struct translate_element *a,
+                               struct x86_reg srcECX,
+                               struct x86_reg dstEAX)
+{
+   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
+
+   if (!emit_load_attr(p, a, dataXMM, srcECX))
+      return FALSE;
+
+   return emit_store_attr(p, a, dstEAX, dataXMM);
+}
+
+
+/* Emit the code that runs once before the vertex loop.
+ *
+ * Nothing is emitted for the indexed (non-linear) variant.  For the
+ * linear variant, each buffer's starting address
+ * (base_ptr + stride * start index) is computed in EAX and kept either
+ * in EBX itself, when there is exactly one buffer, or in that buffer's
+ * 'ptr' field otherwise.
+ *
+ * Always returns TRUE.
+ */
+static boolean init_inputs( struct translate_sse *p,
+                            boolean linear )
+{
+   unsigned i;
+
+   if (!linear)
+      return TRUE;
+
+   for (i = 0; i < p->nr_buffers; i++) {
+      struct translate_buffer *b = &p->buffer[i];
+      struct x86_reg buf_stride   = x86_make_disp(p->machine_EDX,
+                                                  get_offset(p, &b->stride));
+      struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDX,
+                                                  get_offset(p, &b->ptr));
+      struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDX,
+                                                  get_offset(p, &b->base_ptr));
+      struct x86_reg start = p->idx_EBX;
+      struct x86_reg addr = p->tmp_EAX;
+
+      /* addr = base_ptr + stride * start */
+      x86_mov(p->func, addr, buf_stride);
+      x86_imul(p->func, addr, start);
+      x86_add(p->func, addr, buf_base_ptr);
+
+      /* From here on the linear loop works with pointers rather than
+       * with the index number.
+       */
+      x86_mov( p->func, (p->nr_buffers == 1) ? start : buf_ptr, addr );
+   }
+
+   return TRUE;
+}
+
+
+/* Return the register that holds the address of the current vertex in
+ * buffer 'buf_idx', emitting whatever code is needed to get it there.
+ *
+ *  - indexed:                 EAX = base_ptr + stride * elt
+ *  - linear, single buffer:   EBX already holds the pointer
+ *  - linear, several buffers: EAX = the buffer's saved 'ptr' field
+ */
+static struct x86_reg get_buffer_ptr( struct translate_sse *p,
+                                      boolean linear,
+                                      unsigned buf_idx,
+                                      struct x86_reg elt )
+{
+   struct translate_buffer *b = &p->buffer[buf_idx];
+   struct x86_reg ptr = p->tmp_EAX;
+
+   if (!linear) {
+      struct x86_reg buf_stride =
+         x86_make_disp(p->machine_EDX, get_offset(p, &b->stride));
+      struct x86_reg buf_base_ptr =
+         x86_make_disp(p->machine_EDX, get_offset(p, &b->base_ptr));
+
+      /* Address of the element selected by the current index */
+      x86_mov(p->func, ptr, buf_stride);
+      x86_imul(p->func, ptr, elt);
+      x86_add(p->func, ptr, buf_base_ptr);
+      return ptr;
+   }
+
+   if (p->nr_buffers == 1)
+      return p->idx_EBX;
+
+   x86_mov(p->func, ptr,
+           x86_make_disp(p->machine_EDX, get_offset(p, &b->ptr)));
+   return ptr;
+}
+
+
+
+/* Emit the code that advances the input position at the end of each
+ * loop iteration.
+ *
+ *  - indexed:                 EBX += 4, i.e. the next index in the list
+ *  - linear, single buffer:   EBX += stride, plus a prefetch 192 bytes
+ *                             ahead
+ *  - linear, several buffers: each buffer's 'ptr' field += its stride,
+ *                             with a prefetch for buffer 0 only
+ *
+ * Always returns TRUE.
+ */
+static boolean incr_inputs( struct translate_sse *p,
+                            boolean linear )
+{
+   unsigned i;
+
+   if (!linear) {
+      x86_lea(p->func, p->idx_EBX, x86_make_disp(p->idx_EBX, 4));
+      return TRUE;
+   }
+
+   if (p->nr_buffers == 1) {
+      struct x86_reg stride = x86_make_disp(p->machine_EDX,
+                                            get_offset(p, &p->buffer[0].stride));
+
+      x86_add(p->func, p->idx_EBX, stride);
+      sse_prefetchnta(p->func, x86_make_disp(p->idx_EBX, 192));
+      return TRUE;
+   }
+
+   /* Is this worthwhile??
+    */
+   for (i = 0; i < p->nr_buffers; i++) {
+      struct translate_buffer *b = &p->buffer[i];
+      struct x86_reg buf_ptr = x86_make_disp(p->machine_EDX,
+                                             get_offset(p, &b->ptr));
+      struct x86_reg buf_stride = x86_make_disp(p->machine_EDX,
+                                                get_offset(p, &b->stride));
+
+      /* ptr += stride, done through EAX */
+      x86_mov(p->func, p->tmp_EAX, buf_ptr);
+      x86_add(p->func, p->tmp_EAX, buf_stride);
+      if (i == 0)
+         sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
+      x86_mov(p->func, buf_ptr, p->tmp_EAX);
+   }
+
+   return TRUE;
+}
+
+
+/* Build run( struct translate *machine,
+ *            unsigned start,
+ *            unsigned count,
+ *            void *output_buffer )
+ * or
+ *  run_elts( struct translate *machine,
+ *            unsigned *elts,
+ *            unsigned count,
+ *            void *output_buffer )
+ *
+ * 'linear' selects the first form, otherwise the second is built.
+ *
+ *  Lots of hardcoding
+ *
+ * Register use in the generated code (see the assignments below):
+ *
+ * EAX -- scratch; holds the source pointer computed by get_buffer_ptr()
+ * EBX -- linear: start index, replaced by the running source pointer
+ *        when there is a single buffer; elts: pointer to current index
+ * ECX -- pointer to current output vertex
+ * EDX -- the translate_sse object ('machine')
+ * ESI -- number of vertices still to do
+ *
+ * Returns FALSE if translate_attr() rejects one of the key's elements.
+ */
+static boolean build_vertex_emit( struct translate_sse *p,
+				  struct x86_function *func,
+				  boolean linear )
+{
+   int fixup, label;
+   unsigned j;
+
+   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
+   p->idx_EBX       = x86_make_reg(file_REG32, reg_BX);
+   p->outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
+   p->machine_EDX   = x86_make_reg(file_REG32, reg_DX);
+   p->count_ESI     = x86_make_reg(file_REG32, reg_SI);
+
+   p->func = func;
+   p->loaded_inv_255 = FALSE;
+   p->loaded_255 = FALSE;
+   p->loaded_identity = FALSE;
+
+   x86_init_func(p->func);
+
+   /* Save EBX and ESI, which the generated code overwrites; they are
+    * popped again just before the ret below.
+    */
+   x86_push(p->func, p->idx_EBX);
+   x86_push(p->func, p->count_ESI);
+
+   /* Load arguments into regs:
+    */
+   x86_mov(p->func, p->machine_EDX, x86_fn_arg(p->func, 1));
+   x86_mov(p->func, p->idx_EBX, x86_fn_arg(p->func, 2));
+   x86_mov(p->func, p->count_ESI, x86_fn_arg(p->func, 3));
+   x86_mov(p->func, p->outbuf_ECX, x86_fn_arg(p->func, 4));
+
+   /* Compare the vertex count to zero and skip the loop if it is
+    */
+   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
+   x86_cmp(p->func, p->count_ESI, p->tmp_EAX);
+   fixup = x86_jcc_forward(p->func, cc_E);
+
+   /* always load, needed or not:
+    */
+   init_inputs(p, linear);
+
+   /* Note address for loop jump
+    */
+   label = x86_get_label(p->func);
+   {
+      struct x86_reg elt = linear ? p->idx_EBX : x86_deref(p->idx_EBX);
+      int last_vb = -1;
+      struct x86_reg vb;
+
+      for (j = 0; j < p->translate.key.nr_elements; j++) {
+         const struct translate_element *a = &p->translate.key.element[j];
+
+         /* Figure out source pointer address.  The pointer is only
+          * recomputed when the element uses a different buffer than
+          * the previous element did.
+          */
+         if (a->input_buffer != last_vb) {
+            last_vb = a->input_buffer;
+            vb = get_buffer_ptr(p, linear, a->input_buffer, elt);
+         }
+         
+         if (!translate_attr( p, a, 
+                              x86_make_disp(vb, a->input_offset), 
+                              x86_make_disp(p->outbuf_ECX, a->output_offset)))
+            return FALSE;
+      }
+
+      /* Next output vertex:
+       */
+      x86_lea(p->func, 
+              p->outbuf_ECX, 
+              x86_make_disp(p->outbuf_ECX, 
+                            p->translate.key.output_stride));
+
+      /* Incr index
+       */ 
+      incr_inputs( p, linear );
+   }
+
+   /* decr count, loop if not zero
+    */
+   x86_dec(p->func, p->count_ESI);
+   x86_jcc(p->func, cc_NZ, label);
+
+   /* Exit mmx state?
+    */
+   if (p->func->need_emms)
+      mmx_emms(p->func);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(p->func, fixup);
+
+   /* Pop regs and return
+    */
+   
+   x86_pop(p->func, p->count_ESI);
+   x86_pop(p->func, p->idx_EBX);
+   x86_ret(p->func);
+
+   return TRUE;
+}
+
+
+
+
+
+
+			       
+/* Record the base pointer and stride of vertex buffer 'buf'.
+ *
+ * Slots at or beyond nr_buffers are not referenced by any key element
+ * and are silently ignored.
+ */
+static void translate_sse_set_buffer( struct translate *translate,
+                                      unsigned buf,
+                                      const void *ptr,
+                                      unsigned stride )
+{
+   struct translate_sse *p = (struct translate_sse *)translate;
+
+   if (buf < p->nr_buffers) {
+      struct translate_buffer *slot = &p->buffer[buf];
+
+      slot->base_ptr = ptr;
+      slot->stride = stride;
+   }
+
+   if (0) debug_printf("%s %d/%d: %p %d\n", 
+                       __FUNCTION__, buf, 
+                       p->nr_buffers, 
+                       ptr, stride);
+}
+
+
+/* Release both generated functions, then free the object itself. */
+static void translate_sse_release( struct translate *translate )
+{
+   struct translate_sse *sse = (struct translate_sse *)translate;
+
+   x86_release_func( &sse->linear_func );
+   x86_release_func( &sse->elt_func );
+
+   FREE(sse);
+}
+
+/* run_elts callback: forward to the generated indexed function. */
+static void PIPE_CDECL translate_sse_run_elts( struct translate *translate,
+                                               const unsigned *elts,
+                                               unsigned count,
+                                               void *output_buffer )
+{
+   struct translate_sse *sse = (struct translate_sse *)translate;
+   run_elts_func generated = sse->gen_run_elts;
+
+   generated( translate, elts, count, output_buffer );
+}
+
+/* run callback: forward to the generated linear function. */
+static void PIPE_CDECL translate_sse_run( struct translate *translate,
+                                          unsigned start,
+                                          unsigned count,
+                                          void *output_buffer )
+{
+   struct translate_sse *sse = (struct translate_sse *)translate;
+   run_func generated = sse->gen_run;
+
+   generated( translate, start, count, output_buffer );
+}
+
+
+/* Create an SSE/SSE2 code-generating translate object for 'key'.
+ *
+ * Returns NULL when the CPU lacks SSE or SSE2, when allocation fails,
+ * when the key contains a format translate_attr() does not handle, or
+ * when either generated function cannot be obtained.
+ */
+struct translate *translate_sse2_create( const struct translate_key *key )
+{
+   struct translate_sse *p;
+   unsigned i;
+
+   if (!(rtasm_cpu_has_sse() && rtasm_cpu_has_sse2()))
+      return NULL;
+
+   p = CALLOC_STRUCT( translate_sse );
+   if (!p)
+      return NULL;
+
+   p->translate.key = *key;
+   p->translate.run = translate_sse_run;
+   p->translate.run_elts = translate_sse_run_elts;
+   p->translate.set_buffer = translate_sse_set_buffer;
+   p->translate.release = translate_sse_release;
+
+   /* nr_buffers = highest buffer index referenced by the key, plus one */
+   for (i = 0; i < key->nr_elements; i++) {
+      unsigned needed = key->element[i].input_buffer + 1;
+
+      if (needed > p->nr_buffers)
+         p->nr_buffers = needed;
+   }
+
+   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);
+
+   /* Generate the linear variant first, then the indexed one */
+   if (!build_vertex_emit(p, &p->linear_func, TRUE) ||
+       !build_vertex_emit(p, &p->elt_func, FALSE))
+      goto fail;
+
+   p->gen_run = (run_func)x86_get_func(&p->linear_func);
+   if (!p->gen_run)
+      goto fail;
+
+   p->gen_run_elts = (run_elts_func)x86_get_func(&p->elt_func);
+   if (!p->gen_run_elts)
+      goto fail;
+
+   return &p->translate;
+
+ fail:
+   translate_sse_release( &p->translate );
+   return NULL;
+}
+
+
+
+#else
+
+struct translate *translate_sse2_create( const struct translate_key *key )
+{
+   return NULL;   /* built without PIPE_ARCH_X86: no SSE translate available */
+}
+
+#endif
author	Brian Paul <[email protected]>	2009-02-10 16:44:02 -0700
committer	Brian Paul <[email protected]>	2009-02-10 16:44:02 -0700
commit	5340b6dff73a0a23531ce2a5f28fba8303adab6e (patch)
tree	b141fc3648568dd8b941c966059e6ed32a8bd0ad /src/gallium/auxiliary/translate
parent	9fd26daec24f21dbe17afcb2e2ab272667ee9a69 (diff)
parent	ee4c921b65fb76998711f3c40330505cbc49a0e0 (diff)