Remove tgsi_sse2.

tgsi_exec is simple. llvm is fast. tgsi_sse2 ends up being neither.
author: José Fonseca <[email protected]> 2011-11-08 00:10:47 +0000
committer: José Fonseca <[email protected]> 2011-11-08 22:57:34 +0000
commit: 4eb3225b38ce12cb34ab3d90804c9683bd7b4ed3 (patch)
tree: 857d6c1740eb32fc86744f7afd81322862f6150c /src
parent: 207a016ecaabbccf865a5b8e026b95a4276adc15 (diff)
20 files changed, 3 insertions, 7048 deletions
diff --git a/src/gallium/auxiliary/Makefile.sources b/src/gallium/auxiliary/Makefile.sources
index 766beb0fafc..baded909cec 100644
--- a/src/gallium/auxiliary/Makefile.sources
+++ b/src/gallium/auxiliary/Makefile.sources
@@ -33,12 +33,8 @@ C_SOURCES := \
 	draw/draw_pt_vsplit.c \
 	draw/draw_vertex.c \
 	draw/draw_vs.c \
-	draw/draw_vs_aos.c \
-	draw/draw_vs_aos_io.c \
-	draw/draw_vs_aos_machine.c \
 	draw/draw_vs_exec.c \
 	draw/draw_vs_ppc.c \
-	draw/draw_vs_sse.c \
 	draw/draw_vs_variant.c \
 	os/os_misc.c \
 	os/os_stream.c \
@@ -83,7 +79,6 @@ C_SOURCES := \
 	tgsi/tgsi_ppc.c \
 	tgsi/tgsi_sanity.c \
 	tgsi/tgsi_scan.c \
-	tgsi/tgsi_sse2.c \
 	tgsi/tgsi_text.c \
 	tgsi/tgsi_transform.c \
 	tgsi/tgsi_ureg.c \
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index b84d2b77179..3521a035e2f 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -237,10 +237,6 @@ struct draw_context
       uint num_samplers;
       struct tgsi_sampler **samplers;
 
-      /* Here's another one:
-       */
-      struct aos_machine *aos_machine; 
-
 
       const void *aligned_constants[PIPE_MAX_CONSTANT_BUFFERS];
 
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 1763dbc199f..957bbe57a82 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -81,14 +81,12 @@ draw_vs_set_constants(struct draw_context *draw,
    }
 
    draw->vs.aligned_constants[slot] = constants;
-   draw_vs_aos_machine_constants(draw->vs.aos_machine, slot, constants);
 }
 
 
 void draw_vs_set_viewport( struct draw_context *draw,
                            const struct pipe_viewport_state *viewport )
 {
-   draw_vs_aos_machine_viewport( draw->vs.aos_machine, viewport );
 }
 
 
@@ -103,22 +101,8 @@ draw_create_vertex_shader(struct draw_context *draw,
       tgsi_dump(shader->tokens, 0);
    }
 
-   if (!draw->pt.middle.llvm) {
-#if 0
-/* these paths don't support vertex clamping
- * TODO: either add it, or remove them completely
- * use LLVM instead if you want performance
- * use exec instead if you want debugging/more correctness
- */
-#if defined(PIPE_ARCH_X86)
-      vs = draw_create_vs_sse( draw, shader );
-#elif defined(PIPE_ARCH_PPC)
-      vs = draw_create_vs_ppc( draw, shader );
-#endif
-#endif
-   }
 #if HAVE_LLVM
-   else {
+   if (draw->pt.middle.llvm) {
       vs = draw_create_vs_llvm(draw, shader);
    }
 #endif
@@ -199,12 +183,6 @@ draw_vs_init( struct draw_context *draw )
    if (!draw->vs.fetch_cache) 
       return FALSE;
 
-   draw->vs.aos_machine = draw_vs_aos_machine();
-#ifdef PIPE_ARCH_X86
-   if (!draw->vs.aos_machine)
-      return FALSE;
-#endif
-      
    return TRUE;
 }
 
@@ -219,9 +197,6 @@ draw_vs_destroy( struct draw_context *draw )
    if (draw->vs.emit_cache)
       translate_cache_destroy(draw->vs.emit_cache);
 
-   if (draw->vs.aos_machine)
-      draw_vs_aos_machine_destroy(draw->vs.aos_machine);
-
    for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
       if (draw->vs.aligned_constant_storage[i]) {
          align_free((void *)draw->vs.aligned_constant_storage[i]);
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index e6d187e9774..49229f8164b 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -159,10 +159,6 @@ draw_create_vs_exec(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
 
 struct draw_vertex_shader *
-draw_create_vs_sse(struct draw_context *draw,
-		   const struct pipe_shader_state *templ);
-
-struct draw_vertex_shader *
 draw_create_vs_ppc(struct draw_context *draw,
 		   const struct pipe_shader_state *templ);
 
@@ -170,10 +166,6 @@ draw_create_vs_ppc(struct draw_context *draw,
 struct draw_vs_variant_key;
 struct draw_vertex_shader;
 
-struct draw_vs_variant *
-draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
-                                const struct draw_vs_variant_key *key );
-
 #if HAVE_LLVM
 struct draw_vertex_shader *
 draw_create_vs_llvm(struct draw_context *draw,
@@ -214,18 +206,6 @@ static INLINE int draw_vs_variant_key_compare( const struct draw_vs_variant_key
 }
 
 
-struct aos_machine *draw_vs_aos_machine( void );
-void draw_vs_aos_machine_destroy( struct aos_machine *machine );
-
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
-                              unsigned slot,
-                              const void *constants);
-
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
-                                   const struct pipe_viewport_state *viewport );
-
-
 #define MAX_TGSI_VERTICES 4
    
 
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
deleted file mode 100644
index 7b90dba0cd5..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos.c
+++ /dev/null
@@ -1,2267 +0,0 @@
-/*
- * Mesa 3-D graphics library
- * Version:  6.3
- *
- * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included
- * in all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
- * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/**
- * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
- * using the rtasm runtime assembler.  Based on the old
- * t_vb_arb_program_sse.c
- */
-
-
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_debug.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_dump.h"
-
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-#ifdef PIPE_ARCH_X86
-#define DISASSEM 0
-#define FAST_MATH 1
-
-static const char *files[] =
-{
-   "NULL",
-   "CONST",
-   "IN",
-   "OUT",
-   "TEMP",
-   "SAMP",
-   "ADDR",
-   "IMM",
-   "INTERNAL",
-};
-
-static INLINE boolean eq( struct x86_reg a,
-			    struct x86_reg b )
-{
-   return (a.file == b.file &&
-	   a.idx == b.idx &&
-	   a.mod == b.mod &&
-	   a.disp == b.disp);
-}
-      
-struct x86_reg aos_get_x86( struct aos_compilation *cp,
-                            unsigned which_reg, /* quick hack */
-                            unsigned value )
-{
-   struct x86_reg reg;
-
-   if (which_reg == 0)
-      reg = cp->temp_EBP;
-   else
-      reg = cp->tmp_EAX;
-
-   if (cp->x86_reg[which_reg] != value) {
-      unsigned offset;
-
-      switch (value) {
-      case X86_IMMEDIATES:
-         assert(which_reg == 0);
-         offset = Offset(struct aos_machine, immediates);
-         break;
-      case X86_CONSTANTS:
-         assert(which_reg == 1);
-         offset = Offset(struct aos_machine, constants);
-         break;
-      case X86_BUFFERS:
-         assert(which_reg == 0);
-         offset = Offset(struct aos_machine, buffer);
-         break;
-      default:
-         assert(0);
-         offset = 0;
-      }
-
-
-      x86_mov(cp->func, reg, 
-              x86_make_disp(cp->machine_EDX, offset));
-
-      cp->x86_reg[which_reg] = value;
-   }
-
-   return reg;
-}
-
-
-static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
-                                  unsigned file,
-				  unsigned idx )
-{
-   struct x86_reg ptr = cp->machine_EDX;
-
-   switch (file) {
-   case TGSI_FILE_INPUT:
-      assert(idx < MAX_INPUTS);
-      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
-
-   case TGSI_FILE_OUTPUT:
-      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
-
-   case TGSI_FILE_TEMPORARY:
-      assert(idx < MAX_TEMPS);
-      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
-
-   case AOS_FILE_INTERNAL:
-      assert(idx < MAX_INTERNALS);
-      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
-
-   case TGSI_FILE_IMMEDIATE: 
-      assert(idx < MAX_IMMEDIATES);  /* just a sanity check */
-      return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
-
-   case TGSI_FILE_CONSTANT: 
-      assert(idx < MAX_CONSTANTS);  /* just a sanity check */
-      return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
-
-   default:
-      AOS_ERROR(cp, "unknown reg file");
-      return x86_make_reg(0,0);
-   }
-}
-		
-
-
-#define X87_CW_EXCEPTION_INV_OP       (1<<0)
-#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
-#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
-#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
-#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
-#define X87_CW_EXCEPTION_PRECISION    (1<<5)
-#define X87_CW_PRECISION_SINGLE       (0<<8)
-#define X87_CW_PRECISION_RESERVED     (1<<8)
-#define X87_CW_PRECISION_DOUBLE       (2<<8)
-#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
-#define X87_CW_PRECISION_MASK         (3<<8)
-#define X87_CW_ROUND_NEAREST          (0<<10)
-#define X87_CW_ROUND_DOWN             (1<<10)
-#define X87_CW_ROUND_UP               (2<<10)
-#define X87_CW_ROUND_ZERO             (3<<10)
-#define X87_CW_ROUND_MASK             (3<<10)
-#define X87_CW_INFINITY               (1<<12)
-
-
-
-
-static void spill( struct aos_compilation *cp, unsigned idx )
-{
-   if (!cp->xmm[idx].dirty ||
-       (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
-        cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
-        cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
-      AOS_ERROR(cp, "invalid spill");
-      return;
-   }
-   else {
-      struct x86_reg oldval = get_reg_ptr(cp,
-                                          cp->xmm[idx].file,
-                                          cp->xmm[idx].idx);
-     
-      if (0) debug_printf("\nspill %s[%d]", 
-                          files[cp->xmm[idx].file],
-                          cp->xmm[idx].idx);
- 
-      assert(cp->xmm[idx].dirty);
-      sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
-      cp->xmm[idx].dirty = 0;
-   }
-}
-
-
-void aos_spill_all( struct aos_compilation *cp )
-{
-   unsigned i;
-
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].dirty) 
-         spill(cp, i);
-      aos_release_xmm_reg(cp, i);
-   }
-}
-
-
-static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
-                                        struct x86_reg reg )
-{
-   if (reg.file != file_XMM ||
-       cp->xmm[reg.idx].file != TGSI_FILE_NULL)
-   {
-      struct x86_reg tmp = aos_get_xmm_reg(cp);
-      sse_movaps(cp->func, tmp, reg);
-      reg = tmp;
-   }
-
-   cp->xmm[reg.idx].last_used = cp->insn_counter;
-   return reg;
-}
-
-static struct x86_reg get_xmm( struct aos_compilation *cp,
-                               struct x86_reg reg )
-{
-   if (reg.file != file_XMM) 
-   {
-      struct x86_reg tmp = aos_get_xmm_reg(cp);
-      sse_movaps(cp->func, tmp, reg);
-      reg = tmp;
-   }
-
-   cp->xmm[reg.idx].last_used = cp->insn_counter;
-   return reg;
-}
-
-
-/* Allocate an empty xmm register, either as a temporary or later to
- * "adopt" as a shader reg.
- */
-struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
-{
-   unsigned i;
-   unsigned oldest = 0;
-   boolean found = FALSE;
-
-   for (i = 0; i < 8; i++) 
-      if (cp->xmm[i].last_used != cp->insn_counter &&
-          cp->xmm[i].file == TGSI_FILE_NULL) {
-	 oldest = i;
-         found = TRUE;
-      }
-
-   if (!found) {
-      for (i = 0; i < 8; i++) 
-         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
-            oldest = i;
-   }
-
-   /* Need to write out the old value?
-    */
-   if (cp->xmm[oldest].dirty) 
-      spill(cp, oldest);
-
-   assert(cp->xmm[oldest].last_used != cp->insn_counter);
-
-   cp->xmm[oldest].file = TGSI_FILE_NULL;
-   cp->xmm[oldest].idx = 0;
-   cp->xmm[oldest].dirty = 0;
-   cp->xmm[oldest].last_used = cp->insn_counter;
-   return x86_make_reg(file_XMM, oldest);
-}
-
-void aos_release_xmm_reg( struct aos_compilation *cp,
-                          unsigned idx )
-{
-   cp->xmm[idx].file = TGSI_FILE_NULL;
-   cp->xmm[idx].idx = 0;
-   cp->xmm[idx].dirty = 0;
-   cp->xmm[idx].last_used = 0;
-}
-
-
-static void aos_soft_release_xmm( struct aos_compilation *cp,
-                                  struct x86_reg reg )
-{
-   if (reg.file == file_XMM) {
-      assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
-      cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
-   }
-}
-
-
-     
-/* Mark an xmm reg as holding the current copy of a shader reg.
- */
-void aos_adopt_xmm_reg( struct aos_compilation *cp,
-                        struct x86_reg reg,
-                        unsigned file,
-                        unsigned idx,
-                        unsigned dirty )
-{
-   unsigned i;
-
-   if (reg.file != file_XMM) {
-      assert(0);
-      return;
-   }
-
-
-   /* If any xmm reg thinks it holds this shader reg, break the
-    * illusion.
-    */
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].file == file && 
-          cp->xmm[i].idx == idx) 
-      {
-         /* If an xmm reg is already holding this shader reg, take into account its
-          * dirty flag...
-          */
-         dirty |= cp->xmm[i].dirty;
-         aos_release_xmm_reg(cp, i);
-      }
-   }
-
-   cp->xmm[reg.idx].file = file;
-   cp->xmm[reg.idx].idx = idx;
-   cp->xmm[reg.idx].dirty = dirty;
-   cp->xmm[reg.idx].last_used = cp->insn_counter;
-}
-
-
-/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
- */
-static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, 
-                                              unsigned file,
-                                              unsigned idx )
-{
-   unsigned i;
-
-   /* Ensure the in-memory copy of this reg is up-to-date
-    */
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].file == file && 
-          cp->xmm[i].idx == idx &&
-          cp->xmm[i].dirty) {
-         spill(cp, i);
-      }
-   }
-
-   return get_reg_ptr( cp, file, idx );
-}
-
-
-/* As above, but return a pointer.  Note - this pointer may alias
- * those returned by get_arg_ptr().
- */
-static struct x86_reg get_dst_ptr( struct aos_compilation *cp, 
-                                   const struct tgsi_full_dst_register *dst )
-{
-   unsigned file = dst->Register.File;
-   unsigned idx = dst->Register.Index;
-   unsigned i;
-   
-
-   /* Ensure in-memory copy of this reg is up-to-date and invalidate
-    * any xmm copies.
-    */
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].file == file &&
-          cp->xmm[i].idx == idx)
-      {
-         if (cp->xmm[i].dirty) 
-            spill(cp, i);
-         
-         aos_release_xmm_reg(cp, i);
-      }
-   }
-
-   return get_reg_ptr( cp, file, idx );
-}
-
-
-
-
-
-/* Return an XMM reg if the argument is resident, otherwise return a
- * base+offset pointer to the saved value.
- */
-struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
-                                   unsigned file,
-                                   unsigned idx )
-{
-   unsigned i;
-
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].file == file &&
-	  cp->xmm[i].idx  == idx) 
-      {
-	 cp->xmm[i].last_used = cp->insn_counter;
-	 return x86_make_reg(file_XMM, i);
-      }
-   }
-
-   /* If not found in the XMM register file, return an indirect
-    * reference to the in-memory copy:
-    */
-   return get_reg_ptr( cp, file, idx );
-}
-
-
-
-static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, 
-                                              unsigned file,
-                                              unsigned idx )
-{
-   struct x86_reg reg = get_xmm( cp,
-                                 aos_get_shader_reg( cp, file, idx ) );
-
-   aos_adopt_xmm_reg( cp,
-                      reg,
-                      file,
-                      idx,
-                      FALSE );
-   
-   return reg;
-}
-
-
-
-struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
-                                     unsigned imm )
-{
-   return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
-}
-
-
-struct x86_reg aos_get_internal( struct aos_compilation *cp,
-                                 unsigned imm )
-{
-   return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
-}
-
-
-
-
-
-/* Emulate pshufd insn in regular SSE, if necessary:
- */
-static void emit_pshufd( struct aos_compilation *cp,
-			 struct x86_reg dst,
-			 struct x86_reg arg0,
-			 ubyte shuf )
-{
-   if (cp->have_sse2) {
-      sse2_pshufd(cp->func, dst, arg0, shuf);
-   }
-   else {
-      if (!eq(dst, arg0)) 
-	 sse_movaps(cp->func, dst, arg0);
-
-      sse_shufps(cp->func, dst, dst, shuf);
-   }
-}
-
-/* load masks (pack into negs??)
- * pshufd - shuffle according to writemask
- * and - result, mask
- * nand - dest, mask
- * or - dest, result
- */
-static boolean mask_write( struct aos_compilation *cp,
-                           struct x86_reg dst,
-                           struct x86_reg result,
-                           unsigned mask )
-{
-   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
-   struct x86_reg tmp = aos_get_xmm_reg(cp);
-   
-   emit_pshufd(cp, tmp, imm_swz, 
-               SHUF((mask & 1) ? 2 : 3,
-                    (mask & 2) ? 2 : 3,
-                    (mask & 4) ? 2 : 3,
-                    (mask & 8) ? 2 : 3));
-
-   sse_andps(cp->func, dst, tmp);
-   sse_andnps(cp->func, tmp, result);
-   sse_orps(cp->func, dst, tmp);
-
-   aos_release_xmm_reg(cp, tmp.idx);
-   return TRUE;
-}
-
-
-
-
-/* Helper for writemask:
- */
-static boolean emit_shuf_copy2( struct aos_compilation *cp,
-				  struct x86_reg dst,
-				  struct x86_reg arg0,
-				  struct x86_reg arg1,
-				  ubyte shuf )
-{
-   struct x86_reg tmp = aos_get_xmm_reg(cp);
-
-   emit_pshufd(cp, dst, arg1, shuf);
-   emit_pshufd(cp, tmp, arg0, shuf);
-   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
-   emit_pshufd(cp, dst, dst, shuf);
-
-   aos_release_xmm_reg(cp, tmp.idx);
-   return TRUE;
-}
-
-
-
-#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
-
-
-/* Locate a source register and perform any required (simple) swizzle.  
- * 
- * Just fail on complex swizzles at this point.
- */
-static struct x86_reg fetch_src( struct aos_compilation *cp, 
-                                 const struct tgsi_full_src_register *src ) 
-{
-   struct x86_reg arg0 = aos_get_shader_reg(cp, 
-                                            src->Register.File, 
-                                            src->Register.Index);
-   unsigned i;
-   ubyte swz = 0;
-   unsigned negs = 0;
-   unsigned abs = 0;
-
-   for (i = 0; i < 4; i++) {
-      unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
-      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
-
-      swz |= (swizzle & 0x3) << (i * 2);
-
-      switch (neg) {
-      case TGSI_UTIL_SIGN_TOGGLE:
-         negs |= (1<<i);
-         break;
-         
-      case TGSI_UTIL_SIGN_KEEP:
-         break;
-
-      case TGSI_UTIL_SIGN_CLEAR:
-         abs |= (1<<i);
-         break;
-
-      default:
-         AOS_ERROR(cp, "unsupported sign-mode");
-         break;
-      }
-   }
-
-   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
-      struct x86_reg dst = aos_get_xmm_reg(cp);
-
-      if (swz != SSE_SWIZZLE_NOOP)
-         emit_pshufd(cp, dst, arg0, swz);
-      else
-         sse_movaps(cp->func, dst, arg0);
-
-      if (negs && negs != 0xf) {
-         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
-         struct x86_reg tmp = aos_get_xmm_reg(cp);
-
-         /* Load 1,-1,0,0
-          * Use neg as arg to pshufd
-          * Multiply
-          */
-         emit_pshufd(cp, tmp, imm_swz, 
-                     SHUF((negs & 1) ? 1 : 0,
-                          (negs & 2) ? 1 : 0,
-                          (negs & 4) ? 1 : 0,
-                          (negs & 8) ? 1 : 0));
-         sse_mulps(cp->func, dst, tmp);
-
-         aos_release_xmm_reg(cp, tmp.idx);
-         aos_soft_release_xmm(cp, imm_swz);
-      }
-      else if (negs) {
-         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
-         sse_mulps(cp->func, dst, imm_negs);
-         aos_soft_release_xmm(cp, imm_negs);
-      }
-
-
-      if (abs && abs != 0xf) {
-         AOS_ERROR(cp, "unsupported partial abs");
-      }
-      else if (abs) {
-         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
-         struct x86_reg tmp = aos_get_xmm_reg(cp);
-
-         sse_movaps(cp->func, tmp, dst);
-         sse_mulps(cp->func, tmp, neg);
-         sse_maxps(cp->func, dst, tmp);
-
-         aos_release_xmm_reg(cp, tmp.idx);
-         aos_soft_release_xmm(cp, neg);
-      }
-
-      aos_soft_release_xmm(cp, arg0);
-      return dst;
-   }
-      
-   return arg0;
-}
-
-static void x87_fld_src( struct aos_compilation *cp, 
-                         const struct tgsi_full_src_register *src,
-                         unsigned channel ) 
-{
-   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, 
-                                                src->Register.File, 
-                                                src->Register.Index);
-
-   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
-   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
-
-   x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
-
-   switch (neg) {
-   case TGSI_UTIL_SIGN_TOGGLE:
-      /* Flip the sign:
-       */
-      x87_fchs( cp->func );
-      break;
-         
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
-
-   case TGSI_UTIL_SIGN_CLEAR:
-      x87_fabs( cp->func );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      x87_fabs( cp->func );
-      x87_fchs( cp->func );
-      break;
-
-   default:
-      AOS_ERROR(cp, "unsupported sign-mode");
-      break;
-   }
-}
-
-
-
-
-
-
-/* Used to implement write masking.  This and most of the other instructions
- * here would be easier to implement if there had been a translation
- * to a 2 argument format (dst/arg0, arg1) at the shader level before
- * attempting to translate to x86/sse code.
- */
-static void store_dest( struct aos_compilation *cp, 
-                        const struct tgsi_full_dst_register *reg,
-                        struct x86_reg result )
-{
-   struct x86_reg dst;
-
-   switch (reg->Register.WriteMask) {
-   case 0:
-      return;
-   
-   case TGSI_WRITEMASK_XYZW:
-      aos_adopt_xmm_reg(cp, 
-                        get_xmm_writable(cp, result), 
-                        reg->Register.File,
-                        reg->Register.Index,
-                        TRUE);
-      return;
-   default: 
-      break;
-   }
-
-   dst = aos_get_shader_reg_xmm(cp, 
-                                reg->Register.File,
-                                reg->Register.Index);
-
-   switch (reg->Register.WriteMask) {
-   case TGSI_WRITEMASK_X:
-      sse_movss(cp->func, dst, get_xmm(cp, result));
-      break;
-      
-   case TGSI_WRITEMASK_ZW:
-      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
-      break;
-
-   case TGSI_WRITEMASK_XY: 
-      result = get_xmm_writable(cp, result);
-      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
-      dst = result;
-      break;
-
-   case TGSI_WRITEMASK_YZW: 
-      result = get_xmm_writable(cp, result);
-      sse_movss(cp->func, result, dst);
-      dst = result;
-      break;
-
-   default:
-      mask_write(cp, dst, result, reg->Register.WriteMask);
-      break;
-   }
-
-   aos_adopt_xmm_reg(cp, 
-                     dst, 
-                     reg->Register.File,
-                     reg->Register.Index,
-                     TRUE);
-
-}
-
-static void inject_scalar( struct aos_compilation *cp,
-                           struct x86_reg dst,
-                           struct x86_reg result,
-                           ubyte swizzle )
-{
-   sse_shufps(cp->func, dst, dst, swizzle);
-   sse_movss(cp->func, dst, result);
-   sse_shufps(cp->func, dst, dst, swizzle);
-}
-
-
-static void store_scalar_dest( struct aos_compilation *cp, 
-                               const struct tgsi_full_dst_register *reg,
-                               struct x86_reg result )
-{
-   unsigned writemask = reg->Register.WriteMask;
-   struct x86_reg dst;
-
-   if (writemask != TGSI_WRITEMASK_X &&
-       writemask != TGSI_WRITEMASK_Y &&
-       writemask != TGSI_WRITEMASK_Z &&
-       writemask != TGSI_WRITEMASK_W &&
-       writemask != 0) 
-   {
-      result = get_xmm_writable(cp, result); /* already true, right? */
-      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
-      store_dest(cp, reg, result);
-      return;
-   }
-
-   result = get_xmm(cp, result);
-   dst = aos_get_shader_reg_xmm(cp, 
-                                reg->Register.File,
-                                reg->Register.Index);
-
-
-
-   switch (reg->Register.WriteMask) {
-   case TGSI_WRITEMASK_X:
-      sse_movss(cp->func, dst, result);
-      break;
-
-   case TGSI_WRITEMASK_Y:
-      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
-      break;
-
-   case TGSI_WRITEMASK_Z:
-      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
-      break;
-
-   case TGSI_WRITEMASK_W:
-      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
-      break;
-
-   default:
-      break;
-   }
-
-   aos_adopt_xmm_reg(cp, 
-                     dst, 
-                     reg->Register.File,
-                     reg->Register.Index,
-                     TRUE);
-}
-   
-
-
-static void x87_fst_or_nop( struct x86_function *func,
-                            unsigned writemask,
-                            unsigned channel,
-                            struct x86_reg ptr )
-{
-   assert(ptr.file == file_REG32);
-   if (writemask & (1<<channel)) 
-      x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
-}
-
-static void x87_fstp_or_pop( struct x86_function *func,
-                             unsigned writemask,
-                             unsigned channel,
-                             struct x86_reg ptr )
-{
-   assert(ptr.file == file_REG32);
-   if (writemask & (1<<channel)) 
-      x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
-   else
-      x87_fstp( func, x86_make_reg( file_x87, 0 ));
-}
-
-
-
-/* 
- */
-static void x87_fstp_dest4( struct aos_compilation *cp,
-                            const struct tgsi_full_dst_register *dst )
-{
-   struct x86_reg ptr = get_dst_ptr(cp, dst); 
-   unsigned writemask = dst->Register.WriteMask;
-
-   x87_fst_or_nop(cp->func, writemask, 0, ptr);
-   x87_fst_or_nop(cp->func, writemask, 1, ptr);
-   x87_fst_or_nop(cp->func, writemask, 2, ptr);
-   x87_fstp_or_pop(cp->func, writemask, 3, ptr);
-}
-
-/* Save current x87 state and put it into single precision mode.
- */
-static void save_fpu_state( struct aos_compilation *cp )
-{
-   x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, 
-                                       Offset(struct aos_machine, fpu_restore)));
-}
-
-static void restore_fpu_state( struct aos_compilation *cp )
-{
-   x87_fnclex(cp->func);
-   x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
-                                      Offset(struct aos_machine, fpu_restore)));
-}
-
-static void set_fpu_round_neg_inf( struct aos_compilation *cp )
-{
-   if (cp->fpucntl != FPU_RND_NEG) {
-      cp->fpucntl = FPU_RND_NEG;
-      x87_fnclex(cp->func);
-      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
-                                         Offset(struct aos_machine, fpu_rnd_neg_inf)));
-   }
-}
-
-static void set_fpu_round_nearest( struct aos_compilation *cp )
-{
-   if (cp->fpucntl != FPU_RND_NEAREST) {
-      cp->fpucntl = FPU_RND_NEAREST;
-      x87_fnclex(cp->func);
-      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
-                                         Offset(struct aos_machine, fpu_rnd_nearest)));
-   }
-}
-
-#if 0
-static void x87_emit_ex2( struct aos_compilation *cp )
-{
-   struct x86_reg st0 = x86_make_reg(file_x87, 0);
-   struct x86_reg st1 = x86_make_reg(file_x87, 1);
-   int stack = cp->func->x87_stack;
-
-   /* set_fpu_round_neg_inf( cp ); */
-
-   x87_fld(cp->func, st0);      /* a a */
-   x87_fprndint( cp->func );	/* int(a) a*/
-   x87_fsubr(cp->func, st1, st0);    /* int(a) frc(a) */
-   x87_fxch(cp->func, st1);     /* frc(a) int(a) */
-   x87_f2xm1(cp->func);         /* (2^frc(a))-1 int(a) */
-   x87_fld1(cp->func);          /* 1 (2^frc(a))-1 int(a) */
-   x87_faddp(cp->func, st1);	/* 2^frac(a) int(a)  */
-   x87_fscale(cp->func);	/* (2^frac(a)*2^int(int(a))) int(a) */
-                                /* 2^a int(a) */
-   x87_fstp(cp->func, st1);     /* 2^a */
-
-   assert( stack == cp->func->x87_stack);
-      
-}
-#endif
-
-#if 0
-static void PIPE_CDECL print_reg( const char *msg,
-                                  const float *reg )
-{
-   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
-}
-#endif
-
-#if 0
-static void emit_print( struct aos_compilation *cp,
-                        const char *message, /* must point to a static string! */
-                        unsigned file,
-                        unsigned idx )
-{
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
-   unsigned i;
-
-   /* There shouldn't be anything on the x87 stack.  Can add this
-    * capacity later if need be.
-    */
-   assert(cp->func->x87_stack == 0);
-
-   /* For absolute correctness, need to spill/invalidate all XMM regs
-    * too.  We're obviously not concerned about performance on this
-    * debug path, so here goes:
-    */
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].dirty) 
-         spill(cp, i);
-
-      aos_release_xmm_reg(cp, i);
-   }
-
-   /* Push caller-save (ie scratch) regs.  
-    */
-   x86_cdecl_caller_push_regs( cp->func );
-
-
-   /* Push the arguments:
-    */
-   x86_lea( cp->func, ecx, arg );
-   x86_push( cp->func, ecx );
-   x86_push_imm32( cp->func, (int)message );
-
-   /* Call the helper.  Could call debug_printf directly, but
-    * print_reg is a nice place to put a breakpoint if need be.
-    */
-   x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
-   x86_call( cp->func, ecx );
-   x86_pop( cp->func, ecx );
-   x86_pop( cp->func, ecx );
-
-   /* Pop caller-save regs 
-    */
-   x86_cdecl_caller_pop_regs( cp->func );
-
-   /* Done... 
-    */
-}
-#endif
-
-/**
- * The traditional instructions.  All operate on internal registers
- * and ignore write masks and swizzling issues.
- */
-
-static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
-   struct x86_reg tmp = aos_get_xmm_reg(cp);
-
-   sse_movaps(cp->func, tmp, arg0);
-   sse_mulps(cp->func, tmp, neg);
-   sse_maxps(cp->func, tmp, arg0);
-   
-   store_dest(cp, &op->Dst[0], tmp);
-   return TRUE;
-}
-
-static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_addps(cp->func, dst, arg1);
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   x87_fld_src(cp, &op->Src[0], 0);
-   x87_fcos(cp->func);
-   x87_fstp_dest4(cp, &op->Dst[0]);
-   return TRUE;
-}
-
-/* The dotproduct instructions don't really do that well in sse:
- * XXX: produces wrong results -- disabled.
- */
-static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg tmp = aos_get_xmm_reg(cp); 
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_mulps(cp->func, dst, arg1);
-   /* Now the hard bit: sum the first 3 values:
-    */ 
-   sse_movhlps(cp->func, tmp, dst);
-   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
-   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
-   sse_addss(cp->func, dst, tmp);
-   
-   aos_release_xmm_reg(cp, tmp.idx);
-   store_scalar_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg tmp = aos_get_xmm_reg(cp);      
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_mulps(cp->func, dst, arg1);
-   
-   /* Now the hard bit: sum the values:
-    */ 
-   sse_movhlps(cp->func, tmp, dst);
-   sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
-   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
-   sse_addss(cp->func, dst, tmp);
-
-   aos_release_xmm_reg(cp, tmp.idx);
-   store_scalar_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg tmp = aos_get_xmm_reg(cp);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_mulps(cp->func, dst, arg1);
-
-   /* Now the hard bit: sum the values (from DP3):
-    */ 
-   sse_movhlps(cp->func, tmp, dst);
-   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
-   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
-   sse_addss(cp->func, dst, tmp);
-   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
-   sse_addss(cp->func, dst, tmp);
-
-   aos_release_xmm_reg(cp, tmp.idx);
-   store_scalar_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-    struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-    struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-    struct x86_reg dst = aos_get_xmm_reg(cp);
-    struct x86_reg tmp = aos_get_xmm_reg(cp);
-    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
-
-/*    dst[0] = 1.0     * 1.0F; */
-/*    dst[1] = arg0[1] * arg1[1]; */
-/*    dst[2] = arg0[2] * 1.0; */
-/*    dst[3] = 1.0     * arg1[3]; */
-
-    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
-    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
-    sse_mulps(cp->func, dst, tmp);
-
-    aos_release_xmm_reg(cp, tmp.idx);
-    store_dest(cp, &op->Dst[0], dst);
-    return TRUE;
-}
-
-static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   x87_fld1(cp->func);		/* 1 */
-   x87_fld_src(cp, &op->Src[0], 0);	/* a0 1 */
-   x87_fyl2x(cp->func);	/* log2(a0) */
-   x87_fstp_dest4(cp, &op->Dst[0]);
-   return TRUE;
-}
-
-#if 0
-static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   x87_fld_src(cp, &op->Src[0], 0);
-   x87_emit_ex2(cp);
-   x87_fstp_dest4(cp, &op->Dst[0]);
-   return TRUE;
-}
-#endif
-
-
-static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); 
-   unsigned writemask = op->Dst[0].Register.WriteMask;
-   int i;
-
-   set_fpu_round_neg_inf( cp );
-
-   /* Load all sources first to avoid aliasing
-    */
-   for (i = 3; i >= 0; i--) {
-      if (writemask & (1<<i)) {
-         x87_fld_src(cp, &op->Src[0], i);   
-      }
-   }
-
-   for (i = 0; i < 4; i++) {
-      if (writemask & (1<<i)) {
-         x87_fprndint( cp->func );   
-         x87_fstp(cp->func, x86_make_disp(dst, i*4));
-      }
-   }
-
-   return TRUE;
-}
-
-
-static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); 
-   unsigned writemask = op->Dst[0].Register.WriteMask;
-   int i;
-
-   set_fpu_round_nearest( cp );
-
-   /* Load all sources first to avoid aliasing
-    */
-   for (i = 3; i >= 0; i--) {
-      if (writemask & (1<<i)) {
-         x87_fld_src(cp, &op->Src[0], i);   
-      }
-   }
-
-   for (i = 0; i < 4; i++) {
-      if (writemask & (1<<i)) {
-         x87_fprndint( cp->func );   
-         x87_fstp(cp->func, x86_make_disp(dst, i*4));
-      }
-   }
-
-   return TRUE;
-}
-
-
-static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); 
-   struct x86_reg st0 = x86_make_reg(file_x87, 0);
-   struct x86_reg st1 = x86_make_reg(file_x87, 1);
-   unsigned writemask = op->Dst[0].Register.WriteMask;
-   int i;
-
-   set_fpu_round_neg_inf( cp );
-
-   /* suck all the source values onto the stack before writing out any
-    * dst, which may alias...
-    */
-   for (i = 3; i >= 0; i--) {
-      if (writemask & (1<<i)) {
-         x87_fld_src(cp, &op->Src[0], i);   
-      }
-   }
-
-   for (i = 0; i < 4; i++) {
-      if (writemask & (1<<i)) {
-         x87_fld(cp->func, st0);     /* a a */
-         x87_fprndint( cp->func );   /* flr(a) a */
-         x87_fsubp(cp->func, st1);  /* frc(a) */
-         x87_fstp(cp->func, x86_make_disp(dst, i*4));
-      }
-   }
-
-   return TRUE;
-}
-
-
-
-
-
-
-static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   unsigned writemask = op->Dst[0].Register.WriteMask;
-   unsigned lit_count = cp->lit_count++;
-   struct x86_reg result, arg0;
-   unsigned i;
-
-#if 1
-   /* For absolute correctness, need to spill/invalidate all XMM regs
-    * too.  
-    */
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].dirty) 
-         spill(cp, i);
-      aos_release_xmm_reg(cp, i);
-   }
-#endif
-
-   if (writemask != TGSI_WRITEMASK_XYZW) 
-      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
-   else 
-      result = get_dst_ptr(cp, &op->Dst[0]);    
-
-   
-   arg0 = fetch_src( cp, &op->Src[0] );
-   if (arg0.file == file_XMM) {
-      struct x86_reg tmp = x86_make_disp(cp->machine_EDX, 
-                                         Offset(struct aos_machine, tmp[1]));
-      sse_movaps( cp->func, tmp, arg0 );
-      arg0 = tmp;
-   }
-                  
-      
-
-   /* Push caller-save (ie scratch) regs.  
-    */
-   x86_cdecl_caller_push_regs( cp->func );
-
-   /* Push the arguments:
-    */
-   x86_push_imm32( cp->func, lit_count );
-
-   x86_lea( cp->func, ecx, arg0 );
-   x86_push( cp->func, ecx );
-
-   x86_lea( cp->func, ecx, result );
-   x86_push( cp->func, ecx );
-
-   x86_push( cp->func, cp->machine_EDX );
-
-   if (lit_count < MAX_LIT_INFO) {
-      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, 
-                                             Offset(struct aos_machine, lit_info) + 
-                                             lit_count * sizeof(struct lit_info) + 
-                                             Offset(struct lit_info, func)));
-   }
-   else {
-      x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
-   }
-
-   x86_call( cp->func, ecx );
-            
-   x86_pop( cp->func, ecx );    /* fixme... */
-   x86_pop( cp->func, ecx );
-   x86_pop( cp->func, ecx );
-   x86_pop( cp->func, ecx );
-
-   x86_cdecl_caller_pop_regs( cp->func );
-
-   if (writemask != TGSI_WRITEMASK_XYZW) {
-      store_dest( cp, 
-                  &op->Dst[0],
-                  get_xmm_writable( cp, result ) );
-   }
-
-   return TRUE;
-}
-
-#if 0   
-static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]); 
-   unsigned writemask = op->Dst[0].Register.WriteMask;
-
-   if (writemask & TGSI_WRITEMASK_YZ) {
-      struct x86_reg st1 = x86_make_reg(file_x87, 1);
-      struct x86_reg st2 = x86_make_reg(file_x87, 2);
-
-      /* a1' = a1 <= 0 ? 1 : a1;  
-       */
-      x87_fldz(cp->func);                           /* 1 0  */
-#if 1
-      x87_fld1(cp->func);                           /* 1 0  */
-#else
-      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
-       */
-      x87_fldz(cp->func);                           /* 1 0  */
-#endif
-      x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0  */
-      x87_fcomi(cp->func, st2);	                    /* a1 1 0  */
-      x87_fcmovb(cp->func, st1);                    /* a1' 1 0  */
-      x87_fstp(cp->func, st1);                      /* a1' 0  */
-      x87_fstp(cp->func, st1);                      /* a1'  */
-
-      x87_fld_src(cp, &op->Src[0], 3); /* a3 a1'  */
-      x87_fxch(cp->func, st1);                      /* a1' a3  */
-      
-
-      /* Compute pow(a1, a3)
-       */
-      x87_fyl2x(cp->func);	/* a3*log2(a1)      */
-      x87_emit_ex2( cp );       /* 2^(a3*log2(a1))   */
-
-
-      /* a0' = max2(a0, 0):
-       */
-      x87_fldz(cp->func);                           /* 0 r2 */
-      x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */
-      x87_fcomi(cp->func, st1);	
-      x87_fcmovb(cp->func, st1);                    /* a0' 0 r2 */
-
-      x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
-
-      x87_fcomi(cp->func, st1);  /* a0' 0 r2 */
-      x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
-
-      x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
-      x87_fpop(cp->func);       /* r2 */
-      x87_fpop(cp->func);
-   }
-
-   if (writemask & TGSI_WRITEMASK_XW) {
-      x87_fld1(cp->func);
-      x87_fst_or_nop(cp->func, writemask, 0, dst);
-      x87_fstp_or_pop(cp->func, writemask, 3, dst);
-   }
-
-   return TRUE;
-}
-#endif
-
-
-
-static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_maxps(cp->func, dst, arg1);
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-
-static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_minps(cp->func, dst, arg1);
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   /* potentially nothing to do */
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_mulps(cp->func, dst, arg1);
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-
-static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg arg2 = fetch_src(cp, &op->Src[2]);
-
-   /* If we can't clobber old contents of arg0, get a temporary & copy
-    * it there, then clobber it...
-    */
-   arg0 = get_xmm_writable(cp, arg0);
-
-   sse_mulps(cp->func, arg0, arg1);
-   sse_addps(cp->func, arg0, arg2);
-   store_dest(cp, &op->Dst[0], arg0);
-   return TRUE;
-}
-
-
-
-/* A wrapper for powf().
- * Makes sure it is cdecl and operates on floats.
- */
-static float PIPE_CDECL _powerf( float x, float y )
-{
-#if FAST_MATH
-   return util_fast_pow(x, y);
-#else
-   return powf( x, y );
-#endif
-}
-
-#if FAST_MATH
-static float PIPE_CDECL _exp2(float x)
-{
-   return util_fast_exp2(x);
-}
-#endif
-
-
-/* Really not sufficient -- need to check for conditions that could
- * generate inf/nan values, which will slow things down hugely.
- */
-static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-#if 0
-   x87_fld_src(cp, &op->Src[1], 0);  /* a1.x */
-   x87_fld_src(cp, &op->Src[0], 0);	/* a0.x a1.x */
-   x87_fyl2x(cp->func);	                                /* a1*log2(a0) */
-
-   x87_emit_ex2( cp );		/* 2^(a1*log2(a0)) */
-
-   x87_fstp_dest4(cp, &op->Dst[0]);
-#else
-   uint i;
-
-   /* For absolute correctness, need to spill/invalidate all XMM regs
-    * too.  
-    */
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].dirty) 
-         spill(cp, i);
-      aos_release_xmm_reg(cp, i);
-   }
-
-   /* Push caller-save (ie scratch) regs.  
-    */
-   x86_cdecl_caller_push_regs( cp->func );
-
-   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
-
-   x87_fld_src( cp, &op->Src[1], 0 );
-   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
-   x87_fld_src( cp, &op->Src[0], 0 );
-   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
-
-   /* tmp_EAX has been pushed & will be restored below */
-   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
-   x86_call( cp->func, cp->tmp_EAX );
-
-   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
-
-   x86_cdecl_caller_pop_regs( cp->func );
-
-   /* Note retval on x87 stack:
-    */
-   cp->func->x87_stack++;
-
-   x87_fstp_dest4( cp, &op->Dst[0] );
-#endif
-   return TRUE;
-}
-
-
-#if FAST_MATH
-static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   uint i;
-
-   /* For absolute correctness, need to spill/invalidate all XMM regs
-    * too.  
-    */
-   for (i = 0; i < 8; i++) {
-      if (cp->xmm[i].dirty) 
-         spill(cp, i);
-      aos_release_xmm_reg(cp, i);
-   }
-
-   /* Push caller-save (ie scratch) regs.  
-    */
-   x86_cdecl_caller_push_regs( cp->func );
-
-   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );
-
-   x87_fld_src( cp, &op->Src[0], 0 );
-   x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
-
-   /* tmp_EAX has been pushed & will be restored below */
-   x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
-   x86_call( cp->func, cp->tmp_EAX );
-
-   x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );
-
-   x86_cdecl_caller_pop_regs( cp->func );
-
-   /* Note retval on x87 stack:
-    */
-   cp->func->x87_stack++;
-
-   x87_fstp_dest4( cp, &op->Dst[0] );
-
-   return TRUE;
-}
-#endif
-
-
-static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg dst = aos_get_xmm_reg(cp);
-
-   if (cp->have_sse2) {
-      sse2_rcpss(cp->func, dst, arg0);
-      /* extend precision here...
-       */
-   }
-   else {
-      struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
-      sse_movss(cp->func, dst, ones);
-      sse_divss(cp->func, dst, arg0);
-   }
-
-   store_scalar_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-
-/* Although rsqrtps() and rcpps() are low precision on some/all SSE
- * implementations, it is possible to improve its precision at
- * fairly low cost, using a newton/raphson step, as below:
- * 
- * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
- * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
- * or:
- *   x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
- * 
- *
- * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
- */
-static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   if (0) {
-      struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-      struct x86_reg r = aos_get_xmm_reg(cp);
-      sse_rsqrtss(cp->func, r, arg0);
-      store_scalar_dest(cp, &op->Dst[0], r);
-      return TRUE;
-   }
-   else {
-      struct x86_reg arg0           = fetch_src(cp, &op->Src[0]);
-      struct x86_reg r              = aos_get_xmm_reg(cp);
-
-      struct x86_reg neg_half       = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
-      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
-      struct x86_reg src            = get_xmm_writable( cp, arg0 );
-      struct x86_reg neg            = aos_get_internal(cp, IMM_NEGS);
-      struct x86_reg tmp            = aos_get_xmm_reg(cp);
-
-      sse_movaps(cp->func, tmp, src);
-      sse_mulps(cp->func, tmp, neg);
-      sse_maxps(cp->func, tmp, src);
-   
-      sse_rsqrtss( cp->func, r, tmp  );             /* rsqrtss(a) */
-      sse_mulss(   cp->func, tmp, neg_half  );      /* -.5 * a */
-      sse_mulss(   cp->func, tmp,  r );             /* -.5 * a * r */
-      sse_mulss(   cp->func, tmp,  r );             /* -.5 * a * r * r */
-      sse_addss(   cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */
-      sse_mulss(   cp->func, r,  tmp );             /* r * (1.5 - .5 * a * r * r) */
-
-      store_scalar_dest(cp, &op->Dst[0], r);
-
-      aos_release_xmm_reg(cp, tmp.idx);
-
-      return TRUE;
-   }
-}
-
-
-static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
-   sse_andps(cp->func, dst, ones);
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   x87_fld_src(cp, &op->Src[0], 0);
-   x87_fsin(cp->func);
-   x87_fstp_dest4(cp, &op->Dst[0]);
-   return TRUE;
-}
-
-
-
-static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-   
-   sse_cmpps(cp->func, dst, arg1, cc_LessThan);
-   sse_andps(cp->func, dst, ones);
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg dst = get_xmm_writable(cp, arg0);
-
-   sse_subps(cp->func, dst, arg1);
-
-   store_dest(cp, &op->Dst[0], dst);
-   return TRUE;
-}
-
-static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
-
-   sse2_cvttps2dq(cp->func, tmp0, arg0);
-   sse2_cvtdq2ps(cp->func, tmp0, tmp0);
-
-   store_dest(cp, &op->Dst[0], tmp0);
-   return TRUE;
-}
-
-static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
-{
-   struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
-   struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
-   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
-   struct x86_reg tmp1 = aos_get_xmm_reg(cp);
-
-   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
-   sse_mulps(cp->func, tmp1, arg0);
-   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
-   sse_mulps(cp->func, tmp0, arg1);
-   sse_subps(cp->func, tmp1, tmp0);
-   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
-
-/*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
-/*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
-/*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
-/*    dst[3] is undef */
-
-
-   aos_release_xmm_reg(cp, tmp0.idx);
-   store_dest(cp, &op->Dst[0], tmp1);
-   return TRUE;
-}
-
-
-
-static boolean
-emit_instruction( struct aos_compilation *cp,
-                  struct tgsi_full_instruction *inst )
-{
-   x87_assert_stack_empty(cp->func);
-
-   switch( inst->Instruction.Opcode ) {
-   case TGSI_OPCODE_MOV:
-      return emit_MOV( cp, inst );
-
-   case TGSI_OPCODE_LIT:
-      return emit_LIT(cp, inst);
-
-   case TGSI_OPCODE_RCP:
-      return emit_RCP(cp, inst);
-
-   case TGSI_OPCODE_RSQ:
-      return emit_RSQ(cp, inst);
-
-   case TGSI_OPCODE_EXP:
-      /*return emit_EXP(cp, inst);*/
-      return FALSE;
-
-   case TGSI_OPCODE_LOG:
-      /*return emit_LOG(cp, inst);*/
-      return FALSE;
-
-   case TGSI_OPCODE_MUL:
-      return emit_MUL(cp, inst);
-
-   case TGSI_OPCODE_ADD:
-      return emit_ADD(cp, inst);
-
-   case TGSI_OPCODE_DP3:
-      return emit_DP3(cp, inst);
-
-   case TGSI_OPCODE_DP4:
-      return emit_DP4(cp, inst);
-
-   case TGSI_OPCODE_DST:
-      return emit_DST(cp, inst);
-
-   case TGSI_OPCODE_MIN:
-      return emit_MIN(cp, inst);
-
-   case TGSI_OPCODE_MAX:
-      return emit_MAX(cp, inst);
-
-   case TGSI_OPCODE_SLT:
-      return emit_SLT(cp, inst);
-
-   case TGSI_OPCODE_SGE:
-      return emit_SGE(cp, inst);
-
-   case TGSI_OPCODE_MAD:
-      return emit_MAD(cp, inst);
-
-   case TGSI_OPCODE_SUB:
-      return emit_SUB(cp, inst);
- 
-   case TGSI_OPCODE_LRP:
-      /*return emit_LERP(cp, inst);*/
-      return FALSE;
-
-   case TGSI_OPCODE_FRC:
-      return emit_FRC(cp, inst);
-
-   case TGSI_OPCODE_CLAMP:
-      /*return emit_CLAMP(cp, inst);*/
-      return FALSE;
-
-   case TGSI_OPCODE_FLR:
-      return emit_FLR(cp, inst);
-
-   case TGSI_OPCODE_ROUND:
-      return emit_RND(cp, inst);
-
-   case TGSI_OPCODE_EX2:
-#if FAST_MATH
-      return emit_EXPBASE2(cp, inst);
-#elif 0
-      /* this seems to fail for "larger" exponents.
-       * See glean tvertProg1's EX2 test.
-       */
-      return emit_EX2(cp, inst);
-#else
-      return FALSE;
-#endif
-
-   case TGSI_OPCODE_LG2:
-      return emit_LG2(cp, inst);
-
-   case TGSI_OPCODE_POW:
-      return emit_POW(cp, inst);
-
-   case TGSI_OPCODE_XPD:
-      return emit_XPD(cp, inst);
-
-   case TGSI_OPCODE_ABS:
-      return emit_ABS(cp, inst);
-
-   case TGSI_OPCODE_DPH:
-      return emit_DPH(cp, inst);
-
-   case TGSI_OPCODE_COS:
-      return emit_COS(cp, inst);
-
-   case TGSI_OPCODE_SIN:
-      return emit_SIN(cp, inst);
-
-   case TGSI_OPCODE_TRUNC:
-      return emit_TRUNC(cp, inst);
-
-   case TGSI_OPCODE_END:
-      return TRUE;
-
-   default:
-      return FALSE;
-   }
-}
-
-
-static boolean emit_viewport( struct aos_compilation *cp )
-{
-   struct x86_reg pos = aos_get_shader_reg_xmm(cp, 
-                                               TGSI_FILE_OUTPUT, 
-                                               cp->vaos->draw->vs.position_output );
-
-   struct x86_reg scale = x86_make_disp(cp->machine_EDX, 
-                                        Offset(struct aos_machine, scale));
-
-   struct x86_reg translate = x86_make_disp(cp->machine_EDX, 
-                                        Offset(struct aos_machine, translate));
-
-   sse_mulps(cp->func, pos, scale);
-   sse_addps(cp->func, pos, translate);
-
-   aos_adopt_xmm_reg( cp,
-                      pos,
-                      TGSI_FILE_OUTPUT,
-                      cp->vaos->draw->vs.position_output,
-                      TRUE );
-   return TRUE;
-}
-
-
-/* This is useful to be able to see the results on softpipe.  Doesn't
- * do proper clipping, just assumes the backend can do it during
- * rasterization -- for debug only...
- */
-static boolean emit_rhw_viewport( struct aos_compilation *cp )
-{
-   struct x86_reg tmp = aos_get_xmm_reg(cp);
-   struct x86_reg pos = aos_get_shader_reg_xmm(cp, 
-                                               TGSI_FILE_OUTPUT, 
-                                               cp->vaos->draw->vs.position_output);
-
-   struct x86_reg scale = x86_make_disp(cp->machine_EDX, 
-                                        Offset(struct aos_machine, scale));
-
-   struct x86_reg translate = x86_make_disp(cp->machine_EDX, 
-                                        Offset(struct aos_machine, translate));
-
-
-
-   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
-   sse2_rcpss(cp->func, tmp, tmp);
-   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
-   
-   sse_mulps(cp->func, pos, scale);
-   sse_mulps(cp->func, pos, tmp);
-   sse_addps(cp->func, pos, translate);
-
-   /* Set pos[3] = w 
-    */
-   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
-
-   aos_adopt_xmm_reg( cp,
-                      pos,
-                      TGSI_FILE_OUTPUT,
-                      cp->vaos->draw->vs.position_output,
-                      TRUE );
-   return TRUE;
-}
-
-
-#if 0
-static boolean note_immediate( struct aos_compilation *cp,
-                               struct tgsi_full_immediate *imm )
-{
-   unsigned pos = cp->num_immediates++;
-   unsigned j;
-
-   assert( imm->Immediate.NrTokens <= 4 + 1 );
-   for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
-      cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
-   }
-
-   return TRUE;
-}
-#endif
-
-
-
-
-static void find_last_write_outputs( struct aos_compilation *cp )
-{
-   struct tgsi_parse_context parse;
-   unsigned this_instruction = 0;
-   unsigned i;
-
-   tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
-
-   while (!tgsi_parse_end_of_tokens( &parse )) {
-      
-      tgsi_parse_token( &parse );
-
-      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) 
-         continue;
-
-      for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
-         if (parse.FullToken.FullInstruction.Dst[i].Register.File ==
-             TGSI_FILE_OUTPUT) 
-         {
-            unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index;
-            cp->output_last_write[idx] = this_instruction;
-         }
-      }
-
-      this_instruction++;
-   }
-
-   tgsi_parse_free( &parse );
-}
-
-
-#define ARG_MACHINE    1
-#define ARG_START_ELTS 2
-#define ARG_COUNT      3
-#define ARG_OUTBUF     4
-
-
-static boolean build_vertex_program( struct draw_vs_variant_aos_sse *variant,
-                                     boolean linear )
-{ 
-   struct tgsi_parse_context parse;
-   struct aos_compilation cp;
-   unsigned fixup, label;
-
-   util_init_math();
-
-   tgsi_parse_init( &parse, variant->base.vs->state.tokens );
-
-   memset(&cp, 0, sizeof(cp));
-
-   cp.insn_counter = 1;
-   cp.vaos = variant;
-   cp.have_sse2 = 1;
-   cp.func = &variant->func[ linear ? 0 : 1 ];
-
-   cp.tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
-   cp.idx_EBX      = x86_make_reg(file_REG32, reg_BX);
-   cp.outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
-   cp.machine_EDX   = x86_make_reg(file_REG32, reg_DX);
-   cp.count_ESI     = x86_make_reg(file_REG32, reg_SI);
-   cp.temp_EBP     = x86_make_reg(file_REG32, reg_BP);
-   cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
-
-   x86_init_func(cp.func);
-
-   find_last_write_outputs(&cp);
-
-   x86_push(cp.func, cp.idx_EBX);
-   x86_push(cp.func, cp.count_ESI);
-   x86_push(cp.func, cp.temp_EBP);
-
-
-   /* Load arguments into regs:
-    */
-   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
-   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
-   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
-   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
-
-
-   /* Compare count to zero and possibly bail.
-    */
-   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
-   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
-   fixup = x86_jcc_forward(cp.func, cc_E);
-
-
-   save_fpu_state( &cp );
-   set_fpu_round_nearest( &cp );
-
-   aos_init_inputs( &cp, linear );
-
-   cp.x86_reg[0] = 0;
-   cp.x86_reg[1] = 0;
-   
-   /* Note address for loop jump 
-    */
-   label = x86_get_label(cp.func);
-   {
-      /* Fetch inputs...  TODO:  fetch lazily...
-       */
-      if (!aos_fetch_inputs( &cp, linear ))
-         goto fail;
-
-      /* Emit the shader:
-       */
-      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) 
-      {
-         tgsi_parse_token( &parse );
-
-         switch (parse.FullToken.Token.Type) {
-         case TGSI_TOKEN_TYPE_IMMEDIATE:
-#if 0
-            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
-               goto fail;
-#endif
-            break;
-
-         case TGSI_TOKEN_TYPE_INSTRUCTION:
-            if (DISASSEM)
-               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
-
-            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
-               goto fail;
-            break;
-         }
-
-         x87_assert_stack_empty(cp.func);
-         cp.insn_counter++;
-
-         if (DISASSEM)
-            debug_printf("\n");
-      }
-
-   
-      {
-         unsigned i;
-         for (i = 0; i < 8; i++) {
-            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
-               cp.xmm[i].file = TGSI_FILE_NULL;
-               cp.xmm[i].dirty = 0;
-            }
-         }
-      }
-
-      if (cp.error)
-         goto fail;
-
-      if (cp.vaos->base.key.clip) {
-         /* not really handling clipping, just do the rhw so we can
-          * see the results...
-          */
-         emit_rhw_viewport(&cp); 
-      }
-      else if (cp.vaos->base.key.viewport) {
-         emit_viewport(&cp);
-      }
-
-      /* Emit output...  TODO: do this eagerly after the last write to a
-       * given output.
-       */
-      if (!aos_emit_outputs( &cp ))
-         goto fail;
-
-
-      /* Next vertex:
-       */
-      x86_lea(cp.func, 
-              cp.outbuf_ECX, 
-              x86_make_disp(cp.outbuf_ECX, 
-                            cp.vaos->base.key.output_stride));
-
-      /* Incr index
-       */   
-      aos_incr_inputs( &cp, linear );
-   }
-   /* decr count, loop if not zero
-    */
-   x86_dec(cp.func, cp.count_ESI);
-   x86_jcc(cp.func, cc_NZ, label);
-
-   restore_fpu_state(&cp);
-
-   /* Land forward jump here:
-    */
-   x86_fixup_fwd_jump(cp.func, fixup);
-
-   /* Exit mmx state?
-    */
-   if (cp.func->need_emms)
-      mmx_emms(cp.func);
-
-   x86_pop(cp.func, cp.temp_EBP);
-   x86_pop(cp.func, cp.count_ESI);
-   x86_pop(cp.func, cp.idx_EBX);
-
-   x87_assert_stack_empty(cp.func);
-   x86_ret(cp.func);
-
-   tgsi_parse_free( &parse );
-   return !cp.error;
-
- fail:
-   tgsi_parse_free( &parse );
-   return FALSE;
-}
-
-
-/** cast wrapper */
-static INLINE struct draw_vs_variant_aos_sse *
-draw_vs_variant_aos_sse(struct draw_vs_variant *variant)
-{
-   return (struct draw_vs_variant_aos_sse *) variant;
-}
-
-
-static void vaos_set_buffer( struct draw_vs_variant *variant,
-                             unsigned buf,
-                             const void *ptr,
-                             unsigned stride,
-                             unsigned max_stride)
-{
-   struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-
-   if (buf < vaos->nr_vb) {
-      vaos->buffer[buf].base_ptr = (char *)ptr;
-      vaos->buffer[buf].stride = stride;
-   }
-
-   if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
-}
-
-
-
-static void PIPE_CDECL vaos_run_elts( struct draw_vs_variant *variant,
-                                      const unsigned *elts,
-                                      unsigned count,
-                                      void *output_buffer )
-{
-   struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-   struct aos_machine *machine = vaos->draw->vs.aos_machine;
-   unsigned i;
-
-   if (0) debug_printf("%s %d\n", __FUNCTION__, count);
-
-   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
-   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
-      machine->constants[i] = vaos->draw->vs.aligned_constants[i];
-   }
-   machine->immediates = vaos->base.vs->immediates;
-   machine->buffer = vaos->buffer;
-
-   vaos->gen_run_elts( machine,
-                       elts,
-                       count,
-                       output_buffer );
-}
-
-static void PIPE_CDECL vaos_run_linear( struct draw_vs_variant *variant,
-                                        unsigned start,
-                                        unsigned count,
-                                        void *output_buffer )
-{
-   struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-   struct aos_machine *machine = vaos->draw->vs.aos_machine;
-   unsigned i;
-
-   if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count, 
-                       vaos->base.key.const_vbuffers);
-
-   machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
-   for (i = 0; i < PIPE_MAX_CONSTANT_BUFFERS; i++) {
-      machine->constants[i] = vaos->draw->vs.aligned_constants[i];
-   }
-   machine->immediates = vaos->base.vs->immediates;
-   machine->buffer = vaos->buffer;
-
-   vaos->gen_run_linear( machine,
-                         start,
-                         count,
-                         output_buffer );
-
-   /* Sanity spot checks to make sure we didn't trash our constants */
-   assert(machine->internal[IMM_ONES][0] == 1.0f);
-   assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
-   assert(machine->internal[IMM_NEGS][0] == -1.0f);
-}
-
-
-
-static void vaos_destroy( struct draw_vs_variant *variant )
-{
-   struct draw_vs_variant_aos_sse *vaos = draw_vs_variant_aos_sse(variant);
-
-   FREE( vaos->buffer );
-
-   x86_release_func( &vaos->func[0] );
-   x86_release_func( &vaos->func[1] );
-
-   FREE(vaos);
-}
-
-
-
-static struct draw_vs_variant *variant_aos_sse( struct draw_vertex_shader *vs,
-                                                 const struct draw_vs_variant_key *key )
-{
-   unsigned i;
-   struct draw_vs_variant_aos_sse *vaos = CALLOC_STRUCT(draw_vs_variant_aos_sse);
-
-   if (!vaos)
-      goto fail;
-   
-   vaos->base.key = *key;
-   vaos->base.vs = vs;
-   vaos->base.set_buffer = vaos_set_buffer;
-   vaos->base.destroy = vaos_destroy;
-   vaos->base.run_linear = vaos_run_linear;
-   vaos->base.run_elts = vaos_run_elts;
-
-   vaos->draw = vs->draw;
-
-   for (i = 0; i < key->nr_inputs; i++) 
-      vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
-
-   vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
-   if (!vaos->buffer)
-      goto fail;
-
-   if (0)
-      debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
-
-#if 0
-   tgsi_dump(vs->state.tokens, 0);
-#endif
-
-   if (!build_vertex_program( vaos, TRUE ))
-      goto fail;
-
-   if (!build_vertex_program( vaos, FALSE ))
-      goto fail;
-
-   vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
-   if (!vaos->gen_run_linear)
-      goto fail;
-
-   vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
-   if (!vaos->gen_run_elts)
-      goto fail;
-
-   return &vaos->base;
-
- fail:
-   if (vaos && vaos->buffer)
-      FREE(vaos->buffer);
-
-   if (vaos)
-      x86_release_func( &vaos->func[0] );
-
-   if (vaos)
-      x86_release_func( &vaos->func[1] );
-
-   FREE(vaos);
-   
-   return NULL;
-}
-
-
-struct draw_vs_variant *
-draw_vs_create_variant_aos_sse( struct draw_vertex_shader *vs,
-                                const struct draw_vs_variant_key *key )
-{
-   struct draw_vs_variant *variant = variant_aos_sse( vs, key );
-
-   if (variant == NULL) {
-      variant = draw_vs_create_variant_generic( vs, key );
-   }
-
-   return variant;
-}
-
-
-
-#endif /* PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
deleted file mode 100644
index 55e63d8b9fa..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos.h
+++ /dev/null
@@ -1,255 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/* Authors:  Keith Whitwell <[email protected]>
- */
-
-#ifndef DRAW_VS_AOS_H
-#define DRAW_VS_AOS_H
-
-#include "pipe/p_config.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-
-#ifdef PIPE_ARCH_X86
-
-struct tgsi_token;
-struct x86_function;
-
-#include "pipe/p_state.h"
-#include "rtasm/rtasm_x86sse.h"
-
-
-
-
-
-#define X    0
-#define Y    1
-#define Z    2
-#define W    3
-
-#define MAX_INPUTS     PIPE_MAX_ATTRIBS
-#define MAX_OUTPUTS    PIPE_MAX_SHADER_OUTPUTS
-#define MAX_TEMPS      TGSI_EXEC_NUM_TEMPS
-#define MAX_CONSTANTS  1024  /** only used for sanity checking */
-#define MAX_IMMEDIATES 1024  /** only used for sanity checking */
-#define MAX_INTERNALS  8     /** see IMM_x values below */
-
-#define AOS_FILE_INTERNAL TGSI_FILE_COUNT
-
-#define FPU_RND_NEG    1
-#define FPU_RND_NEAREST 2
-
-struct aos_machine;
-typedef void (PIPE_CDECL *lit_func)( struct aos_machine *,
-                                    float *result,
-                                    const float *in,
-                                    unsigned count );
-
-void PIPE_CDECL aos_do_lit( struct aos_machine *machine,
-                            float *result,
-                            const float *in,
-                            unsigned count );
-
-struct shine_tab {
-   float exponent;
-   float values[258];
-   unsigned last_used;
-};
-
-struct lit_info {
-   lit_func func;
-   struct shine_tab *shine_tab;
-};
-
-#define MAX_SHINE_TAB    4
-#define MAX_LIT_INFO     16
-
-struct aos_buffer {
-   const void *base_ptr;
-   unsigned stride;
-   void *ptr;                   /* updated per vertex */
-};
-
-
-
-
-/* This is the temporary storage used by all the aos_sse vs variants.
- * Create one per context and reuse by passing a pointer in at
- * vs_variant creation??
- */
-struct aos_machine {
-   float input    [MAX_INPUTS    ][4];
-   float output   [MAX_OUTPUTS   ][4];
-   float temp     [MAX_TEMPS     ][4];
-   float internal [MAX_INTERNALS ][4];
-
-   float scale[4];              /* viewport */
-   float translate[4];          /* viewport */
-
-   float tmp[2][4];             /* scratch space for LIT */
-
-   struct shine_tab shine_tab[MAX_SHINE_TAB];
-   struct lit_info  lit_info[MAX_LIT_INFO];
-   unsigned now;
-   
-
-   ushort fpu_rnd_nearest;
-   ushort fpu_rnd_neg_inf;
-   ushort fpu_restore;
-   ushort fpucntl;              /* one of FPU_* above */
-
-   const float (*immediates)[4];     /* points to shader data */
-   const void *constants[PIPE_MAX_CONSTANT_BUFFERS]; /* points to draw data */
-
-   const struct aos_buffer *buffer; /* points to ? */
-};
-
-
-
-
-struct aos_compilation {
-   struct x86_function *func;
-   struct draw_vs_variant_aos_sse *vaos;
-
-   unsigned insn_counter;
-   unsigned num_immediates;
-   unsigned count;
-   unsigned lit_count;
-
-   struct {
-      unsigned idx:16;
-      unsigned file:8;
-      unsigned dirty:8;
-      unsigned last_used;
-   } xmm[8];
-
-   unsigned x86_reg[2];                /* one of X86_* */
-
-   boolean input_fetched[PIPE_MAX_ATTRIBS];
-   unsigned output_last_write[PIPE_MAX_ATTRIBS];
-
-   boolean have_sse2;
-   boolean error;
-   short fpucntl;
-
-   /* these are actually known values, but putting them in a struct
-    * like this is helpful to keep them in sync across the file.
-    */
-   struct x86_reg tmp_EAX;
-   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
-   struct x86_reg outbuf_ECX;
-   struct x86_reg machine_EDX;
-   struct x86_reg count_ESI;    /* decrements to zero */
-   struct x86_reg temp_EBP;
-   struct x86_reg stack_ESP;
-};
-
-struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp );
-void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx );
-
-void aos_adopt_xmm_reg( struct aos_compilation *cp,
-                        struct x86_reg reg,
-                        unsigned file,
-                        unsigned idx,
-                        unsigned dirty );
-
-void aos_spill_all( struct aos_compilation *cp );
-
-struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
-                                   unsigned file,
-                                   unsigned idx );
-
-boolean aos_init_inputs( struct aos_compilation *cp, boolean linear );
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear );
-boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear );
-
-boolean aos_emit_outputs( struct aos_compilation *cp );
-
-
-#define IMM_ONES     0              /* 1, 1,1,1 */
-#define IMM_SWZ      1              /* 1,-1,0, 0xffffffff */
-#define IMM_IDENTITY 2              /* 0, 0,0,1 */
-#define IMM_INV_255  3              /* 1/255, 1/255, 1/255, 1/255 */
-#define IMM_255      4              /* 255, 255, 255, 255 */
-#define IMM_NEGS     5              /* -1,-1,-1,-1 */
-#define IMM_RSQ      6              /* -.5,1.5,_,_ */
-#define IMM_PSIZE    7              /* not really an immediate - updated each run */
-
-struct x86_reg aos_get_internal( struct aos_compilation *cp,
-                                 unsigned imm );
-struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
-                                     unsigned imm );
-
-
-#define AOS_ERROR(cp, msg)                                                  \
-do {                                                                    \
-   if (0) debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \
-   cp->error = 1;                                                       \
-} while (0)
-
-
-#define X86_NULL       0
-#define X86_IMMEDIATES 1
-#define X86_CONSTANTS  2
-#define X86_BUFFERS    3
-
-struct x86_reg aos_get_x86( struct aos_compilation *cp,
-                            unsigned which_reg,
-                            unsigned value );
-
-
-typedef void (PIPE_CDECL *vaos_run_elts_func)( struct aos_machine *,
-                                               const unsigned *elts,
-                                               unsigned count,
-                                               void *output_buffer);
-
-typedef void (PIPE_CDECL *vaos_run_linear_func)( struct aos_machine *,
-                                                unsigned start,
-                                                unsigned count,
-                                                void *output_buffer);
-
-
-struct draw_vs_variant_aos_sse {
-   struct draw_vs_variant base;
-   struct draw_context *draw;
-
-   struct aos_buffer *buffer;
-   unsigned nr_vb;
-
-   vaos_run_linear_func gen_run_linear;
-   vaos_run_elts_func gen_run_elts;
-
-
-   struct x86_function func[2];
-};
-
-
-#endif
-
-#endif 
-
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
deleted file mode 100644
index f1dd4487732..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos_io.c
+++ /dev/null
@@ -1,460 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "util/u_memory.h"
-#include "pipe/p_shader_tokens.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-#include "draw_vertex.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-#ifdef PIPE_ARCH_X86
-
-/* Note - don't yet have to worry about interacting with the code in
- * draw_vs_aos.c as there is no intermingling of generated code...
- * That may have to change, we'll see.
- */
-static void emit_load_R32G32B32A32( struct aos_compilation *cp, 			   
-				    struct x86_reg data,
-				    struct x86_reg src_ptr )
-{
-   sse_movups(cp->func, data, src_ptr);
-}
-
-static void emit_load_R32G32B32( struct aos_compilation *cp, 			   
-				 struct x86_reg data,
-				 struct x86_reg src_ptr )
-{
-#if 1
-   sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
-   /* data = z ? ? ? */
-   sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
-   /* data = z ? 0 1 */
-   sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) );
-   /* data = ? 0 z 1 */
-   sse_movlps(cp->func, data, src_ptr);
-   /* data = x y z 1 */
-#else
-   sse_movups(cp->func, data, src_ptr);
-   /* data = x y z ? */
-   sse2_pshufd(cp->func, data, data, SHUF(W,X,Y,Z) );
-   /* data = ? x y z */
-   sse_movss(cp->func, data, aos_get_internal_xmm( cp, IMM_ONES ) );
-   /* data = 1 x y z */
-   sse2_pshufd(cp->func, data, data, SHUF(Y,Z,W,X) );
-   /* data = x y z 1 */
-#endif
-}
-
-static void emit_load_R32G32( struct aos_compilation *cp, 
-			   struct x86_reg data,
-			   struct x86_reg src_ptr )
-{
-   sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
-   sse_movlps(cp->func, data, src_ptr);
-}
-
-
-static void emit_load_R32( struct aos_compilation *cp, 
-			   struct x86_reg data,
-			   struct x86_reg src_ptr )
-{
-   sse_movss(cp->func, data, src_ptr);
-   sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
-}
-
-
-static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
-				       struct x86_reg data,
-				       struct x86_reg src_ptr )
-{
-   sse_movss(cp->func, data, src_ptr);
-   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
-   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
-   sse2_cvtdq2ps(cp->func, data, data);
-   sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255));
-}
-
-
-
-/* Extended swizzles?  Maybe later.
- */  
-static void emit_swizzle( struct aos_compilation *cp,
-			  struct x86_reg dest,
-			  struct x86_reg src,
-			  ubyte shuffle )
-{
-   sse_shufps(cp->func, dest, src, shuffle);
-}
-
-
-
-static boolean get_buffer_ptr( struct aos_compilation *cp,
-                               boolean linear,
-                               unsigned buf_idx,
-                               struct x86_reg elt,
-                               struct x86_reg ptr)
-{
-   struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
-                                      buf_idx * sizeof(struct aos_buffer));
-
-   struct x86_reg buf_stride = x86_make_disp(buf, 
-                                             Offset(struct aos_buffer, stride));
-   if (linear) {
-      struct x86_reg buf_ptr = x86_make_disp(buf, 
-                                             Offset(struct aos_buffer, ptr));
-
-
-      /* Calculate pointer to current attrib:
-       */
-      x86_mov(cp->func, ptr, buf_ptr);
-      x86_mov(cp->func, elt, buf_stride);
-      x86_add(cp->func, elt, ptr);
-      if (buf_idx == 0) sse_prefetchnta(cp->func, x86_make_disp(elt, 192));
-      x86_mov(cp->func, buf_ptr, elt);
-   }
-   else {
-      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
-                                                  Offset(struct aos_buffer, base_ptr));
-
-
-      /* Calculate pointer to current attrib:
-       */
-      x86_mov(cp->func, ptr, buf_stride);
-      x86_imul(cp->func, ptr, elt);
-      x86_add(cp->func, ptr, buf_base_ptr);
-   }
-
-   cp->insn_counter++;
-
-   return TRUE;
-}
-
-
-static boolean load_input( struct aos_compilation *cp,
-                           unsigned idx,
-                           struct x86_reg bufptr )
-{
-   unsigned format = cp->vaos->base.key.element[idx].in.format;
-   unsigned offset = cp->vaos->base.key.element[idx].in.offset;
-   struct x86_reg dataXMM = aos_get_xmm_reg(cp);
-
-   /* Figure out source pointer address:
-    */
-   struct x86_reg src = x86_make_disp(bufptr, offset);
-
-   aos_adopt_xmm_reg( cp,
-                      dataXMM,
-                      TGSI_FILE_INPUT,
-                      idx,
-                      TRUE );
-
-   switch (format) {
-   case PIPE_FORMAT_R32_FLOAT:
-      emit_load_R32(cp, dataXMM, src);
-      break;
-   case PIPE_FORMAT_R32G32_FLOAT:
-      emit_load_R32G32(cp, dataXMM, src);
-      break;
-   case PIPE_FORMAT_R32G32B32_FLOAT:
-      emit_load_R32G32B32(cp, dataXMM, src);
-      break;
-   case PIPE_FORMAT_R32G32B32A32_FLOAT:
-      emit_load_R32G32B32A32(cp, dataXMM, src);
-      break;
-   case PIPE_FORMAT_A8R8G8B8_UNORM:
-      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
-      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      break;
-   case PIPE_FORMAT_R8G8B8A8_UNORM:
-      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
-      break;
-   default:
-      AOS_ERROR(cp, "unhandled input format");
-      return FALSE;
-   }
-
-   return TRUE;
-}
-
-static boolean load_inputs( struct aos_compilation *cp,
-                            unsigned buffer,
-                            struct x86_reg ptr )
-{
-   unsigned i;
-
-   for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
-      if (cp->vaos->base.key.element[i].in.buffer == buffer) {
-
-         if (!load_input( cp, i, ptr ))
-            return FALSE;
-
-         cp->insn_counter++;
-      }
-   }
-   
-   return TRUE;
-}
-
-boolean aos_init_inputs( struct aos_compilation *cp, boolean linear )
-{
-   unsigned i;
-   for (i = 0; i < cp->vaos->nr_vb; i++) {
-      struct x86_reg buf = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
-                                         i * sizeof(struct aos_buffer));
-
-      struct x86_reg buf_base_ptr = x86_make_disp(buf, 
-                                                  Offset(struct aos_buffer, base_ptr));
-
-      if (cp->vaos->base.key.const_vbuffers & (1<<i)) {
-         struct x86_reg ptr = cp->tmp_EAX;
-
-         x86_mov(cp->func, ptr, buf_base_ptr);
-
-         /* Load all inputs for this constant vertex buffer
-          */
-         load_inputs( cp, i, x86_deref(ptr) );
-         
-         /* Then just force them out to aos_machine.input[]
-          */
-         aos_spill_all( cp );
-
-      }
-      else if (linear) {
-
-         struct x86_reg elt = cp->idx_EBX;
-         struct x86_reg ptr = cp->tmp_EAX;
-
-         struct x86_reg buf_stride = x86_make_disp(buf, 
-                                                   Offset(struct aos_buffer, stride));
-
-         struct x86_reg buf_ptr = x86_make_disp(buf, 
-                                                Offset(struct aos_buffer, ptr));
-
-
-         /* Calculate pointer to current attrib:
-          */
-         x86_mov(cp->func, ptr, buf_stride);
-         x86_imul(cp->func, ptr, elt);
-         x86_add(cp->func, ptr, buf_base_ptr);
-
-
-         /* In the linear case, keep the buffer pointer instead of the
-          * index number.
-          */
-         if (cp->vaos->nr_vb == 1) 
-            x86_mov( cp->func, elt, ptr );
-         else
-            x86_mov( cp->func, buf_ptr, ptr );
-
-         cp->insn_counter++;
-      }
-   }
-
-   return TRUE;
-}
-
-boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
-{
-   unsigned j;
-
-   for (j = 0; j < cp->vaos->nr_vb; j++) {
-      if (cp->vaos->base.key.const_vbuffers & (1<<j)) {
-         /* just retreive pre-transformed input */
-      }
-      else if (linear && cp->vaos->nr_vb == 1) {
-         load_inputs( cp, 0, cp->idx_EBX );
-      }
-      else {
-         struct x86_reg elt = linear ? cp->idx_EBX : x86_deref(cp->idx_EBX);
-         struct x86_reg ptr = cp->tmp_EAX;
-
-         if (!get_buffer_ptr( cp, linear, j, elt, ptr ))
-            return FALSE;
-
-         if (!load_inputs( cp, j, ptr ))
-            return FALSE;
-      }
-   }
-
-   return TRUE;
-}
-
-boolean aos_incr_inputs( struct aos_compilation *cp, boolean linear )
-{
-   if (linear && cp->vaos->nr_vb == 1) {
-      struct x86_reg stride = x86_make_disp(aos_get_x86( cp, 0, X86_BUFFERS ), 
-                                            (0 * sizeof(struct aos_buffer) + 
-                                             Offset(struct aos_buffer, stride)));
-
-      x86_add(cp->func, cp->idx_EBX, stride);
-      sse_prefetchnta(cp->func, x86_make_disp(cp->idx_EBX, 192));
-   }
-   else if (linear) {
-      /* Nothing to do */
-   } 
-   else {
-      x86_lea(cp->func, cp->idx_EBX, x86_make_disp(cp->idx_EBX, 4));
-   }
-
-   return TRUE;
-}
-
-
-
-
-
-
-static void emit_store_R32G32B32A32( struct aos_compilation *cp, 			   
-				     struct x86_reg dst_ptr,
-				     struct x86_reg dataXMM )
-{
-   sse_movups(cp->func, dst_ptr, dataXMM);
-}
-
-static void emit_store_R32G32B32( struct aos_compilation *cp, 
-				  struct x86_reg dst_ptr,
-				  struct x86_reg dataXMM )
-{
-   sse_movlps(cp->func, dst_ptr, dataXMM);
-   sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
-   sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM);
-}
-
-static void emit_store_R32G32( struct aos_compilation *cp, 
-			       struct x86_reg dst_ptr,
-			       struct x86_reg dataXMM )
-{
-   sse_movlps(cp->func, dst_ptr, dataXMM);
-}
-
-static void emit_store_R32( struct aos_compilation *cp, 
-			    struct x86_reg dst_ptr,
-			    struct x86_reg dataXMM )
-{
-   sse_movss(cp->func, dst_ptr, dataXMM);
-}
-
-
-
-static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp,
-				       struct x86_reg dst_ptr,
-				       struct x86_reg dataXMM )
-{
-   sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255));
-   sse2_cvtps2dq(cp->func, dataXMM, dataXMM);
-   sse2_packssdw(cp->func, dataXMM, dataXMM);
-   sse2_packuswb(cp->func, dataXMM, dataXMM);
-   sse_movss(cp->func, dst_ptr, dataXMM);
-}
-
-
-
-
-
-static boolean emit_output( struct aos_compilation *cp,
-                            struct x86_reg ptr,
-                            struct x86_reg dataXMM, 
-                            enum attrib_emit format )
-{
-   switch (format) {
-   case EMIT_1F:
-   case EMIT_1F_PSIZE:
-      emit_store_R32(cp, ptr, dataXMM);
-      break;
-   case EMIT_2F:
-      emit_store_R32G32(cp, ptr, dataXMM);
-      break;
-   case EMIT_3F:
-      emit_store_R32G32B32(cp, ptr, dataXMM);
-      break;
-   case EMIT_4F:
-      emit_store_R32G32B32A32(cp, ptr, dataXMM);
-      break;
-   case EMIT_4UB:
-      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
-      break;
-   case EMIT_4UB_BGRA:
-      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
-      emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
-      break;
-   default:
-      AOS_ERROR(cp, "unhandled output format");
-      return FALSE;
-   }
-
-   return TRUE;
-}
-
-
-
-boolean aos_emit_outputs( struct aos_compilation *cp )
-{
-   unsigned i;
-   
-   for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) {
-      enum attrib_emit format = cp->vaos->base.key.element[i].out.format;
-      unsigned offset = cp->vaos->base.key.element[i].out.offset;
-      unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output;
-
-      struct x86_reg data;
-
-      if (format == EMIT_1F_PSIZE) {
-         data = aos_get_internal_xmm( cp, IMM_PSIZE );
-      }
-      else {
-         data = aos_get_shader_reg( cp, 
-                                    TGSI_FILE_OUTPUT,
-                                    vs_output );
-      }
-
-      if (data.file != file_XMM) {
-         struct x86_reg tmp = aos_get_xmm_reg( cp );
-         sse_movaps(cp->func, tmp, data);
-         data = tmp;
-      }
-      
-      if (!emit_output( cp, 
-                        x86_make_disp( cp->outbuf_ECX, offset ),
-                        data, 
-                        format ))
-         return FALSE;
-
-      aos_release_xmm_reg( cp, data.idx );
-
-      cp->insn_counter++;
-   }
-
-   return TRUE;
-}
-
-#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c b/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
deleted file mode 100644
index 0eda414ee6a..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_aos_machine.c
+++ /dev/null
@@ -1,328 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#include "pipe/p_config.h"
-
-
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_exec.h"
-#include "draw_vs.h"
-#include "draw_vs_aos.h"
-#include "draw_vertex.h"
-
-#ifdef PIPE_ARCH_X86
-
-#include "rtasm/rtasm_x86sse.h"
-
-
-#define X87_CW_EXCEPTION_INV_OP       (1<<0)
-#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
-#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
-#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
-#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
-#define X87_CW_EXCEPTION_PRECISION    (1<<5)
-#define X87_CW_PRECISION_SINGLE       (0<<8)
-#define X87_CW_PRECISION_RESERVED     (1<<8)
-#define X87_CW_PRECISION_DOUBLE       (2<<8)
-#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
-#define X87_CW_PRECISION_MASK         (3<<8)
-#define X87_CW_ROUND_NEAREST          (0<<10)
-#define X87_CW_ROUND_DOWN             (1<<10)
-#define X87_CW_ROUND_UP               (2<<10)
-#define X87_CW_ROUND_ZERO             (3<<10)
-#define X87_CW_ROUND_MASK             (3<<10)
-#define X87_CW_INFINITY               (1<<12)
-
-
-void PIPE_CDECL aos_do_lit( struct aos_machine *machine,
-                            float *result,
-                            const float *in,
-                            unsigned count )
-{
-   if (in[0] > 0) 
-   {
-      if (in[1] <= 0.0) 
-      {
-         result[0] = 1.0F;
-         result[1] = in[0];
-         result[2] = 0.0F;
-         result[3] = 1.0F;
-      }
-      else
-      {
-         const float epsilon = 1.0F / 256.0F;    
-         float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
-         result[0] = 1.0F;
-         result[1] = in[0];
-         result[2] = powf(in[1], exponent);
-         result[3] = 1.0;
-      }
-   }
-   else 
-   {
-      result[0] = 1.0F;
-      result[1] = 0.0;
-      result[2] = 0.0;
-      result[3] = 1.0F;
-   }
-}
-
-
-static void PIPE_CDECL do_lit_lut( struct aos_machine *machine,
-                                   float *result,
-                                   const float *in,
-                                   unsigned count )
-{
-   if (in[0] > 0) 
-   {
-      if (in[1] <= 0.0) 
-      {
-         result[0] = 1.0F;
-         result[1] = in[0];
-         result[2] = 0.0F;
-         result[3] = 1.0F;
-         return;
-      }
-      
-      if (machine->lit_info[count].shine_tab->exponent != in[3]) {
-         machine->lit_info[count].func = aos_do_lit;
-         goto no_luck;
-      }
-
-      if (in[1] <= 1.0)
-      {
-         const float *tab = machine->lit_info[count].shine_tab->values;
-         float f = in[1] * 256;
-         int k = (int)f;
-         float frac = f - (float)k;
-         
-         result[0] = 1.0F;
-         result[1] = in[0];
-         result[2] = tab[k] + frac*(tab[k+1]-tab[k]);
-         result[3] = 1.0;
-         return;
-      }
-      
-   no_luck:
-      {
-         const float epsilon = 1.0F / 256.0F;    
-         float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
-         result[0] = 1.0F;
-         result[1] = in[0];
-         result[2] = powf(in[1], exponent);
-         result[3] = 1.0;
-      }
-   }
-   else 
-   {
-      result[0] = 1.0F;
-      result[1] = 0.0;
-      result[2] = 0.0;
-      result[3] = 1.0F;
-   }
-}
-
-
-static void do_populate_lut( struct shine_tab *tab,
-                             float unclamped_exponent )
-{
-   const float epsilon = 1.0F / 256.0F;    
-   float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon));
-   unsigned i;
-
-   tab->exponent = unclamped_exponent; /* for later comparison */
-   
-   tab->values[0] = 0;
-   if (exponent == 0) {
-      for (i = 1; i < 258; i++) {
-         tab->values[i] = 1.0;
-      }      
-   }
-   else {
-      for (i = 1; i < 258; i++) {
-         tab->values[i] = powf((float)i * epsilon, exponent);
-      }
-   }
-}
-
-
-
-
-static void PIPE_CDECL populate_lut( struct aos_machine *machine,
-                                     float *result,
-                                     const float *in,
-                                     unsigned count )
-{
-   unsigned i, tab;
-
-   /* Search for an existing table for this value.  Note that without
-    * static analysis we don't really know if in[3] will be constant,
-    * but it usually is...
-    */
-   for (tab = 0; tab < 4; tab++) {
-      if (machine->shine_tab[tab].exponent == in[3]) {
-         goto found;
-      }
-   }
-
-   for (tab = 0, i = 1; i < 4; i++) {
-      if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used)
-         tab = i;
-   }
-
-   if (machine->shine_tab[tab].last_used == machine->now) {
-      /* No unused tables (this is not a ffvertex program...).  Just
-       * call pow each time:
-       */
-      machine->lit_info[count].func = aos_do_lit;
-      machine->lit_info[count].func( machine, result, in, count );
-      return;
-   }
-   else {
-      do_populate_lut( &machine->shine_tab[tab], in[3] );
-   }
-
- found:
-   machine->shine_tab[tab].last_used = machine->now;
-   machine->lit_info[count].shine_tab = &machine->shine_tab[tab];
-   machine->lit_info[count].func = do_lit_lut;
-   machine->lit_info[count].func( machine, result, in, count );
-}
-
-
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
-                              unsigned slot,
-                              const void *constants)
-{
-   machine->constants[slot] = constants;
-
-   {
-      unsigned i;
-      for (i = 0; i < MAX_LIT_INFO; i++) {
-         machine->lit_info[i].func = populate_lut;
-         machine->now++;
-      }
-   }
-}
-
-
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
-                                   const struct pipe_viewport_state *viewport )
-{
-   memcpy(machine->scale, viewport->scale, 4 * sizeof(float));
-   memcpy(machine->translate, viewport->translate, 4 * sizeof(float));
-}
-
-
-
-void draw_vs_aos_machine_destroy( struct aos_machine *machine )
-{
-   align_free(machine);
-}
-
-struct aos_machine *draw_vs_aos_machine( void )
-{
-   struct aos_machine *machine;
-   unsigned i;
-   float inv = 1.0f/255.0f;
-   float f255 = 255.0f;
-
-   machine = align_malloc(sizeof(struct aos_machine), 16);
-   if (!machine)
-      return NULL;
-
-   memset(machine, 0, sizeof(*machine));
-
-   ASSIGN_4V(machine->internal[IMM_SWZ],       1.0f,  -1.0f,  0.0f, 1.0f);
-   *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff;
-
-   ASSIGN_4V(machine->internal[IMM_ONES],      1.0f,  1.0f,  1.0f,  1.0f);
-   ASSIGN_4V(machine->internal[IMM_NEGS],     -1.0f, -1.0f, -1.0f, -1.0f);
-   ASSIGN_4V(machine->internal[IMM_IDENTITY],  0.0f,  0.0f,  0.0f,  1.0f);
-   ASSIGN_4V(machine->internal[IMM_INV_255],   inv,   inv,   inv,   inv);
-   ASSIGN_4V(machine->internal[IMM_255],       f255,  f255,  f255,  f255);
-   ASSIGN_4V(machine->internal[IMM_RSQ],       -.5f,  1.5f,  0.0f,  0.0f);
-
-
-   machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP |
-                               X87_CW_EXCEPTION_DENORM_OP |
-                               X87_CW_EXCEPTION_ZERO_DIVIDE |
-                               X87_CW_EXCEPTION_OVERFLOW |
-                               X87_CW_EXCEPTION_UNDERFLOW |
-                               X87_CW_EXCEPTION_PRECISION |
-                               (1<<6) |
-                               X87_CW_ROUND_NEAREST |
-                               X87_CW_PRECISION_DOUBLE_EXT);
-
-   assert(machine->fpu_rnd_nearest == 0x37f);
-                               
-   machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP |
-                               X87_CW_EXCEPTION_DENORM_OP |
-                               X87_CW_EXCEPTION_ZERO_DIVIDE |
-                               X87_CW_EXCEPTION_OVERFLOW |
-                               X87_CW_EXCEPTION_UNDERFLOW |
-                               X87_CW_EXCEPTION_PRECISION |
-                               (1<<6) |
-                               X87_CW_ROUND_DOWN |
-                               X87_CW_PRECISION_DOUBLE_EXT);
-
-   for (i = 0; i < MAX_SHINE_TAB; i++)
-      do_populate_lut( &machine->shine_tab[i], 1.0f );
-
-   return machine;
-}
-
-#else
-
-void draw_vs_aos_machine_viewport( struct aos_machine *machine,
-                                   const struct pipe_viewport_state *viewport )
-{
-}
-
-void
-draw_vs_aos_machine_constants(struct aos_machine *machine,
-                              unsigned slot,
-                              const void *constants)
-{
-}
-
-void draw_vs_aos_machine_destroy( struct aos_machine *machine )
-{
-}
-
-struct aos_machine *draw_vs_aos_machine( void )
-{
-   return NULL;
-}
-#endif
-
diff --git a/src/gallium/auxiliary/draw/draw_vs_ppc.c b/src/gallium/auxiliary/draw/draw_vs_ppc.c
index cf894bbe8af..7fb0e0953e2 100644
--- a/src/gallium/auxiliary/draw/draw_vs_ppc.c
+++ b/src/gallium/auxiliary/draw/draw_vs_ppc.c
@@ -185,12 +185,7 @@ draw_create_vs_ppc(struct draw_context *draw,
    tgsi_scan_shader(templ->tokens, &vs->base.info);
 
    vs->base.draw = draw;
-#if 0
-   if (1)
-      vs->base.create_variant = draw_vs_variant_aos_ppc;
-   else
-#endif
-      vs->base.create_variant = draw_vs_create_variant_generic;
+   vs->base.create_variant = draw_vs_create_variant_generic;
    vs->base.prepare = vs_ppc_prepare;
    vs->base.run_linear = vs_ppc_run_linear;
    vs->base.delete = vs_ppc_delete;
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
deleted file mode 100644
index d918579bda4..00000000000
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ /dev/null
@@ -1,225 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
- /*
-  * Authors:
-  *   Keith Whitwell <[email protected]>
-  *   Brian Paul
-  */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "pipe/p_config.h"
-
-#include "draw_vs.h"
-
-#if defined(PIPE_ARCH_X86)
-
-#include "pipe/p_shader_tokens.h"
-
-#include "draw_private.h"
-#include "draw_context.h"
-
-#include "rtasm/rtasm_cpu.h"
-#include "rtasm/rtasm_x86sse.h"
-#include "tgsi/tgsi_sse2.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_exec.h"
-
-#define SSE_MAX_VERTICES 4
-
-
-struct draw_sse_vertex_shader {
-   struct draw_vertex_shader base;
-   struct x86_function sse2_program;
-
-   tgsi_sse2_vs_func func;
-   
-   struct tgsi_exec_machine *machine;
-};
-
-
-static void
-vs_sse_prepare( struct draw_vertex_shader *base,
-		struct draw_context *draw )
-{
-   struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
-   struct tgsi_exec_machine *machine = shader->machine;
-
-   machine->Samplers = draw->vs.samplers;
-
-   if (base->info.uses_instanceid) {
-      unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_INSTANCEID];
-      assert(i < Elements(machine->SystemValue));
-      machine->SystemValue[i][0] = base->draw->instance_id;
-   }
-}
-
-
-
-/* Simplified vertex shader interface for the pt paths.  Given the
- * complexity of code-generating all the above operations together,
- * it's time to try doing all the other stuff separately.
- */
-static void
-vs_sse_run_linear( struct draw_vertex_shader *base,
-		   const float (*input)[4],
-		   float (*output)[4],
-                  const void *constants[PIPE_MAX_CONSTANT_BUFFERS],
-		  const unsigned const_size[PIPE_MAX_CONSTANT_BUFFERS],
-		   unsigned count,
-		   unsigned input_stride,
-		   unsigned output_stride )
-{
-   struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
-   struct tgsi_exec_machine *machine = shader->machine;
-   unsigned int i;
-
-   /* By default, execute all channels.  XXX move this inside the loop
-    * below when we support shader conditionals/loops.
-    */
-   tgsi_set_exec_mask(machine, 1, 1, 1, 1);
-
-   for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
-      unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
-
-      if (max_vertices < 4) {
-         /* disable the unused execution channels */
-         tgsi_set_exec_mask(machine,
-                            1,
-                            max_vertices > 1,
-                            max_vertices > 2,
-                            0);
-      }
-
-      /* run compiled shader
-       */
-      shader->func(machine,
-                   (const float (*)[4])constants[0],
-		   shader->base.immediates,
-                   input,
-                   base->info.num_inputs,
-                   input_stride,
-                   output,
-                   base->info.num_outputs,
-                   output_stride );
-
-      input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
-      output = (float (*)[4])((char *)output + output_stride * max_vertices);
-   }
-}
-
-
-
-
-static void
-vs_sse_delete( struct draw_vertex_shader *base )
-{
-   struct draw_sse_vertex_shader *shader = (struct draw_sse_vertex_shader *)base;
-   
-   x86_release_func( &shader->sse2_program );
-
-   align_free( (void *) shader->base.immediates );
-
-   FREE( (void*) shader->base.state.tokens );
-   FREE( shader );
-}
-
-
-struct draw_vertex_shader *
-draw_create_vs_sse(struct draw_context *draw,
-                          const struct pipe_shader_state *templ)
-{
-   struct draw_sse_vertex_shader *vs;
-
-   if (!rtasm_cpu_has_sse2())
-      return NULL;
-
-   vs = CALLOC_STRUCT( draw_sse_vertex_shader );
-   if (vs == NULL) 
-      return NULL;
-
-   /* we make a private copy of the tokens */
-   vs->base.state.tokens = tgsi_dup_tokens(templ->tokens);
-   if (!vs->base.state.tokens)
-      goto fail;
-
-   tgsi_scan_shader(templ->tokens, &vs->base.info);
-
-   vs->base.draw = draw;
-   if (1)
-      vs->base.create_variant = draw_vs_create_variant_aos_sse;
-   else
-      vs->base.create_variant = draw_vs_create_variant_generic;
-   vs->base.prepare = vs_sse_prepare;
-   vs->base.run_linear = vs_sse_run_linear;
-   vs->base.delete = vs_sse_delete;
-   
-   vs->base.immediates = align_malloc(TGSI_EXEC_NUM_IMMEDIATES * 4 *
-                                      sizeof(float), 16);
-
-   vs->machine = draw->vs.machine;
-   
-   x86_init_func( &vs->sse2_program );
-
-   if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
-			&vs->sse2_program, 
-                        (float (*)[4])vs->base.immediates, 
-                        TRUE )) 
-      goto fail;
-      
-   vs->func = (tgsi_sse2_vs_func) x86_get_func( &vs->sse2_program );
-   if (!vs->func) {
-      goto fail;
-   }
-   
-   return &vs->base;
-
-fail:
-   if (0)
-      debug_warning("tgsi_emit_sse2() failed, falling back to interpreter\n");
-
-   x86_release_func( &vs->sse2_program );
-   
-   FREE(vs);
-   return NULL;
-}
-
-
-
-#else
-
-struct draw_vertex_shader *
-draw_create_vs_sse( struct draw_context *draw,
-		    const struct pipe_shader_state *templ )
-{
-   return (void *) 0;
-}
-
-
-#endif
-
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.c b/src/gallium/auxiliary/tgsi/tgsi_sse2.c
deleted file mode 100644
index 5614caf63e7..00000000000
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.c
+++ /dev/null
@@ -1,3106 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007-2008 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * Copyright 2009-2010 VMware, Inc.  All rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#include "pipe/p_config.h"
-
-#include "tgsi/tgsi_sse2.h"
-
-#if defined(PIPE_ARCH_X86) && 0 /* See FIXME notes below */
-
-#include "util/u_debug.h"
-#include "pipe/p_shader_tokens.h"
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#if defined(PIPE_ARCH_SSE)
-#include "util/u_sse.h"
-#endif
-#include "tgsi/tgsi_info.h"
-#include "tgsi/tgsi_parse.h"
-#include "tgsi/tgsi_util.h"
-#include "tgsi/tgsi_dump.h"
-#include "tgsi/tgsi_exec.h"
-
-#include "rtasm/rtasm_x86sse.h"
-
-/* for 1/sqrt()
- *
- * This costs about 100fps (close to 10%) in gears:
- */
-#define HIGH_PRECISION 1
-
-#define FAST_MATH 1
-
-
-#define FOR_EACH_CHANNEL( CHAN )\
-   for (CHAN = 0; CHAN < NUM_CHANNELS; CHAN++)
-
-#define IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
-   ((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
-
-#define IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )\
-   if (IS_DST0_CHANNEL_ENABLED( INST, CHAN ))
-
-#define FOR_EACH_DST0_ENABLED_CHANNEL( INST, CHAN )\
-   FOR_EACH_CHANNEL( CHAN )\
-      IF_IS_DST0_CHANNEL_ENABLED( INST, CHAN )
-
-#define CHAN_X 0
-#define CHAN_Y 1
-#define CHAN_Z 2
-#define CHAN_W 3
-
-#define TEMP_ONE_I   TGSI_EXEC_TEMP_ONE_I
-#define TEMP_ONE_C   TGSI_EXEC_TEMP_ONE_C
-
-#define TEMP_R0   TGSI_EXEC_TEMP_R0
-#define TEMP_ADDR TGSI_EXEC_TEMP_ADDR
-#define TEMP_EXEC_MASK_I TGSI_EXEC_MASK_I
-#define TEMP_EXEC_MASK_C TGSI_EXEC_MASK_C
-
-
-/**
- * X86 utility functions.
- */
-
-static struct x86_reg
-make_xmm(
-   unsigned xmm )
-{
-   return x86_make_reg(
-      file_XMM,
-      (enum x86_reg_name) xmm );
-}
-
-/**
- * X86 register mapping helpers.
- */
-
-static struct x86_reg
-get_const_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_AX );
-}
-
-static struct x86_reg
-get_machine_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_CX );
-}
-
-static struct x86_reg
-get_input_base( void )
-{
-   /* FIXME: tgsi_exec_machine::Inputs is a pointer now! */
-   return x86_make_disp(
-      get_machine_base(),
-      Offset(struct tgsi_exec_machine, Inputs) );
-}
-
-static struct x86_reg
-get_output_base( void )
-{
-   /* FIXME: tgsi_exec_machine::Ouputs is a pointer now! */
-   return x86_make_disp(
-      get_machine_base(),
-      Offset(struct tgsi_exec_machine, Outputs) );
-}
-
-static struct x86_reg
-get_temp_base( void )
-{
-   return x86_make_disp(
-      get_machine_base(),
-      Offset(struct tgsi_exec_machine, Temps) );
-}
-
-static struct x86_reg
-get_coef_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_BX );
-}
-
-static struct x86_reg
-get_sampler_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DI );
-}
-
-static struct x86_reg
-get_immediate_base( void )
-{
-   return x86_make_reg(
-      file_REG32,
-      reg_DX );
-}
-
-static struct x86_reg
-get_system_value_base( void )
-{
-   return x86_make_disp(
-      get_machine_base(),
-      Offset(struct tgsi_exec_machine, SystemValue) );
-}
-
-
-/**
- * Data access helpers.
- */
-
-
-static struct x86_reg
-get_immediate(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_immediate_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_const(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_const_base(),
-      (vec * 4 + chan) * 4 );
-}
-
-static struct x86_reg
-get_sampler_ptr(
-   unsigned unit )
-{
-   return x86_make_disp(
-      get_sampler_base(),
-      unit * sizeof( struct tgsi_sampler * ) );
-}
-
-static struct x86_reg
-get_input(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_input_base(),
-      (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_output(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_output_base(),
-      (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_temp(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_temp_base(),
-      (vec * 4 + chan) * 16 );
-}
-
-static struct x86_reg
-get_system_value(
-   unsigned vec,
-   unsigned chan )
-{
-   return x86_make_disp(
-      get_system_value_base(), /* base */
-      (vec * 4 + chan) * 4 );  /* byte offset from base */
-}
-
-static struct x86_reg
-get_coef(
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   return x86_make_disp(
-      get_coef_base(),
-      ((vec * 3 + member) * 4 + chan) * 4 );
-}
-
-
-static void
-emit_ret(
-   struct x86_function  *func )
-{
-   x86_ret( func );
-}
-
-
-/**
- * Data fetch helpers.
- */
-
-/**
- * Copy a shader constant to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src const buffer index
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_const(
-   struct x86_function *func,
-   uint xmm,
-   int vec,
-   uint chan,
-   uint indirect,
-   uint indirectFile,
-   int indirectIndex )
-{
-   if (indirect) {
-      /* 'vec' is the offset from the address register's value.
-       * We're loading CONST[ADDR+vec] into an xmm register.
-       */
-      struct x86_reg r0 = get_immediate_base();
-      struct x86_reg r1 = get_coef_base();
-      uint i;
-
-      assert( indirectFile == TGSI_FILE_ADDRESS );
-      assert( indirectIndex == 0 );
-      assert( r0.mod == mod_REG );
-      assert( r1.mod == mod_REG );
-
-      x86_push( func, r0 );
-      x86_push( func, r1 );
-
-      /*
-       * Loop over the four pixels or vertices in the quad.
-       * Get the value of the address (offset) register for pixel/vertex[i],
-       * add it to the src offset and index into the constant buffer.
-       * Note that we're working on SOA data.
-       * If any of the pixel/vertex execution channels are unused their
-       * values will be garbage.  It's very important that we don't use
-       * those garbage values as indexes into the constant buffer since
-       * that'll cause segfaults.
-       * The solution is to bitwise-AND the offset with the execution mask
-       * register whose values are either 0 or ~0.
-       * The caller must setup the execution mask register to indicate
-       * which channels are valid/alive before running the shader.
-       * The execution mask will also figure into loops and conditionals
-       * someday.
-       */
-      for (i = 0; i < QUAD_SIZE; i++) {
-         /* r1 = address register[i] */
-         x86_mov( func, r1, x86_make_disp( get_temp( TEMP_ADDR, CHAN_X ), i * 4 ) );
-         /* r0 = execution mask[i] */
-         x86_mov( func, r0, x86_make_disp( get_temp( TEMP_EXEC_MASK_I, TEMP_EXEC_MASK_C ), i * 4 ) );
-         /* r1 = r1 & r0 */
-         x86_and( func, r1, r0 );
-         /* r0 = 'vec', the offset */
-         x86_lea( func, r0, get_const( vec, chan ) );
-
-         /* Quick hack to multiply r1 by 16 -- need to add SHL to rtasm.
-          */
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-         x86_add( func, r1, r1 );
-
-         x86_add( func, r0, r1 );  /* r0 = r0 + r1 */
-         x86_mov( func, r1, x86_deref( r0 ) );
-         x86_mov( func, x86_make_disp( get_temp( TEMP_R0, CHAN_X ), i * 4 ), r1 );
-      }
-
-      x86_pop( func, r1 );
-      x86_pop( func, r0 );
-
-      sse_movaps(
-         func,
-         make_xmm( xmm ),
-         get_temp( TEMP_R0, CHAN_X ) );
-   }
-   else {
-      /* 'vec' is the index into the src register file, such as TEMP[vec] */
-      assert( vec >= 0 );
-
-      sse_movss(
-         func,
-         make_xmm( xmm ),
-         get_const( vec, chan ) );
-      sse_shufps(
-         func,
-         make_xmm( xmm ),
-         make_xmm( xmm ),
-         SHUF( 0, 0, 0, 0 ) );
-   }
-}
-
-static void
-emit_immediate(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_immediate( vec, chan ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
-
-
-/**
- * Copy a shader input to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src input attrib
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_inputf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm ),
-      get_input( vec, chan ) );
-}
-
-/**
- * Store an xmm register to a shader output
- * \param xmm  the source xmm register
- * \param vec  the dest output attrib
- * \param chan  src dest channel to store (X, Y, Z or W)
- */
-static void
-emit_output(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_output( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-/**
- * Copy a shader temporary to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the src temp register
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_tempf(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movaps(
-      func,
-      make_xmm( xmm ),
-      get_temp( vec, chan ) );
-}
-
-/**
- * Copy a system value to xmm register
- * \param xmm  the destination xmm register
- * \param vec  the source system value register
- * \param chan  src channel to fetch (X, Y, Z or W)
- */
-static void
-emit_system_value(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_system_value( vec, chan ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
-
-/**
- * Load an xmm register with an input attrib coefficient (a0, dadx or dady)
- * \param xmm  the destination xmm register
- * \param vec  the src input/attribute coefficient index
- * \param chan  src channel to fetch (X, Y, Z or W)
- * \param member  0=a0, 1=dadx, 2=dady
- */
-static void
-emit_coef(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan,
-   unsigned member )
-{
-   sse_movss(
-      func,
-      make_xmm( xmm ),
-      get_coef( vec, chan, member ) );
-   sse_shufps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ),
-      SHUF( 0, 0, 0, 0 ) );
-}
-
-/**
- * Data store helpers.
- */
-
-static void
-emit_inputs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movups(
-      func,
-      get_input( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_temps(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   sse_movaps(
-      func,
-      get_temp( vec, chan ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_addrs(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   assert( vec == 0 );
-
-   emit_temps(
-      func,
-      xmm,
-      vec + TGSI_EXEC_TEMP_ADDR,
-      chan );
-}
-
-/**
- * Coefficent fetch helpers.
- */
-
-static void
-emit_coef_a0(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      0 );
-}
-
-static void
-emit_coef_dadx(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      1 );
-}
-
-static void
-emit_coef_dady(
-   struct x86_function *func,
-   unsigned xmm,
-   unsigned vec,
-   unsigned chan )
-{
-   emit_coef(
-      func,
-      xmm,
-      vec,
-      chan,
-      2 );
-}
-
-/**
- * Function call helpers.
- */
-
-/**
- * NOTE: In gcc, if the destination uses the SSE intrinsics, then it must be 
- * defined with __attribute__((force_align_arg_pointer)), as we do not guarantee
- * that the stack pointer is 16 byte aligned, as expected.
- */
-static void
-emit_func_call(
-   struct x86_function *func,
-   unsigned xmm_save_mask,
-   const struct x86_reg *arg,
-   unsigned nr_args,
-   void (PIPE_CDECL *code)() )
-{
-   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
-   unsigned i, n;
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   
-   /* Store XMM regs to the stack
-    */
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_save_mask & (1 << i))
-         ++n;
-   
-   x86_sub_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_save_mask & (1 << i)) {
-         sse_movups(
-            func,
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ),
-            make_xmm( i ) );
-         ++n;
-      }
-
-   for (i = 0; i < nr_args; i++) {
-      /* Load the address of the buffer we use for passing arguments and
-       * receiving results:
-       */
-      x86_lea(
-	 func,
-	 ecx,
-	 arg[i] );
-   
-      /* Push actual function arguments (currently just the pointer to
-       * the buffer above), and call the function:
-       */
-      x86_push( func, ecx );
-   }
-
-   x86_mov_reg_imm( func, ecx, (unsigned long) code );
-   x86_call( func, ecx );
-
-   /* Pop the arguments (or just add an immediate to esp)
-    */
-   for (i = 0; i < nr_args; i++) {
-      x86_pop(func, ecx );
-   }
-
-   /* Pop the saved XMM regs:
-    */
-   for(i = 0, n = 0; i < 8; ++i)
-      if(xmm_save_mask & (1 << i)) {
-         sse_movups(
-            func,
-            make_xmm( i ),
-            x86_make_disp( x86_make_reg( file_REG32, reg_SP ), n*16 ) );
-         ++n;
-      }
-   
-   x86_add_imm(
-      func, 
-      x86_make_reg( file_REG32, reg_SP ),
-      n*16);
-
-   /* Restore GP registers in a reverse order.
-    */
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_CX) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX) );
-}
-
-static void
-emit_func_call_dst_src1(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src0,
-   void (PIPE_CDECL *code)() )
-{
-   struct x86_reg store = get_temp( TEMP_R0, 0 );
-   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
-   
-   /* Store our input parameters (in xmm regs) to the buffer we use
-    * for passing arguments.  We will pass a pointer to this buffer as
-    * the actual function argument.
-    */
-   sse_movaps(
-      func,
-      store,
-      make_xmm( xmm_src0 ) );
-
-   emit_func_call( func,
-                   xmm_mask,
-                   &store,
-                   1,
-                   code );
-
-   sse_movaps(
-      func,
-      make_xmm( xmm_dst ),
-      store );
-}
-
-
-static void
-emit_func_call_dst_src2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src0,
-   unsigned xmm_src1,
-   void (PIPE_CDECL *code)() )
-{
-   struct x86_reg store = get_temp( TEMP_R0, 0 );
-   unsigned xmm_mask = ((1 << xmm_save) - 1) & ~(1 << xmm_dst);
-
-   /* Store two inputs to parameter buffer.
-    */
-   sse_movaps(
-      func,
-      store,
-      make_xmm( xmm_src0 ) );
-
-   sse_movaps(
-      func,
-      x86_make_disp( store, 4 * sizeof(float) ),
-      make_xmm( xmm_src1 ) );
-
-
-   /* Emit the call
-    */
-   emit_func_call( func,
-                   xmm_mask,
-                   &store,
-                   1,
-                   code );
-
-   /* Retrieve the results:
-    */
-   sse_movaps(
-      func,
-      make_xmm( xmm_dst ),
-      store );
-}
-
-
-
-
-
-#if defined(PIPE_ARCH_SSE)
-
-/*
- * Fast SSE2 implementation of special math functions.
- */
-
-#define POLY0(x, c0) _mm_set1_ps(c0)
-#define POLY1(x, c0, c1) _mm_add_ps(_mm_mul_ps(POLY0(x, c1), x), _mm_set1_ps(c0))
-#define POLY2(x, c0, c1, c2) _mm_add_ps(_mm_mul_ps(POLY1(x, c1, c2), x), _mm_set1_ps(c0))
-#define POLY3(x, c0, c1, c2, c3) _mm_add_ps(_mm_mul_ps(POLY2(x, c1, c2, c3), x), _mm_set1_ps(c0))
-#define POLY4(x, c0, c1, c2, c3, c4) _mm_add_ps(_mm_mul_ps(POLY3(x, c1, c2, c3, c4), x), _mm_set1_ps(c0))
-#define POLY5(x, c0, c1, c2, c3, c4, c5) _mm_add_ps(_mm_mul_ps(POLY4(x, c1, c2, c3, c4, c5), x), _mm_set1_ps(c0))
-
-#define EXP_POLY_DEGREE 3
-#define LOG_POLY_DEGREE 5
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-exp2f4(__m128 x)
-{
-   __m128i ipart;
-   __m128 fpart, expipart, expfpart;
-
-   x = _mm_min_ps(x, _mm_set1_ps( 129.00000f));
-   x = _mm_max_ps(x, _mm_set1_ps(-126.99999f));
-
-   /* ipart = int(x - 0.5) */
-   ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_set1_ps(0.5f)));
-
-   /* fpart = x - ipart */
-   fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));
-
-   /* expipart = (float) (1 << ipart) */
-   expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_set1_epi32(127)), 23));
-
-   /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
-#if EXP_POLY_DEGREE == 5
-   expfpart = POLY5(fpart, 9.9999994e-1f, 6.9315308e-1f, 2.4015361e-1f, 5.5826318e-2f, 8.9893397e-3f, 1.8775767e-3f);
-#elif EXP_POLY_DEGREE == 4
-   expfpart = POLY4(fpart, 1.0000026f, 6.9300383e-1f, 2.4144275e-1f, 5.2011464e-2f, 1.3534167e-2f);
-#elif EXP_POLY_DEGREE == 3
-   expfpart = POLY3(fpart, 9.9992520e-1f, 6.9583356e-1f, 2.2606716e-1f, 7.8024521e-2f);
-#elif EXP_POLY_DEGREE == 2
-   expfpart = POLY2(fpart, 1.0017247f, 6.5763628e-1f, 3.3718944e-1f);
-#else
-#error
-#endif
-
-   return _mm_mul_ps(expipart, expfpart);
-}
-
-
-/**
- * See http://www.devmaster.net/forums/showthread.php?p=43580
- */
-static INLINE __m128 
-log2f4(__m128 x)
-{
-   __m128i expmask = _mm_set1_epi32(0x7f800000);
-   __m128i mantmask = _mm_set1_epi32(0x007fffff);
-   __m128 one = _mm_set1_ps(1.0f);
-
-   __m128i i = _mm_castps_si128(x);
-
-   /* exp = (float) exponent(x) */
-   __m128 exp = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, expmask), 23), _mm_set1_epi32(127)));
-
-   /* mant = (float) mantissa(x) */
-   __m128 mant = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mantmask)), one);
-
-   __m128 logmant;
-
-   /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ 
-    * These coefficients can be generate with 
-    * http://www.boost.org/doc/libs/1_36_0/libs/math/doc/sf_and_dist/html/math_toolkit/toolkit/internals2/minimax.html
-    */
-#if LOG_POLY_DEGREE == 6
-   logmant = POLY5(mant, 3.11578814719469302614f, -3.32419399085241980044f, 2.59883907202499966007f, -1.23152682416275988241f, 0.318212422185251071475f, -0.0344359067839062357313f);
-#elif LOG_POLY_DEGREE == 5
-   logmant = POLY4(mant, 2.8882704548164776201f, -2.52074962577807006663f, 1.48116647521213171641f, -0.465725644288844778798f, 0.0596515482674574969533f);
-#elif LOG_POLY_DEGREE == 4
-   logmant = POLY3(mant, 2.61761038894603480148f, -1.75647175389045657003f, 0.688243882994381274313f, -0.107254423828329604454f);
-#elif LOG_POLY_DEGREE == 3
-   logmant = POLY2(mant, 2.28330284476918490682f, -1.04913055217340124191f, 0.204446009836232697516f);
-#else
-#error
-#endif
-
-   /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0*/
-   logmant = _mm_mul_ps(logmant, _mm_sub_ps(mant, one));
-
-   return _mm_add_ps(logmant, exp);
-}
-
-
-static INLINE __m128
-powf4(__m128 x, __m128 y)
-{
-   return exp2f4(_mm_mul_ps(log2f4(x), y));
-}
-
-#endif /* PIPE_ARCH_SSE */
-
-
-
-/**
- * Low-level instruction translators.
- */
-
-static void
-emit_abs(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_andps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_7FFFFFFF_I,
-         TGSI_EXEC_TEMP_7FFFFFFF_C ) );
-}
-
-static void
-emit_add(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_addps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-cos4f(
-   float *store )
-{
-   store[0] = cosf( store[0] );
-   store[1] = cosf( store[1] );
-   store[2] = cosf( store[2] );
-   store[3] = cosf( store[3] );
-}
-
-static void
-emit_cos(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save, 
-      xmm_dst,
-      xmm_dst,
-      cos4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
-__attribute__((force_align_arg_pointer))
-#endif
-ex24f(
-   float *store )
-{
-#if defined(PIPE_ARCH_SSE)
-   _mm_store_ps(&store[0], exp2f4( _mm_load_ps(&store[0]) ));
-#else
-   store[0] = util_fast_exp2( store[0] );
-   store[1] = util_fast_exp2( store[1] );
-   store[2] = util_fast_exp2( store[2] );
-   store[3] = util_fast_exp2( store[3] );
-#endif
-}
-
-static void
-emit_ex2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_dst,
-      ex24f );
-}
-
-static void
-emit_f2it(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse2_cvttps2dq(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ) );
-}
-
-static void
-emit_i2f(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse2_cvtdq2ps(
-      func,
-      make_xmm( xmm ),
-      make_xmm( xmm ) );
-}
-
-static void PIPE_CDECL
-flr4f(
-   float *store )
-{
-   store[0] = floorf( store[0] );
-   store[1] = floorf( store[1] );
-   store[2] = floorf( store[2] );
-   store[3] = floorf( store[3] );
-}
-
-static void
-emit_flr(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_dst,
-      flr4f );
-}
-
-static void PIPE_CDECL
-frc4f(
-   float *store )
-{
-   store[0] -= floorf( store[0] );
-   store[1] -= floorf( store[1] );
-   store[2] -= floorf( store[2] );
-   store[3] -= floorf( store[3] );
-}
-
-static void
-emit_frc(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_dst,
-      frc4f );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
-__attribute__((force_align_arg_pointer))
-#endif
-lg24f(
-   float *store )
-{
-#if defined(PIPE_ARCH_SSE)
-   _mm_store_ps(&store[0], log2f4( _mm_load_ps(&store[0]) ));
-#else
-   store[0] = util_fast_log2( store[0] );
-   store[1] = util_fast_log2( store[1] );
-   store[2] = util_fast_log2( store[2] );
-   store[3] = util_fast_log2( store[3] );
-#endif
-}
-
-static void
-emit_lg2(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_dst,
-      lg24f );
-}
-
-static void
-emit_MOV(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_movups(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_mul (struct x86_function *func,
-          unsigned xmm_dst,
-          unsigned xmm_src)
-{
-   sse_mulps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void
-emit_neg(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_xorps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-#if defined(PIPE_CC_GCC) && defined(PIPE_ARCH_SSE)
-__attribute__((force_align_arg_pointer))
-#endif
-pow4f(
-   float *store )
-{
-#if defined(PIPE_ARCH_SSE)
-   _mm_store_ps(&store[0], powf4( _mm_load_ps(&store[0]), _mm_load_ps(&store[4]) ));
-#else
-   store[0] = util_fast_pow( store[0], store[4] );
-   store[1] = util_fast_pow( store[1], store[5] );
-   store[2] = util_fast_pow( store[2], store[6] );
-   store[3] = util_fast_pow( store[3], store[7] );
-#endif
-}
-
-static void
-emit_pow(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst,
-   unsigned xmm_src0,
-   unsigned xmm_src1 )
-{
-   emit_func_call_dst_src2(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_src0,
-      xmm_src1,
-      pow4f );
-}
-
-static void
-emit_rcp (
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.  Need to either emit a proper divide or use the
-    * iterative technique described below in emit_rsqrt().
-    */
-   sse2_rcpps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-static void PIPE_CDECL
-rnd4f(
-   float *store )
-{
-   store[0] = floorf( store[0] + 0.5f );
-   store[1] = floorf( store[1] + 0.5f );
-   store[2] = floorf( store[2] + 0.5f );
-   store[3] = floorf( store[3] + 0.5f );
-}
-
-static void
-emit_rnd(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_dst,
-      rnd4f );
-}
-
-static void
-emit_rsqrt(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-#if HIGH_PRECISION
-   /* Although rsqrtps() and rcpps() are low precision on some/all SSE
-    * implementations, it is possible to improve its precision at
-    * fairly low cost, using a newton/raphson step, as below:
-    * 
-    * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
-    * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
-    *
-    * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
-    */
-   {
-      struct x86_reg dst = make_xmm( xmm_dst );
-      struct x86_reg src = make_xmm( xmm_src );
-      struct x86_reg tmp0 = make_xmm( 2 );
-      struct x86_reg tmp1 = make_xmm( 3 );
-
-      assert( xmm_dst != xmm_src );
-      assert( xmm_dst != 2 && xmm_dst != 3 );
-      assert( xmm_src != 2 && xmm_src != 3 );
-
-      sse_movaps(  func, dst,  get_temp( TGSI_EXEC_TEMP_HALF_I, TGSI_EXEC_TEMP_HALF_C ) );
-      sse_movaps(  func, tmp0, get_temp( TGSI_EXEC_TEMP_THREE_I, TGSI_EXEC_TEMP_THREE_C ) );
-      sse_rsqrtps( func, tmp1, src  );
-      sse_mulps(   func, src,  tmp1 );
-      sse_mulps(   func, dst,  tmp1 );
-      sse_mulps(   func, src,  tmp1 );
-      sse_subps(   func, tmp0, src  );
-      sse_mulps(   func, dst,  tmp0 );
-   }
-#else
-   /* On Intel CPUs at least, this is only accurate to 12 bits -- not
-    * good enough.
-    */
-   sse_rsqrtps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-#endif
-}
-
-static void
-emit_setsign(
-   struct x86_function *func,
-   unsigned xmm )
-{
-   sse_orps(
-      func,
-      make_xmm( xmm ),
-      get_temp(
-         TGSI_EXEC_TEMP_80000000_I,
-         TGSI_EXEC_TEMP_80000000_C ) );
-}
-
-static void PIPE_CDECL
-sgn4f(
-   float *store )
-{
-   store[0] = store[0] < 0.0f ? -1.0f : store[0] > 0.0f ? 1.0f : 0.0f;
-   store[1] = store[1] < 0.0f ? -1.0f : store[1] > 0.0f ? 1.0f : 0.0f;
-   store[2] = store[2] < 0.0f ? -1.0f : store[2] > 0.0f ? 1.0f : 0.0f;
-   store[3] = store[3] < 0.0f ? -1.0f : store[3] > 0.0f ? 1.0f : 0.0f;
-}
-
-static void
-emit_sgn(
-   struct x86_function *func,
-   unsigned xmm_save, 
-   unsigned xmm_dst )
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_dst,
-      sgn4f );
-}
-
-static void PIPE_CDECL
-sin4f(
-   float *store )
-{
-   store[0] = sinf( store[0] );
-   store[1] = sinf( store[1] );
-   store[2] = sinf( store[2] );
-   store[3] = sinf( store[3] );
-}
-
-static void
-emit_sin (struct x86_function *func,
-          unsigned xmm_save, 
-          unsigned xmm_dst)
-{
-   emit_func_call_dst_src1(
-      func,
-      xmm_save,
-      xmm_dst,
-      xmm_dst,
-      sin4f );
-}
-
-static void
-emit_sub(
-   struct x86_function *func,
-   unsigned xmm_dst,
-   unsigned xmm_src )
-{
-   sse_subps(
-      func,
-      make_xmm( xmm_dst ),
-      make_xmm( xmm_src ) );
-}
-
-/**
- * Register fetch.
- */
-static void
-emit_fetch(
-   struct x86_function *func,
-   unsigned xmm,
-   const struct tgsi_full_src_register *reg,
-   const unsigned chan_index )
-{
-   unsigned swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
-
-   switch (swizzle) {
-   case TGSI_SWIZZLE_X:
-   case TGSI_SWIZZLE_Y:
-   case TGSI_SWIZZLE_Z:
-   case TGSI_SWIZZLE_W:
-      switch (reg->Register.File) {
-      case TGSI_FILE_CONSTANT:
-         emit_const(
-            func,
-            xmm,
-            reg->Register.Index,
-            swizzle,
-            reg->Register.Indirect,
-            reg->Indirect.File,
-            reg->Indirect.Index );
-         break;
-
-      case TGSI_FILE_IMMEDIATE:
-         emit_immediate(
-            func,
-            xmm,
-            reg->Register.Index,
-            swizzle );
-         break;
-
-      case TGSI_FILE_SYSTEM_VALUE:
-         emit_system_value(
-            func,
-            xmm,
-            reg->Register.Index,
-            swizzle );
-         break;
-
-      case TGSI_FILE_INPUT:
-         emit_inputf(
-            func,
-            xmm,
-            reg->Register.Index,
-            swizzle );
-         break;
-
-      case TGSI_FILE_TEMPORARY:
-         emit_tempf(
-            func,
-            xmm,
-            reg->Register.Index,
-            swizzle );
-         break;
-
-      default:
-         assert( 0 );
-      }
-      break;
-
-   default:
-      assert( 0 );
-   }
-
-   switch( tgsi_util_get_full_src_register_sign_mode( reg, chan_index ) ) {
-   case TGSI_UTIL_SIGN_CLEAR:
-      emit_abs( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_SET:
-      emit_setsign( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_TOGGLE:
-      emit_neg( func, xmm );
-      break;
-
-   case TGSI_UTIL_SIGN_KEEP:
-      break;
-   }
-}
-
-#define FETCH( FUNC, INST, XMM, INDEX, CHAN )\
-   emit_fetch( FUNC, XMM, &(INST).Src[INDEX], CHAN )
-
-/**
- * Register store.
- */
-static void
-emit_store(
-   struct x86_function *func,
-   unsigned xmm,
-   const struct tgsi_full_dst_register *reg,
-   const struct tgsi_full_instruction *inst,
-   unsigned chan_index )
-{
-   switch( inst->Instruction.Saturate ) {
-   case TGSI_SAT_NONE:
-      break;
-
-   case TGSI_SAT_ZERO_ONE:
-      sse_maxps(
-         func,
-         make_xmm( xmm ),
-         get_temp(
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C ) );
-
-      sse_minps(
-         func,
-         make_xmm( xmm ),
-         get_temp(
-            TGSI_EXEC_TEMP_ONE_I,
-            TGSI_EXEC_TEMP_ONE_C ) );
-      break;
-
-   case TGSI_SAT_MINUS_PLUS_ONE:
-      assert( 0 );
-      break;
-   }
-
-
-   switch( reg->Register.File ) {
-   case TGSI_FILE_OUTPUT:
-      emit_output(
-         func,
-         xmm,
-         reg->Register.Index,
-         chan_index );
-      break;
-
-   case TGSI_FILE_TEMPORARY:
-      emit_temps(
-         func,
-         xmm,
-         reg->Register.Index,
-         chan_index );
-      break;
-
-   case TGSI_FILE_ADDRESS:
-      emit_addrs(
-         func,
-         xmm,
-         reg->Register.Index,
-         chan_index );
-      break;
-
-   default:
-      assert( 0 );
-   }
-}
-
-#define STORE( FUNC, INST, XMM, INDEX, CHAN )\
-   emit_store( FUNC, XMM, &(INST).Dst[INDEX], &(INST), CHAN )
-
-
-static void PIPE_CDECL
-fetch_texel( struct tgsi_sampler **sampler,
-             float *store )
-{
-#if 0
-   uint j;
-
-   debug_printf("%s sampler: %p (%p) store: %p\n", 
-                __FUNCTION__,
-                sampler, *sampler,
-                store );
-
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d texcoord %f %f %f lodbias %f\n",
-                   j, 
-                   store[0+j],
-                   store[4+j],
-                   store[8 + j],
-                   store[12 + j]);
-#endif
-
-   {
-      float rgba[NUM_CHANNELS][QUAD_SIZE];
-      (*sampler)->get_samples(*sampler, 
-                              &store[0],  /* s */
-                              &store[4],  /* t */
-                              &store[8],  /* r */
-                              &store[12], /* lodbias */
-                              tgsi_sampler_lod_bias,
-                              rgba);      /* results */
-
-      memcpy( store, rgba, 16 * sizeof(float));
-   }
-
-#if 0
-   for (j = 0; j < 4; j++)
-      debug_printf("sample %d result %f %f %f %f\n", 
-                   j, 
-                   store[0+j],
-                   store[4+j],
-                   store[8+j],
-                   store[12+j]);
-#endif
-}
-
-/**
- * High-level instruction translators.
- */
-static void
-emit_tex( struct x86_function *func,
-          const struct tgsi_full_instruction *inst,
-          boolean lodbias,
-          boolean projected)
-{
-   const uint unit = inst->Src[1].Register.Index;
-   struct x86_reg args[2];
-   unsigned count;
-   unsigned i;
-
-   assert(inst->Instruction.Texture);
-   switch (inst->Texture.Texture) {
-   case TGSI_TEXTURE_1D:
-      count = 1;
-      break;
-   case TGSI_TEXTURE_2D:
-   case TGSI_TEXTURE_RECT:
-   case TGSI_TEXTURE_1D_ARRAY:
-      count = 2;
-      break;
-   case TGSI_TEXTURE_SHADOW1D:
-   case TGSI_TEXTURE_SHADOW2D:
-   case TGSI_TEXTURE_SHADOWRECT:
-   case TGSI_TEXTURE_3D:
-   case TGSI_TEXTURE_CUBE:
-   case TGSI_TEXTURE_2D_ARRAY:
-   case TGSI_TEXTURE_SHADOW1D_ARRAY:
-      count = 3;
-      break;
-   case TGSI_TEXTURE_SHADOW2D_ARRAY:
-      count = 4;
-      break;
-   default:
-      assert(0);
-      return;
-   }
-
-   if (lodbias) {
-      FETCH( func, *inst, 3, 0, 3 );
-   }
-   else {
-      emit_tempf(
-         func,
-         3,
-         TGSI_EXEC_TEMP_00000000_I,
-         TGSI_EXEC_TEMP_00000000_C );
-
-   }
-
-   /* store lodbias whether enabled or not -- fetch_texel currently
-    * respects it always.
-    */
-   sse_movaps( func,
-               get_temp( TEMP_R0, 3 ),
-               make_xmm( 3 ) );
-
-   if (projected) {
-      FETCH( func, *inst, 3, 0, 3 );
-
-      emit_rcp( func, 3, 3 );
-   }
-
-   for (i = 0; i < count; i++) {
-      FETCH( func, *inst, i, 0, i );
-
-      if (projected) {
-         sse_mulps(
-            func,
-            make_xmm( i ),
-            make_xmm( 3 ) );
-      }
-      
-      /* Store in the argument buffer:
-       */
-      sse_movaps(
-         func,
-         get_temp( TEMP_R0, i ),
-         make_xmm( i ) );
-   }
-
-   args[0] = get_temp( TEMP_R0, 0 );
-   args[1] = get_sampler_ptr( unit );
-
-   emit_func_call( func,
-                   0,
-                   args,
-                   Elements(args),
-                   fetch_texel );
-
-   /* If all four channels are enabled, could use a pointer to
-    * dst[0].x instead of TEMP_R0 for store?
-    */
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, i ) {
-
-      sse_movaps(
-         func,
-         make_xmm( 0 ),
-         get_temp( TEMP_R0, i ) );
-
-      STORE( func, *inst, 0, 0, i );
-   }
-}
-
-
-static void
-emit_kil(
-   struct x86_function *func,
-   const struct tgsi_full_src_register *reg )
-{
-   unsigned uniquemask;
-   unsigned unique_count = 0;
-   unsigned chan_index;
-   unsigned i;
-
-   /* This mask stores component bits that were already tested. Note that
-    * we test if the value is less than zero, so 1.0 and 0.0 need not to be
-    * tested.
-    */
-   uniquemask = 0;
-
-   FOR_EACH_CHANNEL( chan_index ) {
-      unsigned swizzle;
-
-      /* unswizzle channel */
-      swizzle = tgsi_util_get_full_src_register_swizzle(
-         reg,
-         chan_index );
-
-      /* check if the component has not been already tested */
-      if( !(uniquemask & (1 << swizzle)) ) {
-         uniquemask |= 1 << swizzle;
-
-         /* allocate register */
-         emit_fetch(
-            func,
-            unique_count++,
-            reg,
-            chan_index );
-      }
-   }
-
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-   x86_push(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-
-   for (i = 0 ; i < unique_count; i++ ) {
-      struct x86_reg dataXMM = make_xmm(i);
-
-      sse_cmpps(
-         func,
-         dataXMM,
-         get_temp(
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-      
-      if( i == 0 ) {
-         sse_movmskps(
-            func,
-            x86_make_reg( file_REG32, reg_AX ),
-            dataXMM );
-      }
-      else {
-         sse_movmskps(
-            func,
-            x86_make_reg( file_REG32, reg_DX ),
-            dataXMM );
-         x86_or(
-            func,
-            x86_make_reg( file_REG32, reg_AX ),
-            x86_make_reg( file_REG32, reg_DX ) );
-      }
-   }
-
-   x86_or(
-      func,
-      get_temp(
-         TGSI_EXEC_TEMP_KILMASK_I,
-         TGSI_EXEC_TEMP_KILMASK_C ),
-      x86_make_reg( file_REG32, reg_AX ) );
-
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_DX ) );
-   x86_pop(
-      func,
-      x86_make_reg( file_REG32, reg_AX ) );
-}
-
-
-static void
-emit_kilp(
-   struct x86_function *func )
-{
-   /* XXX todo / fix me */
-}
-
-
-static void
-emit_setcc(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst,
-   enum sse_cc cc )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ),
-         cc );
-      sse_andps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TEMP_ONE_I,
-            TEMP_ONE_C ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-
-static void
-emit_cmp(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-      FETCH( func, *inst, 0, 0, chan_index );
-      FETCH( func, *inst, 1, 1, chan_index );
-      FETCH( func, *inst, 2, 2, chan_index );
-      sse_cmpps(
-         func,
-         make_xmm( 0 ),
-         get_temp(
-            TGSI_EXEC_TEMP_00000000_I,
-            TGSI_EXEC_TEMP_00000000_C ),
-         cc_LessThan );
-      sse_andps(
-         func,
-         make_xmm( 1 ),
-         make_xmm( 0 ) );
-      sse_andnps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 2 ) );
-      sse_orps(
-         func,
-         make_xmm( 0 ),
-         make_xmm( 1 ) );
-      STORE( func, *inst, 0, 0, chan_index );
-   }
-}
-
-
-/**
- * Check if inst src/dest regs use indirect addressing into temporary,
- * input or output register files.
- */
-static boolean
-indirect_reg_reference(const struct tgsi_full_instruction *inst)
-{
-   uint i;
-   for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
-      const struct tgsi_full_src_register *reg = &inst->Src[i];
-      if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
-           reg->Register.File == TGSI_FILE_INPUT ||
-           reg->Register.File == TGSI_FILE_OUTPUT) &&
-          reg->Register.Indirect)
-         return TRUE;
-   }
-   for (i = 0; i < inst->Instruction.NumDstRegs; i++) {
-      const struct tgsi_full_dst_register *reg = &inst->Dst[i];
-      if ((reg->Register.File == TGSI_FILE_TEMPORARY ||
-           reg->Register.File == TGSI_FILE_INPUT ||
-           reg->Register.File == TGSI_FILE_OUTPUT) &&
-          reg->Register.Indirect)
-         return TRUE;
-   }
-   return FALSE;
-}
-
-
-static int
-emit_instruction(
-   struct x86_function *func,
-   struct tgsi_full_instruction *inst )
-{
-   unsigned chan_index;
-
-   /* we can't handle indirect addressing into temp register file yet */
-   if (indirect_reg_reference(inst))
-      return FALSE;
-
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ARL:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr(func, 0, 0);
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MOV:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 4 + chan_index, 0, chan_index );
-      }
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 4 + chan_index, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LIT:
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C);
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
-            STORE( func, *inst, 0, 0, CHAN_X );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
-            STORE( func, *inst, 0, 0, CHAN_W );
-         }
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_maxps(
-               func,
-               make_xmm( 0 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            STORE( func, *inst, 0, 0, CHAN_Y );
-         }
-         if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-            /* XMM[1] = SrcReg[0].yyyy */
-            FETCH( func, *inst, 1, 0, CHAN_Y );
-            /* XMM[1] = max(XMM[1], 0) */
-            sse_maxps(
-               func,
-               make_xmm( 1 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_00000000_I,
-                  TGSI_EXEC_TEMP_00000000_C ) );
-            /* XMM[2] = SrcReg[0].wwww */
-            FETCH( func, *inst, 2, 0, CHAN_W );
-            /* XMM[2] = min(XMM[2], 128.0) */
-            sse_minps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_128_I,
-                  TGSI_EXEC_TEMP_128_C ) );
-            /* XMM[2] = max(XMM[2], -128.0) */
-            sse_maxps(
-               func,
-               make_xmm( 2 ),
-               get_temp(
-                  TGSI_EXEC_TEMP_MINUS_128_I,
-                  TGSI_EXEC_TEMP_MINUS_128_C ) );
-            emit_pow( func, 3, 1, 1, 2 );
-            FETCH( func, *inst, 0, 0, CHAN_X );
-            sse_xorps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 2 ) );
-            sse_cmpps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 0 ),
-               cc_LessThan );
-            sse_andps(
-               func,
-               make_xmm( 2 ),
-               make_xmm( 1 ) );
-            STORE( func, *inst, 2, 0, CHAN_Z );
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_RCP:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_rcp( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RSQ:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_abs( func, 0 );
-      emit_rsqrt( func, 1, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 1, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EXP:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_MOV( func, 1, 0 );
-            emit_flr( func, 2, 1 );
-            /* dst.x = ex2(floor(src.x)) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               emit_MOV( func, 2, 1 );
-               emit_ex2( func, 3, 2 );
-               STORE( func, *inst, 2, 0, CHAN_X );
-            }
-            /* dst.y = src.x - floor(src.x) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_MOV( func, 2, 0 );
-               emit_sub( func, 2, 1 );
-               STORE( func, *inst, 2, 0, CHAN_Y );
-            }
-         }
-         /* dst.z = ex2(src.x) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            emit_ex2( func, 3, 0 );
-            STORE( func, *inst, 0, 0, CHAN_Z );
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_LOG:
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_abs( func, 0 );
-         emit_MOV( func, 1, 0 );
-         emit_lg2( func, 2, 1 );
-         /* dst.z = lg2(abs(src.x)) */
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z )) {
-            STORE( func, *inst, 1, 0, CHAN_Z );
-         }
-         if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-             IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-            emit_flr( func, 2, 1 );
-            /* dst.x = floor(lg2(abs(src.x))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X )) {
-               STORE( func, *inst, 1, 0, CHAN_X );
-            }
-            /* dst.y = abs(src.x)/ex2(floor(lg2(abs(src.x)))) */
-            if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y )) {
-               emit_ex2( func, 2, 1 );
-               emit_rcp( func, 1, 1 );
-               emit_mul( func, 0, 1 );
-               STORE( func, *inst, 0, 0, CHAN_Y );
-            }
-         }
-      }
-      /* dst.w = 1.0 */
-      if (IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W )) {
-         emit_tempf( func, 0, TEMP_ONE_I, TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MUL:
-      /* do all fetches and multiplies, storing results in temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
-         FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
-         emit_mul( func, r, 0 );   /* xmm[r] = xmm[r] * xmm[0] */
-      }
-      /* do all stores of the temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
-      }
-      break;
-
-   case TGSI_OPCODE_ADD:
-      /* do all fetches and adds, storing results in temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         FETCH( func, *inst, 0, 0, chan_index ); /* load xmm[0] */
-         FETCH( func, *inst, r, 1, chan_index ); /* load xmm[r] */
-         emit_add( func, r, 0 );   /* xmm[r] = xmm[r] + xmm[0] */
-      }
-      /* do all stores of the temp regs */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         int r = chan_index + 1;
-         STORE( func, *inst, r, 0, chan_index ); /* store xmm[r] */
-      }
-      break;
-
-   case TGSI_OPCODE_DP3:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DP4:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul(func, 1, 2 );
-      emit_add(func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_W );
-      FETCH( func, *inst, 2, 1, CHAN_W );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DST:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_tempf(
-            func,
-            0,
-            TEMP_ONE_I,
-            TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y );
-         FETCH( func, *inst, 1, 1, CHAN_Y );
-         emit_mul( func, 0, 1 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         FETCH( func, *inst, 0, 0, CHAN_Z );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-         FETCH( func, *inst, 0, 1, CHAN_W );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_MIN:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_minps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_MAX:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         sse_maxps(
-            func,
-            make_xmm( 0 ),
-            make_xmm( 1 ) );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLT:
-      emit_setcc( func, inst, cc_LessThan );
-      break;
-
-   case TGSI_OPCODE_SGE:
-      emit_setcc( func, inst, cc_NotLessThan );
-      break;
-
-   case TGSI_OPCODE_MAD:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SUB:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         emit_sub( func, 0, 1 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LRP:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         FETCH( func, *inst, 1, 1, chan_index );
-         FETCH( func, *inst, 2, 2, chan_index );
-         emit_sub( func, 1, 2 );
-         emit_mul( func, 0, 1 );
-         emit_add( func, 0, 2 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DP2A:
-      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
-      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
-      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
-      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
-      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
-      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
-      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
-      FETCH( func, *inst, 1, 2, CHAN_X );  /* xmm1 = src[2].x */
-      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
-      }
-      break;
-
-   case TGSI_OPCODE_FRC:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_frc( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CLAMP:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_FLR:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_flr( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_ROUND:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_rnd( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_EX2:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_ex2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_LG2:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_lg2( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_POW:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_pow( func, 0, 0, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_XPD:
-      /* Note: we do all stores after all operands have been fetched
-       * to avoid src/dst register aliasing issues for an instruction
-       * such as:  XPD TEMP[2].xyz, TEMP[0], TEMP[2];
-       */
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
-         FETCH( func, *inst, 1, 1, CHAN_Z ); /* xmm[1] = src[1].z */
-         FETCH( func, *inst, 3, 0, CHAN_Z ); /* xmm[3] = src[0].z */
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 0, 0, CHAN_Y ); /* xmm[0] = src[0].y */
-         FETCH( func, *inst, 4, 1, CHAN_Y ); /* xmm[4] = src[1].y */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         emit_MOV( func, 7, 0 );  /* xmm[7] = xmm[0] */
-         emit_mul( func, 7, 1 );  /* xmm[7] = xmm[2] * xmm[1] */
-         emit_MOV( func, 5, 3 );  /* xmm[5] = xmm[3] */
-         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
-         emit_sub( func, 7, 5 );  /* xmm[7] = xmm[2] - xmm[5] */
-         /* store xmm[7] in dst.x below */
-      }
-      if( IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) ||
-          IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
-         FETCH( func, *inst, 2, 1, CHAN_X ); /* xmm[2] = src[1].x */
-         FETCH( func, *inst, 5, 0, CHAN_X ); /* xmm[5] = src[0].x */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         emit_mul( func, 3, 2 );  /* xmm[3] = xmm[3] * xmm[2] */
-         emit_mul( func, 1, 5 );  /* xmm[1] = xmm[1] * xmm[5] */
-         emit_sub( func, 3, 1 );  /* xmm[3] = xmm[3] - xmm[1] */
-         /* store xmm[3] in dst.y below */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-         emit_mul( func, 5, 4 );  /* xmm[5] = xmm[5] * xmm[4] */
-         emit_mul( func, 0, 2 );  /* xmm[0] = xmm[0] * xmm[2] */
-         emit_sub( func, 5, 0 );  /* xmm[5] = xmm[5] - xmm[0] */
-         STORE( func, *inst, 5, 0, CHAN_Z ); /* dst.z = xmm[5] */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         STORE( func, *inst, 7, 0, CHAN_X ); /* dst.x = xmm[7] */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         STORE( func, *inst, 3, 0, CHAN_Y ); /* dst.y = xmm[3] */
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_ABS:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_abs( func, 0) ;
-
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_RCC:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DPH:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      FETCH( func, *inst, 1, 1, CHAN_X );
-      emit_mul( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Y );
-      FETCH( func, *inst, 2, 1, CHAN_Y );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 0, CHAN_Z );
-      FETCH( func, *inst, 2, 1, CHAN_Z );
-      emit_mul( func, 1, 2 );
-      emit_add( func, 0, 1 );
-      FETCH( func, *inst, 1, 1, CHAN_W );
-      emit_add( func, 0, 1 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_COS:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_cos( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_DDX:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DDY:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_KILP:
-      /* predicated kill */
-      emit_kilp( func );
-      return 0; /* XXX fix me */
-      break;
-
-   case TGSI_OPCODE_KIL:
-      /* conditional kill */
-      emit_kil( func, &inst->Src[0] );
-      break;
-
-   case TGSI_OPCODE_PK2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PK4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SEQ:
-      emit_setcc( func, inst, cc_Equal );
-      break;
-
-   case TGSI_OPCODE_SFL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SGT:
-      emit_setcc( func, inst, cc_NotLessThanEqual );
-      break;
-
-   case TGSI_OPCODE_SIN:
-      FETCH( func, *inst, 0, 0, CHAN_X );
-      emit_sin( func, 0, 0 );
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SLE:
-      emit_setcc( func, inst, cc_LessThanEqual );
-      break;
-
-   case TGSI_OPCODE_SNE:
-      emit_setcc( func, inst, cc_NotEqual );
-      break;
-
-   case TGSI_OPCODE_STR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TEX:
-      emit_tex( func, inst, FALSE, FALSE );
-      break;
-
-   case TGSI_OPCODE_TXD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2H:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP2US:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4B:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_UP4UB:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_X2D:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ARR:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_rnd( func, 0, 0 );
-         emit_f2it( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_BRA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CAL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_RET:
-      emit_ret( func );
-      break;
-
-   case TGSI_OPCODE_END:
-      break;
-
-   case TGSI_OPCODE_SSG:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_sgn( func, 0, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_CMP:
-      emit_cmp (func, inst);
-      break;
-
-   case TGSI_OPCODE_SCS:
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_X ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_cos( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_X );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Y ) {
-         FETCH( func, *inst, 0, 0, CHAN_X );
-         emit_sin( func, 0, 0 );
-         STORE( func, *inst, 0, 0, CHAN_Y );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_Z ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TGSI_EXEC_TEMP_00000000_I,
-	    TGSI_EXEC_TEMP_00000000_C );
-         STORE( func, *inst, 0, 0, CHAN_Z );
-      }
-      IF_IS_DST0_CHANNEL_ENABLED( *inst, CHAN_W ) {
-	 emit_tempf(
-	    func,
-	    0,
-	    TEMP_ONE_I,
-	    TEMP_ONE_C );
-         STORE( func, *inst, 0, 0, CHAN_W );
-      }
-      break;
-
-   case TGSI_OPCODE_TXB:
-      emit_tex( func, inst, TRUE, FALSE );
-      break;
-
-   case TGSI_OPCODE_NRM:
-      /* fall-through */
-   case TGSI_OPCODE_NRM4:
-      /* 3 or 4-component normalization */
-      {
-         uint dims = (inst->Instruction.Opcode == TGSI_OPCODE_NRM) ? 3 : 4;
-
-         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) ||
-             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y) ||
-             IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z) ||
-             (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 4)) {
-
-            /* NOTE: Cannot use xmm regs 2/3 here (see emit_rsqrt() above). */
-
-            /* xmm4 = src.x */
-            /* xmm0 = src.x * src.x */
-            FETCH(func, *inst, 0, 0, CHAN_X);
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
-               emit_MOV(func, 4, 0);
-            }
-            emit_mul(func, 0, 0);
-
-            /* xmm5 = src.y */
-            /* xmm0 = xmm0 + src.y * src.y */
-            FETCH(func, *inst, 1, 0, CHAN_Y);
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
-               emit_MOV(func, 5, 1);
-            }
-            emit_mul(func, 1, 1);
-            emit_add(func, 0, 1);
-
-            /* xmm6 = src.z */
-            /* xmm0 = xmm0 + src.z * src.z */
-            FETCH(func, *inst, 1, 0, CHAN_Z);
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-               emit_MOV(func, 6, 1);
-            }
-            emit_mul(func, 1, 1);
-            emit_add(func, 0, 1);
-
-            if (dims == 4) {
-               /* xmm7 = src.w */
-               /* xmm0 = xmm0 + src.w * src.w */
-               FETCH(func, *inst, 1, 0, CHAN_W);
-               if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W)) {
-                  emit_MOV(func, 7, 1);
-               }
-               emit_mul(func, 1, 1);
-               emit_add(func, 0, 1);
-            }
-
-            /* xmm1 = 1 / sqrt(xmm0) */
-            emit_rsqrt(func, 1, 0);
-
-            /* dst.x = xmm1 * src.x */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X)) {
-               emit_mul(func, 4, 1);
-               STORE(func, *inst, 4, 0, CHAN_X);
-            }
-
-            /* dst.y = xmm1 * src.y */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Y)) {
-               emit_mul(func, 5, 1);
-               STORE(func, *inst, 5, 0, CHAN_Y);
-            }
-
-            /* dst.z = xmm1 * src.z */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_Z)) {
-               emit_mul(func, 6, 1);
-               STORE(func, *inst, 6, 0, CHAN_Z);
-            }
-
-            /* dst.w = xmm1 * src.w */
-            if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_X) && dims == 4) {
-               emit_mul(func, 7, 1);
-               STORE(func, *inst, 7, 0, CHAN_W);
-            }
-         }
-
-         /* dst0.w = 1.0 */
-         if (IS_DST0_CHANNEL_ENABLED(*inst, CHAN_W) && dims == 3) {
-            emit_tempf(func, 0, TEMP_ONE_I, TEMP_ONE_C);
-            STORE(func, *inst, 0, 0, CHAN_W);
-         }
-      }
-      break;
-
-   case TGSI_OPCODE_DIV:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_DP2:
-      FETCH( func, *inst, 0, 0, CHAN_X );  /* xmm0 = src[0].x */
-      FETCH( func, *inst, 1, 1, CHAN_X );  /* xmm1 = src[1].x */
-      emit_mul( func, 0, 1 );              /* xmm0 = xmm0 * xmm1 */
-      FETCH( func, *inst, 1, 0, CHAN_Y );  /* xmm1 = src[0].y */
-      FETCH( func, *inst, 2, 1, CHAN_Y );  /* xmm2 = src[1].y */
-      emit_mul( func, 1, 2 );              /* xmm1 = xmm1 * xmm2 */
-      emit_add( func, 0, 1 );              /* xmm0 = xmm0 + xmm1 */
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         STORE( func, *inst, 0, 0, chan_index );  /* dest[ch] = xmm0 */
-      }
-      break;
-
-   case TGSI_OPCODE_TXL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXP:
-      emit_tex( func, inst, FALSE, TRUE );
-      break;
-      
-   case TGSI_OPCODE_BRK:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_IF:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ELSE:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDIF:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_PUSHA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_POPA:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CEIL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_I2F:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_NOT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TRUNC:
-      FOR_EACH_DST0_ENABLED_CHANNEL( *inst, chan_index ) {
-         FETCH( func, *inst, 0, 0, chan_index );
-         emit_f2it( func, 0 );
-         emit_i2f( func, 0 );
-         STORE( func, *inst, 0, 0, chan_index );
-      }
-      break;
-
-   case TGSI_OPCODE_SHL:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ISHR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_AND:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_OR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_MOD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_XOR:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_SAD:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXF:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_TXQ:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_CONT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_EMIT:
-      return 0;
-      break;
-
-   case TGSI_OPCODE_ENDPRIM:
-      return 0;
-      break;
-
-   default:
-      return 0;
-   }
-   
-   return 1;
-}
-
-/**
- * Emit interpolation code for one input declaration.
- *
- * Declarations for any file other than TGSI_FILE_INPUT produce no code.
- * For every register in the declared range, and every channel selected by
- * the usage mask, the attribute value is evaluated according to the
- * declared interpolation mode and handed to emit_inputs().
- *
- * \param func  function the instructions are appended to
- * \param decl  declaration to generate code for
- */
-static void
-emit_declaration(
-   struct x86_function *func,
-   struct tgsi_full_declaration *decl )
-{
-   unsigned first_reg, last_reg, usage_mask;
-   unsigned reg, chan;
-
-   /* Nothing to do for non-input declarations. */
-   if( decl->Declaration.File != TGSI_FILE_INPUT )
-      return;
-
-   first_reg = decl->Range.First;
-   last_reg = decl->Range.Last;
-   usage_mask = decl->Declaration.UsageMask;
-
-   for( reg = first_reg; reg <= last_reg; reg++ ) {
-      for( chan = 0; chan < NUM_CHANNELS; chan++ ) {
-         /* Skip channels the declaration does not use. */
-         if( !(usage_mask & (1 << chan)) )
-            continue;
-
-         switch( decl->Declaration.Interpolate ) {
-         case TGSI_INTERPOLATE_CONSTANT:
-            /* input = a0 */
-            emit_coef_a0( func, 0, reg, chan );
-            emit_inputs( func, 0, reg, chan );
-            break;
-
-         case TGSI_INTERPOLATE_LINEAR:
-            /* input = x * dadx + y * dady + a0 */
-            emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
-            emit_coef_dadx( func, 1, reg, chan );
-            emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
-            emit_coef_dady( func, 3, reg, chan );
-            emit_mul( func, 0, 1 );    /* xmm0 = x * dadx */
-            emit_coef_a0( func, 4, reg, chan );
-            emit_mul( func, 2, 3 );    /* xmm2 = y * dady */
-            emit_add( func, 0, 4 );    /* xmm0 += a0 */
-            emit_add( func, 0, 2 );    /* xmm0 += y * dady */
-            emit_inputs( func, 0, reg, chan );
-            break;
-
-         case TGSI_INTERPOLATE_PERSPECTIVE:
-            /* input = (x * dadx + y * dady + a0) / w */
-            emit_tempf( func, 0, 0, TGSI_SWIZZLE_X );
-            emit_coef_dadx( func, 1, reg, chan );
-            emit_tempf( func, 2, 0, TGSI_SWIZZLE_Y );
-            emit_coef_dady( func, 3, reg, chan );
-            emit_mul( func, 0, 1 );    /* xmm0 = x * dadx */
-            emit_tempf( func, 4, 0, TGSI_SWIZZLE_W );
-            emit_coef_a0( func, 5, reg, chan );
-            emit_rcp( func, 4, 4 );    /* xmm4 = 1.0 / w */
-            emit_mul( func, 2, 3 );    /* xmm2 = y * dady */
-            emit_add( func, 0, 5 );    /* xmm0 += a0 */
-            emit_add( func, 0, 2 );    /* xmm0 += y * dady */
-            emit_mul( func, 0, 4 );    /* xmm0 *= 1.0 / w */
-            emit_inputs( func, 0, reg, chan );
-            break;
-
-         default:
-            /* Unknown interpolation mode. */
-            assert( 0 );
-            break;
-         }
-      }
-   }
-}
-/**
- * Emit code that transposes inputs from array-of-structures (AoS) layout
- * into structure-of-arrays (SoA) layout, writing the result starting at
- * the address of the tgsi_exec_machine Inputs member.
- *
- * The arg_* parameters are not run-time values: each is handed to
- * x86_fn_arg() to locate the matching argument of the generated function.
- *
- * \param func  function the instructions are appended to
- * \param arg_aos  argument index of the AoS source pointer
- * \param arg_machine  argument index of the tgsi_exec_machine pointer
- * \param arg_num  argument index of the loop count (one 16-byte input each)
- * \param arg_stride  argument index of the byte stride between AoS records
- *
- * The emitted code uses EAX, EBX, ECX, EDX and xmm0..xmm5; only EBX is
- * saved and restored here.
- */
-static void aos_to_soa( struct x86_function *func, 
-                        uint arg_aos,
-                        uint arg_machine, 
-                        uint arg_num, 
-                        uint arg_stride )
-{
-   struct x86_reg soa_input = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_input = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_inputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg stride = x86_make_reg( file_REG32, reg_DX );
-   int loop_top, loop_exit_fixup;
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_input,  x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_input,  x86_fn_arg( func, arg_machine ) );
-   /* FIXME: tgsi_exec_machine::Inputs is a pointer now!
-    *
-    * The LEA below produces the address of the Inputs member itself
-    * (machine + offset); it does not load a pointer stored there.
-    */
-   x86_lea( func, soa_input,  
-	    x86_make_disp( soa_input, 
-			   Offset(struct tgsi_exec_machine, Inputs) ) );
-   x86_mov( func, num_inputs, x86_fn_arg( func, arg_num ) );
-   x86_mov( func, stride,     x86_fn_arg( func, arg_stride ) );
-
-   /* while (num_inputs != 0)
-    *
-    * The count is tested before the body, so a zero count skips the loop.
-    * Each iteration gathers one 16-byte slot from four consecutive AoS
-    * records (stride bytes apart): xmm0/xmm1 receive the first 8 bytes of
-    * each slot, xmm3/xmm4 the second 8 bytes.  aos_input is pushed and
-    * popped around the gather so the stride additions do not persist.
-    */
-   loop_top = x86_get_label( func );
-   x86_cmp_imm( func, num_inputs, 0 );
-   loop_exit_fixup = x86_jcc_forward( func, cc_E );
-
-   {
-      x86_push( func, aos_input );
-      sse_movlps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 0 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 3 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movlps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movlps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_add( func, aos_input, stride );
-      sse_movhps( func, make_xmm( 1 ), x86_make_disp( aos_input, 0 ) );
-      sse_movhps( func, make_xmm( 4 ), x86_make_disp( aos_input, 8 ) );
-      x86_pop( func, aos_input );
-      /* Transpose.  With 32-bit lanes a,b,c,d per record and assuming the
-       * usual (dst, src) operand order, shuffle 0x88 keeps the even lanes
-       * and 0xdd the odd lanes of each register pair, leaving
-       * xmm0 = a0..a3, xmm2 = b0..b3, xmm3 = c0..c3, xmm5 = d0..d3.
-       */
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_shufps( func, make_xmm( 0 ), make_xmm( 1 ), 0x88 );
-      sse_shufps( func, make_xmm( 2 ), make_xmm( 1 ), 0xdd );
-      sse_shufps( func, make_xmm( 3 ), make_xmm( 4 ), 0x88 );
-      sse_shufps( func, make_xmm( 5 ), make_xmm( 4 ), 0xdd );
-      /* Write the four transposed registers (4 x 16 bytes) to the SoA side. */
-      sse_movups( func, x86_make_disp( soa_input, 0 ), make_xmm( 0 ) );
-      sse_movups( func, x86_make_disp( soa_input, 16 ), make_xmm( 2 ) );
-      sse_movups( func, x86_make_disp( soa_input, 32 ), make_xmm( 3 ) );
-      sse_movups( func, x86_make_disp( soa_input, 48 ), make_xmm( 5 ) );
-
-      /* Advance to next input: 16 bytes further on the AoS side,
-       * 64 bytes further on the SoA side.
-       */
-      x86_lea( func, aos_input, x86_make_disp(aos_input, 16) );
-      x86_lea( func, soa_input, x86_make_disp(soa_input, 64) );
-   }
-   /* --num_inputs, then jump back to the test at loop_top */
-   x86_dec( func, num_inputs );
-   x86_jmp( func, loop_top );
-   x86_fixup_fwd_jump( func, loop_exit_fixup );
-
-   /* Restore EBX */
-   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
-}
-/**
- * Emit code that transposes outputs from structure-of-arrays (SoA) layout,
- * read starting at the address of the tgsi_exec_machine Outputs member,
- * into array-of-structures (AoS) records.
- *
- * The arg_* parameters are not run-time values: each is handed to
- * x86_fn_arg() to locate the matching argument of the generated function.
- *
- * \param func  function the instructions are appended to
- * \param arg_aos  argument index of the AoS destination pointer
- * \param arg_machine  argument index of the tgsi_exec_machine pointer
- * \param arg_num  argument index of the loop count (one 16-byte output each)
- * \param arg_stride  argument index of the byte stride between AoS records
- *
- * The emitted code uses EAX, EBX, ECX, EDX and xmm0..xmm5; only EBX is
- * saved and restored here.
- */
-static void soa_to_aos( struct x86_function *func, 
-			uint arg_aos, 
-			uint arg_machine, 
-			uint arg_num, 
-			uint arg_stride )
-{
-   struct x86_reg soa_output = x86_make_reg( file_REG32, reg_AX );
-   struct x86_reg aos_output = x86_make_reg( file_REG32, reg_BX );
-   struct x86_reg num_outputs = x86_make_reg( file_REG32, reg_CX );
-   struct x86_reg temp = x86_make_reg( file_REG32, reg_DX );
-   int inner_loop;
-
-   /* Save EBX */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   x86_mov( func, aos_output, x86_fn_arg( func, arg_aos ) );
-   x86_mov( func, soa_output, x86_fn_arg( func, arg_machine ) );
-   /* FIXME: tgsi_exec_machine::Outputs is a pointer now!
-    *
-    * The LEA below produces the address of the Outputs member itself
-    * (machine + offset); it does not load a pointer stored there.
-    */
-   x86_lea( func, soa_output, 
-	    x86_make_disp( soa_output, 
-			   Offset(struct tgsi_exec_machine, Outputs) ) );
-   x86_mov( func, num_outputs, x86_fn_arg( func, arg_num ) );
-
-   /* do
-    *
-    * The body runs before the count is tested, so the generated code
-    * always processes at least one output.  Each iteration first loads
-    * four 16-byte SoA rows into xmm0, xmm1, xmm3 and xmm4.
-    */
-   inner_loop = x86_get_label( func );
-   {
-      sse_movups( func, make_xmm( 0 ), x86_make_disp( soa_output, 0 ) );
-      sse_movups( func, make_xmm( 1 ), x86_make_disp( soa_output, 16 ) );
-      sse_movups( func, make_xmm( 3 ), x86_make_disp( soa_output, 32 ) );
-      sse_movups( func, make_xmm( 4 ), x86_make_disp( soa_output, 48 ) );
-      /* Interleave the rows (assuming the usual (dst, src) operand order):
-       * xmm0 and xmm2 end up with the (row0,row1) pairs of records 0-1 and
-       * 2-3, xmm3 and xmm5 with the matching (row2,row3) pairs.
-       */
-      sse_movaps( func, make_xmm( 2 ), make_xmm( 0 ) );
-      sse_movaps( func, make_xmm( 5 ), make_xmm( 3 ) );
-      sse_unpcklps( func, make_xmm( 0 ), make_xmm( 1 ) );
-      sse_unpckhps( func, make_xmm( 2 ), make_xmm( 1 ) );
-      sse_unpcklps( func, make_xmm( 3 ), make_xmm( 4 ) );
-      sse_unpckhps( func, make_xmm( 5 ), make_xmm( 4 ) );
-      /* Scatter to four AoS records, stride bytes apart: each record gets
-       * 8 bytes from xmm0/xmm2 and 8 bytes from xmm3/xmm5.  The stride
-       * argument is reloaded into EDX on every iteration, and aos_output
-       * is pushed and popped so the stride additions do not persist.
-       */
-      x86_mov( func, temp, x86_fn_arg( func, arg_stride ) );
-      x86_push( func, aos_output );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 0 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 3 ) );
-      x86_add( func, aos_output, temp );
-      sse_movlps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movlps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_add( func, aos_output, temp );
-      sse_movhps( func, x86_make_disp( aos_output, 0 ), make_xmm( 2 ) );
-      sse_movhps( func, x86_make_disp( aos_output, 8 ), make_xmm( 5 ) );
-      x86_pop( func, aos_output );
-
-      /* Advance to next output: 16 bytes further on the AoS side,
-       * 64 bytes further on the SoA side.
-       */
-      x86_lea( func, aos_output, x86_make_disp(aos_output, 16) );
-      x86_lea( func, soa_output, x86_make_disp(soa_output, 64) );
-   }
-   /* while --num_outputs: loop again while the decremented count is non-zero */
-   x86_dec( func, num_outputs );
-   x86_jcc( func, cc_NE, inner_loop );
-
-   /* Restore EBX */
-   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
-}
-
-
-/**
- * Decide whether an instruction can be translated despite possible
- * src/dst register aliasing in the SOA code.
- *
- * Opcodes on the list below are accepted unconditionally.  Any other
- * opcode is accepted only if tgsi_check_soa_dependencies() reports no
- * dependency; otherwise a warning and the instruction are printed.
- *
- * \param inst  the instruction to examine
- * \return  TRUE if the instruction is safe to translate, FALSE otherwise
- */
-static boolean
-check_soa_dependencies(const struct tgsi_full_instruction *inst)
-{
-   /* XXX: src/dst aliasing is currently handled in only a few opcodes.
-    * Where the code is too opaque to fix, an additional temporary is
-    * needed to hold the result.
-    */
-   switch (inst->Instruction.Opcode) {
-   case TGSI_OPCODE_ADD:
-   case TGSI_OPCODE_MOV:
-   case TGSI_OPCODE_MUL:
-   case TGSI_OPCODE_RCP:
-   case TGSI_OPCODE_RSQ:
-   case TGSI_OPCODE_EXP:
-   case TGSI_OPCODE_LOG:
-   case TGSI_OPCODE_DP3:
-   case TGSI_OPCODE_DP4:
-   case TGSI_OPCODE_DP2A:
-   case TGSI_OPCODE_EX2:
-   case TGSI_OPCODE_LG2:
-   case TGSI_OPCODE_POW:
-   case TGSI_OPCODE_XPD:
-   case TGSI_OPCODE_DPH:
-   case TGSI_OPCODE_COS:
-   case TGSI_OPCODE_SIN:
-   case TGSI_OPCODE_TEX:
-   case TGSI_OPCODE_TXB:
-   case TGSI_OPCODE_TXP:
-   case TGSI_OPCODE_NRM:
-   case TGSI_OPCODE_NRM4:
-   case TGSI_OPCODE_DP2:
-      /* These opcodes cope with SOA dependencies themselves. */
-      return TRUE;
-   default:
-      break;
-   }
-
-   if (tgsi_check_soa_dependencies(inst)) {
-      debug_printf("Warning: src/dst aliasing in instruction"
-                   " is not handled:\n");
-      debug_printf("Warning: ");
-      tgsi_dump_instruction(inst, 1);
-      return FALSE;
-   }
-
-   return TRUE;
-}
-
-
-/**
- * Translate a TGSI vertex/fragment shader to SSE2 code.
- * Slightly different things are done for vertex vs. fragment shaders.
- *
- * \param tokens  the TGSI input shader
- * \param func  the output SSE code/function
- * \param immediates  buffer to place immediates, later passed to SSE func
- * \param do_swizzles  when set, vertex shaders begin by converting their
- *                     AoS inputs to SoA
- * \return  1 for success, 0 if translation failed
- */
-unsigned
-tgsi_emit_sse2(
-   const struct tgsi_token *tokens,
-   struct x86_function *func,
-   float (*immediates)[4],
-   boolean do_swizzles )
-{
-   struct tgsi_parse_context parse;
-   unsigned ok = 1;
-   uint num_immediates = 0;
-
-   util_init_math();
-
-   func->csr = func->store;
-
-   tgsi_parse_init( &parse, tokens );
-
-   /* Can't just use EDI, EBX without save/restoring them:
-    */
-   x86_push( func, x86_make_reg( file_REG32, reg_BX ) );
-   x86_push( func, x86_make_reg( file_REG32, reg_DI ) );
-
-   /*
-    * Different function args for vertex/fragment shaders:
-    */
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
-      if (do_swizzles)
-         aos_to_soa( func, 
-                     4,         /* aos_input */
-                     1,         /* machine */
-                     5,         /* num_inputs */
-                     6 );       /* input_stride */
-   }
-
-   x86_mov(
-      func,
-      get_machine_base(),
-      x86_fn_arg( func, 1 ) );
-   x86_mov(
-      func,
-      get_const_base(),
-      x86_fn_arg( func, 2 ) );
-   x86_mov(
-      func,
-      get_immediate_base(),
-      x86_fn_arg( func, 3 ) );
-
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-      x86_mov(
-	 func,
-	 get_coef_base(),
-	 x86_fn_arg( func, 4 ) );
-   }
-
-   x86_mov(
-      func,
-      get_sampler_base(),
-      x86_make_disp( get_machine_base(),
-                     Offset( struct tgsi_exec_machine, Samplers ) ) );
-
-   while( !tgsi_parse_end_of_tokens( &parse ) && ok ) {
-      tgsi_parse_token( &parse );
-
-      switch( parse.FullToken.Token.Type ) {
-      case TGSI_TOKEN_TYPE_DECLARATION:
-         if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_FRAGMENT) {
-            emit_declaration(
-               func,
-               &parse.FullToken.FullDeclaration );
-         }
-         break;
-
-      case TGSI_TOKEN_TYPE_INSTRUCTION:
-         ok = emit_instruction(
-            func,
-            &parse.FullToken.FullInstruction );
-
-	 if (!ok) {
-            uint opcode = parse.FullToken.FullInstruction.Instruction.Opcode;
-            uint proc = parse.FullHeader.Processor.Processor;
-	    debug_printf("failed to translate tgsi opcode %d (%s) to SSE (%s)\n", 
-			 opcode,
-                         tgsi_get_opcode_name(opcode),
-                         tgsi_get_processor_name(proc));
-	 }
-
-         if (ok)
-            ok = check_soa_dependencies(&parse.FullToken.FullInstruction);
-         break;
-
-      case TGSI_TOKEN_TYPE_IMMEDIATE:
-         /* simply copy the immediate values into the next immediates[] slot */
-         {
-            const uint size = parse.FullToken.FullImmediate.Immediate.NrTokens - 1;
-            uint i;
-            assert(size <= 4);
-            assert(num_immediates < TGSI_EXEC_NUM_IMMEDIATES);
-            for( i = 0; i < size; i++ ) {
-               immediates[num_immediates][i] =
-		  parse.FullToken.FullImmediate.u[i].Float;
-            }
-#if 0
-            debug_printf("SSE FS immediate[%d] = %f %f %f %f\n",
-                   num_immediates,
-                   immediates[num_immediates][0],
-                   immediates[num_immediates][1],
-                   immediates[num_immediates][2],
-                   immediates[num_immediates][3]);
-#endif
-            num_immediates++;
-         }
-         break;
-      case TGSI_TOKEN_TYPE_PROPERTY:
-         /* we just ignore them for now */
-         break;
-
-      default:
-	 ok = 0;
-         assert( 0 );
-      }
-   }
-
-   if (parse.FullHeader.Processor.Processor == TGSI_PROCESSOR_VERTEX) {
-      if (do_swizzles)
-         soa_to_aos( func, 
-		     7, 	/* aos_output */
-		     1, 	/* machine */
-		     8, 	/* num_outputs */
-		     9 );	/* output_stride */
-   }
-
-   /* Can't just use EBX, EDI without save/restoring them:
-    */
-   x86_pop( func, x86_make_reg( file_REG32, reg_DI ) );
-   x86_pop( func, x86_make_reg( file_REG32, reg_BX ) );
-
-   emit_ret( func );
-
-   tgsi_parse_free( &parse );
-
-   return ok;
-}
-
-#else /* !PIPE_ARCH_X86 */
-
-unsigned
-tgsi_emit_sse2(
-   const struct tgsi_token *tokens,
-   struct x86_function *func,
-   float (*immediates)[4],
-   boolean do_swizzles )
-{
-   return 0;
-}
-
-#endif /* !PIPE_ARCH_X86 */
diff --git a/src/gallium/auxiliary/tgsi/tgsi_sse2.h b/src/gallium/auxiliary/tgsi/tgsi_sse2.h
deleted file mode 100644
index 00aa8b84fe9..00000000000
--- a/src/gallium/auxiliary/tgsi/tgsi_sse2.h
+++ /dev/null
@@ -1,80 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef TGSI_SSE2_H
-#define TGSI_SSE2_H
-
-#if defined __cplusplus
-extern "C" {
-#endif
-
-#include "pipe/p_compiler.h"
-
-struct tgsi_exec_machine;
-struct tgsi_interp_coef;
-struct tgsi_token;
-struct x86_function;
-
-unsigned
-tgsi_emit_sse2(
-   const struct tgsi_token *tokens,
-   struct x86_function *function,
-   float (*immediates)[4],
-   boolean do_swizzles );
-
-
-/* This is the function prototype generated when do_swizzles is false
- * -- effectively for fragment shaders.
- */
-typedef void (PIPE_CDECL *tgsi_sse2_fs_function) (
-   struct tgsi_exec_machine *machine, /* 1 */
-   const float (*constant)[4],		    /* 2 */
-   const float (*immediate)[4],		    /* 3 */
-   const struct tgsi_interp_coef *coef	    /* 4 */
-   );
-
-
-/* This is the function prototype generated when do_swizzles is true
- * -- effectively for vertex shaders.
- */
-typedef void (PIPE_CDECL *tgsi_sse2_vs_func) (
-   struct tgsi_exec_machine *machine, /* 1 */
-   const float (*constant)[4],        /* 2 */
-   const float (*immediate)[4],       /* 3 */
-   const float (*aos_input)[4], /* 4 */
-   uint num_inputs,             /* 5 */
-   uint input_stride,           /* 6 */
-   float (*aos_output)[4],      /* 7 */
-   uint num_outputs,            /* 8 */
-   uint output_stride );        /* 9 */
-
-
-#if defined __cplusplus
-}
-#endif
-
-#endif /* TGSI_SSE2_H */
diff --git a/src/gallium/drivers/softpipe/Android.mk b/src/gallium/drivers/softpipe/Android.mk
index d198fa5d0f2..6a125a5d412 100644
--- a/src/gallium/drivers/softpipe/Android.mk
+++ b/src/gallium/drivers/softpipe/Android.mk
@@ -26,7 +26,6 @@ LOCAL_PATH := $(call my-dir)
 # from Makefile
 C_SOURCES = \
 	sp_fs_exec.c \
-	sp_fs_sse.c \
 	sp_clear.c \
 	sp_fence.c \
 	sp_flush.c \
diff --git a/src/gallium/drivers/softpipe/Makefile b/src/gallium/drivers/softpipe/Makefile
index 9403e6cf0b8..27b5d991a75 100644
--- a/src/gallium/drivers/softpipe/Makefile
+++ b/src/gallium/drivers/softpipe/Makefile
@@ -5,7 +5,6 @@ LIBNAME = softpipe
 
 C_SOURCES = \
 	sp_fs_exec.c \
-	sp_fs_sse.c \
 	sp_clear.c \
 	sp_fence.c \
 	sp_flush.c \
diff --git a/src/gallium/drivers/softpipe/SConscript b/src/gallium/drivers/softpipe/SConscript
index ea10e8a9f98..da2c93ee5fa 100644
--- a/src/gallium/drivers/softpipe/SConscript
+++ b/src/gallium/drivers/softpipe/SConscript
@@ -6,7 +6,6 @@ softpipe = env.ConvenienceLibrary(
 	target = 'softpipe',
 	source = [
 		'sp_fs_exec.c',
-		'sp_fs_sse.c',
 		'sp_clear.c',
 		'sp_context.c',
 		'sp_draw_arrays.c',
diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
index c97b0333035..3a83e5870dc 100644
--- a/src/gallium/drivers/softpipe/sp_context.c
+++ b/src/gallium/drivers/softpipe/sp_context.c
@@ -235,12 +235,6 @@ softpipe_create_context( struct pipe_screen *screen,
 
    util_init_math();
 
-#ifdef PIPE_ARCH_X86
-   softpipe->use_sse = !debug_get_bool_option( "GALLIUM_NOSSE", FALSE );
-#else
-   softpipe->use_sse = FALSE;
-#endif
-
    softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
    softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
 
diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
index d51ce9fe333..5442aba9019 100644
--- a/src/gallium/drivers/softpipe/sp_context.h
+++ b/src/gallium/drivers/softpipe/sp_context.h
@@ -190,7 +190,6 @@ struct softpipe_context {
    struct softpipe_tex_tile_cache *vertex_tex_cache[PIPE_MAX_VERTEX_SAMPLERS];
    struct softpipe_tex_tile_cache *geometry_tex_cache[PIPE_MAX_GEOMETRY_SAMPLERS];
 
-   unsigned use_sse : 1;
    unsigned dump_fs : 1;
    unsigned dump_gs : 1;
    unsigned no_rast : 1;
diff --git a/src/gallium/drivers/softpipe/sp_fs.h b/src/gallium/drivers/softpipe/sp_fs.h
index d46d7d5a657..db689b82bd5 100644
--- a/src/gallium/drivers/softpipe/sp_fs.h
+++ b/src/gallium/drivers/softpipe/sp_fs.h
@@ -36,10 +36,6 @@ struct sp_fragment_shader_variant *
 softpipe_create_fs_variant_exec(struct softpipe_context *softpipe,
                                 const struct pipe_shader_state *templ);
 
-struct sp_fragment_shader_variant *
-softpipe_create_fs_variant_sse(struct softpipe_context *softpipe,
-                               const struct pipe_shader_state *templ);
-
 
 struct tgsi_interp_coef;
 struct tgsi_exec_vector;
diff --git a/src/gallium/drivers/softpipe/sp_fs_sse.c b/src/gallium/drivers/softpipe/sp_fs_sse.c
deleted file mode 100644
index c873af125bd..00000000000
--- a/src/gallium/drivers/softpipe/sp_fs_sse.c
+++ /dev/null
@@ -1,248 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Execute fragment shader using runtime SSE code generation.
- */
-
-#include "sp_context.h"
-#include "sp_state.h"
-#include "sp_fs.h"
-#include "sp_quad.h"
-
-#include "pipe/p_state.h"
-#include "pipe/p_defines.h"
-#include "util/u_memory.h"
-#include "tgsi/tgsi_exec.h"
-#include "tgsi/tgsi_sse2.h"
-
-
-#if defined(PIPE_ARCH_X86)
-
-#include "rtasm/rtasm_x86sse.h"
-
-
-
-/**
- * Subclass of sp_fragment_shader_variant
- */
-struct sp_sse_fragment_shader
-{
-   struct sp_fragment_shader_variant base;
-   struct x86_function sse2_program;
-   tgsi_sse2_fs_function func;
-   float immediates[TGSI_EXEC_NUM_IMMEDIATES][4];
-};
-
-
-/** cast wrapper */
-static INLINE struct sp_sse_fragment_shader *
-sp_sse_fragment_shader(const struct sp_fragment_shader_variant *base)
-{
-   return (struct sp_sse_fragment_shader *) base;
-}
-
-
-static void
-fs_sse_prepare( const struct sp_fragment_shader_variant *base,
-		struct tgsi_exec_machine *machine,
-		struct tgsi_sampler **samplers )
-{
-   machine->Samplers = samplers;
-}
-
-
-
-/**
- * Compute quad X,Y,Z,W for the four fragments in a quad.
- *
- * This should really be part of the compiled shader.
- */
-static void
-setup_pos_vector(const struct tgsi_interp_coef *coef,
-		    float x, float y,
-		    struct tgsi_exec_vector *quadpos)
-{
-   uint chan;
-   /* do X */
-   quadpos->xyzw[0].f[0] = x;
-   quadpos->xyzw[0].f[1] = x + 1;
-   quadpos->xyzw[0].f[2] = x;
-   quadpos->xyzw[0].f[3] = x + 1;
-
-   /* do Y */
-   quadpos->xyzw[1].f[0] = y;
-   quadpos->xyzw[1].f[1] = y;
-   quadpos->xyzw[1].f[2] = y + 1;
-   quadpos->xyzw[1].f[3] = y + 1;
-
-   /* do Z and W for all fragments in the quad */
-   for (chan = 2; chan < 4; chan++) {
-      const float dadx = coef->dadx[chan];
-      const float dady = coef->dady[chan];
-      const float a0 = coef->a0[chan] + dadx * x + dady * y;
-      quadpos->xyzw[chan].f[0] = a0;
-      quadpos->xyzw[chan].f[1] = a0 + dadx;
-      quadpos->xyzw[chan].f[2] = a0 + dady;
-      quadpos->xyzw[chan].f[3] = a0 + dadx + dady;
-   }
-}
-
-
-/* TODO: codegenerate the whole run function, skip this wrapper.
- * TODO: break dependency on tgsi_exec_machine struct
- * TODO: push Position calculation into the generated shader
- * TODO: process >1 quad at a time
- */
-static unsigned 
-fs_sse_run( const struct sp_fragment_shader_variant *base,
-	    struct tgsi_exec_machine *machine,
-	    struct quad_header *quad )
-{
-   struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
-
-   /* Compute X, Y, Z, W vals for this quad -- place in temp[0] for now */
-   setup_pos_vector(quad->posCoef, 
-                    (float)quad->input.x0, (float)quad->input.y0, 
-                    machine->Temps);
-
-   /* init kill mask */
-   tgsi_set_kill_mask(machine, 0x0);
-   tgsi_set_exec_mask(machine, 1, 1, 1, 1);
-
-   shader->func( machine,
-                 (const float (*)[4])machine->Consts[0],
-                 (const float (*)[4])shader->immediates,
-		 machine->InterpCoefs
-		 /*, &machine->QuadPos*/
-      );
-
-   quad->inout.mask &= ~(machine->Temps[TGSI_EXEC_TEMP_KILMASK_I].xyzw[TGSI_EXEC_TEMP_KILMASK_C].u[0]);
-   if (quad->inout.mask == 0)
-      return FALSE;
-
-   /* store outputs */
-   {
-      const ubyte *sem_name = base->info.output_semantic_name;
-      const ubyte *sem_index = base->info.output_semantic_index;
-      const uint n = base->info.num_outputs;
-      uint i;
-      for (i = 0; i < n; i++) {
-         switch (sem_name[i]) {
-         case TGSI_SEMANTIC_COLOR:
-            {
-               uint cbuf = sem_index[i];
-
-               assert(sizeof(quad->output.color[cbuf]) ==
-                      sizeof(machine->Outputs[i]));
-
-               /* copy float[4][4] result */
-               memcpy(quad->output.color[cbuf],
-                      &machine->Outputs[i],
-                      sizeof(quad->output.color[0]) );
-            }
-            break;
-         case TGSI_SEMANTIC_POSITION:
-            {
-               uint j;
-               for (j = 0; j < 4; j++)
-                  quad->output.depth[j] = machine->Outputs[i].xyzw[2].f[j];
-            }
-            break;
-         case TGSI_SEMANTIC_STENCIL:
-            {
-               uint j;
-               for (j = 0; j < 4; j++)
-                  quad->output.stencil[j] = machine->Outputs[i].xyzw[1].f[j];
-            }
-            break;
-         }
-      }
-   }
-
-   return TRUE;
-}
-
-
-static void 
-fs_sse_delete( struct sp_fragment_shader_variant *base )
-{
-   struct sp_sse_fragment_shader *shader = sp_sse_fragment_shader(base);
-
-   x86_release_func( &shader->sse2_program );
-   FREE(shader);
-}
-
-
-struct sp_fragment_shader_variant *
-softpipe_create_fs_variant_sse(struct softpipe_context *softpipe,
-                               const struct pipe_shader_state *templ)
-{
-   struct sp_sse_fragment_shader *shader;
-
-   if (!softpipe->use_sse)
-      return NULL;
-
-   shader = CALLOC_STRUCT(sp_sse_fragment_shader);
-   if (!shader)
-      return NULL;
-
-   x86_init_func( &shader->sse2_program );
-   
-   if (!tgsi_emit_sse2( templ->tokens, &shader->sse2_program,
-                        shader->immediates, FALSE )) {
-      FREE(shader);
-      return NULL;
-   }
-
-   shader->func = (tgsi_sse2_fs_function) x86_get_func( &shader->sse2_program );
-   if (!shader->func) {
-      x86_release_func( &shader->sse2_program );
-      FREE(shader);
-      return NULL;
-   }
-
-   shader->base.prepare = fs_sse_prepare;
-   shader->base.run = fs_sse_run;
-   shader->base.delete = fs_sse_delete;
-
-   return &shader->base;
-}
-
-
-#else
-
-/* Maybe put this variant in the header file.
- */
-struct sp_fragment_shader_variant *
-softpipe_create_fs_variant_sse(struct softpipe_context *softpipe,
-                               const struct pipe_shader_state *templ)
-{
-   return NULL;
-}
-
-#endif
diff --git a/src/gallium/drivers/softpipe/sp_state_shader.c b/src/gallium/drivers/softpipe/sp_state_shader.c
index 612dcb38eb4..6acb57b3fe6 100644
--- a/src/gallium/drivers/softpipe/sp_state_shader.c
+++ b/src/gallium/drivers/softpipe/sp_state_shader.c
@@ -65,10 +65,7 @@ create_fs_variant(struct softpipe_context *softpipe,
 #endif
 
    /* codegen, create variant object */
-   var = softpipe_create_fs_variant_sse(softpipe, curfs);
-   if (!var) {
-      var = softpipe_create_fs_variant_exec(softpipe, curfs);
-   }
+   var = softpipe_create_fs_variant_exec(softpipe, curfs);
 
    if (var) {
       var->key = *key;
author	José Fonseca <[email protected]>	2011-11-08 00:10:47 +0000
committer	José Fonseca <[email protected]>	2011-11-08 22:57:34 +0000
commit	4eb3225b38ce12cb34ab3d90804c9683bd7b4ed3 (patch)
tree	857d6c1740eb32fc86744f7afd81322862f6150c /src
parent	207a016ecaabbccf865a5b8e026b95a4276adc15 (diff)