42 files changed, 5028 insertions, 359 deletions
diff --git a/src/gallium/auxiliary/draw/Makefile b/src/gallium/auxiliary/draw/Makefile
index da7eded21f6..9a88ecc0708 100644
--- a/src/gallium/auxiliary/draw/Makefile
+++ b/src/gallium/auxiliary/draw/Makefile
@@ -26,12 +26,17 @@ C_SOURCES = \
 	draw_pt_emit.c \
 	draw_pt_fetch.c \
 	draw_pt_fetch_emit.c \
+	draw_pt_fetch_shade_emit.c \
 	draw_pt_fetch_shade_pipeline.c \
 	draw_pt_post_vs.c \
+        draw_pt_util.c \
         draw_pt_varray.c \
 	draw_pt_vcache.c \
 	draw_vertex.c \
 	draw_vs.c \
+	draw_vs_varient.c \
+	draw_vs_aos.c \
+	draw_vs_aos_io.c \
 	draw_vs_exec.c \
 	draw_vs_llvm.c \
 	draw_vs_sse.c 
diff --git a/src/gallium/auxiliary/draw/draw_context.c b/src/gallium/auxiliary/draw/draw_context.c
index 98e23fa8305..22420749656 100644
--- a/src/gallium/auxiliary/draw/draw_context.c
+++ b/src/gallium/auxiliary/draw/draw_context.c
@@ -56,12 +56,6 @@ struct draw_context *draw_create( void )
 
    draw->reduced_prim = ~0; /* != any of PIPE_PRIM_x */
 
-   tgsi_exec_machine_init(&draw->machine);
-
-   /* FIXME: give this machine thing a proper constructor:
-    */
-   draw->machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
-   draw->machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
 
    if (!draw_pipeline_init( draw ))
       goto fail;
@@ -69,6 +63,9 @@ struct draw_context *draw_create( void )
    if (!draw_pt_init( draw ))
       goto fail;
 
+   if (!draw_vs_init( draw ))
+      goto fail;
+
    return draw;
 
 fail:
@@ -83,13 +80,6 @@ void draw_destroy( struct draw_context *draw )
       return;
 
 
-   if (draw->machine.Inputs)
-      align_free(draw->machine.Inputs);
-
-   if (draw->machine.Outputs)
-      align_free(draw->machine.Outputs);
-
-   tgsi_exec_machine_free_data(&draw->machine);
 
    /* Not so fast -- we're just borrowing this at the moment.
     * 
@@ -99,6 +89,7 @@ void draw_destroy( struct draw_context *draw )
 
    draw_pipeline_destroy( draw );
    draw_pt_destroy( draw );
+   draw_vs_destroy( draw );
 
    FREE( draw );
 }
@@ -295,7 +286,7 @@ int
 draw_find_vs_output(struct draw_context *draw,
                     uint semantic_name, uint semantic_index)
 {
-   const struct draw_vertex_shader *vs = draw->vertex_shader;
+   const struct draw_vertex_shader *vs = draw->vs.vertex_shader;
    uint i;
    for (i = 0; i < vs->info.num_outputs; i++) {
       if (vs->info.output_semantic_name[i] == semantic_name &&
@@ -320,7 +311,7 @@ draw_find_vs_output(struct draw_context *draw,
 uint
 draw_num_vs_outputs(struct draw_context *draw)
 {
-   uint count = draw->vertex_shader->info.num_outputs;
+   uint count = draw->vs.vertex_shader->info.num_outputs;
    if (draw->extra_vp_outputs.slot > 0)
       count++;
    return count;
diff --git a/src/gallium/auxiliary/draw/draw_pipe.c b/src/gallium/auxiliary/draw/draw_pipe.c
index 46afb0f41f7..1d26706deee 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.c
+++ b/src/gallium/auxiliary/draw/draw_pipe.c
@@ -212,6 +212,71 @@ void draw_pipeline_run( struct draw_context *draw,
    draw->pipeline.vertex_count = 0;
 }
 
+#define QUAD(i0,i1,i2,i3)                                        \
+   do_triangle( draw,                                            \
+                ( DRAW_PIPE_RESET_STIPPLE |                      \
+                  DRAW_PIPE_EDGE_FLAG_0 |                        \
+                  DRAW_PIPE_EDGE_FLAG_2 ),                       \
+                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \
+                 verts + stride * (i1),                          \
+                verts + stride * (i3));                          \
+      do_triangle( draw,                                         \
+                   ( DRAW_PIPE_EDGE_FLAG_0 |                     \
+                     DRAW_PIPE_EDGE_FLAG_1 ),                    \
+                 verts + stride * ((i1) & ~DRAW_PIPE_FLAG_MASK), \
+                 verts + stride * (i2),                          \
+                 verts + stride * (i3))
+
+#define TRIANGLE(flags,i0,i1,i2)                                 \
+   do_triangle( draw,                                            \
+                flags,  /* flags */                              \
+                 verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \
+                 verts + stride * (i1),                          \
+                 verts + stride * (i2))
+
+#define LINE(flags,i0,i1)                                   \
+   do_line( draw,                                           \
+            flags,                                          \
+            verts + stride * ((i0) & ~DRAW_PIPE_FLAG_MASK), \
+            verts + stride * (i+1))
+
+#define POINT(i0)                               \
+   do_point( draw,                              \
+             verts + stride * i0 )
+
+#define FUNC pipe_run_linear
+#define ARGS                                    \
+    struct draw_context *draw,                  \
+    unsigned prim,                              \
+    struct vertex_header *vertices,             \
+    unsigned stride
+
+#define LOCAL_VARS                                           \
+   char *verts = (char *)vertices;                           \
+   boolean flatfirst = (draw->rasterizer->flatshade &&       \
+                        draw->rasterizer->flatshade_first);  \
+   unsigned i, flags
+
+#define FLUSH
+
+#include "draw_pt_decompose.h"
+
+void draw_pipeline_run_linear( struct draw_context *draw,
+                               unsigned prim,
+                               struct vertex_header *vertices,
+                               unsigned count,
+                               unsigned stride )
+{
+   char *verts = (char *)vertices;
+   draw->pipeline.verts = verts;
+   draw->pipeline.vertex_stride = stride;
+   draw->pipeline.vertex_count = count;
+
+   pipe_run_linear(draw, prim, vertices, stride, count);
+
+   draw->pipeline.verts = NULL;
+   draw->pipeline.vertex_count = 0;
+}
 
 
 void draw_pipeline_flush( struct draw_context *draw, 
diff --git a/src/gallium/auxiliary/draw/draw_pipe.h b/src/gallium/auxiliary/draw/draw_pipe.h
index f1cb0891caa..dbad8f98ac7 100644
--- a/src/gallium/auxiliary/draw/draw_pipe.h
+++ b/src/gallium/auxiliary/draw/draw_pipe.h
@@ -116,7 +116,7 @@ dup_vert( struct draw_stage *stage,
 {   
    struct vertex_header *tmp = stage->tmp[idx];
    const uint vsize = sizeof(struct vertex_header)
-      + stage->draw->num_vs_outputs * 4 * sizeof(float);
+      + stage->draw->vs.num_vs_outputs * 4 * sizeof(float);
    memcpy(tmp, vert, vsize);
    tmp->vertex_id = UNDEFINED_VERTEX_ID;
    return tmp;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aaline.c b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
index b1ed8aa24ea..fd48b224b48 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aaline.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aaline.c
@@ -653,7 +653,7 @@ aaline_first_line(struct draw_stage *stage, struct prim_header *header)
    }
 
    /* update vertex attrib info */
-   aaline->tex_slot = draw->num_vs_outputs;
+   aaline->tex_slot = draw->vs.num_vs_outputs;
    assert(aaline->tex_slot > 0); /* output[0] is vertex pos */
 
    /* advertise the extra post-transformed vertex attribute */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
index 122a48660ad..97d74ad693b 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_aapoint.c
@@ -681,7 +681,7 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header)
    bind_aapoint_fragment_shader(aapoint);
 
    /* update vertex attrib info */
-   aapoint->tex_slot = draw->num_vs_outputs;
+   aapoint->tex_slot = draw->vs.num_vs_outputs;
    assert(aapoint->tex_slot > 0); /* output[0] is vertex pos */
 
    draw->extra_vp_outputs.semantic_name = TGSI_SEMANTIC_GENERIC;
@@ -692,7 +692,7 @@ aapoint_first_point(struct draw_stage *stage, struct prim_header *header)
    aapoint->psize_slot = -1;
    if (draw->rasterizer->point_size_per_vertex) {
       /* find PSIZ vertex output */
-      const struct draw_vertex_shader *vs = draw->vertex_shader;
+      const struct draw_vertex_shader *vs = draw->vs.vertex_shader;
       uint i;
       for (i = 0; i < vs->info.num_outputs; i++) {
          if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) {
diff --git a/src/gallium/auxiliary/draw/draw_pipe_clip.c b/src/gallium/auxiliary/draw/draw_pipe_clip.c
index ce80c941639..c11ed934a46 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_clip.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_clip.c
@@ -112,7 +112,7 @@ static void interp( const struct clipper *clip,
 		    const struct vertex_header *out, 
 		    const struct vertex_header *in )
 {
-   const unsigned nr_attrs = clip->stage.draw->num_vs_outputs;
+   const unsigned nr_attrs = clip->stage.draw->vs.num_vs_outputs;
    unsigned j;
 
    /* Vertex header.
@@ -180,7 +180,7 @@ static void emit_poly( struct draw_stage *stage,
         header.flags |= edge_last;
 
       if (0) {
-         const struct draw_vertex_shader *vs = stage->draw->vertex_shader;
+         const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;
          uint j, k;
          debug_printf("Clipped tri:\n");
          for (j = 0; j < 3; j++) {
@@ -425,7 +425,7 @@ clip_init_state( struct draw_stage *stage )
    clipper->flat = stage->draw->rasterizer->flatshade ? TRUE : FALSE;
 
    if (clipper->flat) {
-      const struct draw_vertex_shader *vs = stage->draw->vertex_shader;
+      const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;
       uint i;
 
       clipper->num_color_attribs = 0;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
index 09b68c45599..21a9c3b77f8 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_flatshade.c
@@ -159,7 +159,7 @@ static void flatshade_line_1( struct draw_stage *stage,
 static void flatshade_init_state( struct draw_stage *stage )
 {
    struct flat_stage *flat = flat_stage(stage);
-   const struct draw_vertex_shader *vs = stage->draw->vertex_shader;
+   const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;
    uint i;
 
    /* Find which vertex shader outputs are colors, make a list */
diff --git a/src/gallium/auxiliary/draw/draw_pipe_stipple.c b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
index 3cbced362e8..9522b79582e 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_stipple.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_stipple.c
@@ -71,7 +71,7 @@ screen_interp( struct draw_context *draw,
                const struct vertex_header *v1 )
 {
    uint attr;
-   for (attr = 0; attr < draw->num_vs_outputs; attr++) {
+   for (attr = 0; attr < draw->vs.num_vs_outputs; attr++) {
       const float *val0 = v0->data[attr];
       const float *val1 = v1->data[attr];
       float *newv = dst->data[attr];
@@ -175,6 +175,22 @@ reset_stipple_counter(struct draw_stage *stage)
    stage->next->reset_stipple_counter( stage->next );
 }
 
+static void
+stipple_reset_point(struct draw_stage *stage, struct prim_header *header)
+{
+   struct stipple_stage *stipple = stipple_stage(stage);
+   stipple->counter = 0;
+   stage->next->point(stage->next, header);
+}
+
+static void
+stipple_reset_tri(struct draw_stage *stage, struct prim_header *header)
+{
+   struct stipple_stage *stipple = stipple_stage(stage);
+   stipple->counter = 0;
+   stage->next->tri(stage->next, header);
+}
+
 
 static void
 stipple_first_line(struct draw_stage *stage, 
@@ -220,9 +236,9 @@ struct draw_stage *draw_stipple_stage( struct draw_context *draw )
 
    stipple->stage.draw = draw;
    stipple->stage.next = NULL;
-   stipple->stage.point = draw_pipe_passthrough_point;
+   stipple->stage.point = stipple_reset_point;
    stipple->stage.line = stipple_first_line;
-   stipple->stage.tri = draw_pipe_passthrough_tri;
+   stipple->stage.tri = stipple_reset_tri;
    stipple->stage.reset_stipple_counter = reset_stipple_counter;
    stipple->stage.flush = stipple_flush;
    stipple->stage.destroy = stipple_destroy;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_twoside.c b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
index 50872fdbe93..3ac825f5656 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_twoside.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_twoside.c
@@ -105,7 +105,7 @@ static void twoside_first_tri( struct draw_stage *stage,
 			       struct prim_header *header )
 {
    struct twoside_stage *twoside = twoside_stage(stage);
-   const struct draw_vertex_shader *vs = stage->draw->vertex_shader;
+   const struct draw_vertex_shader *vs = stage->draw->vs.vertex_shader;
    uint i;
 
    twoside->attrib_front0 = 0;
diff --git a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
index ed08573382d..df92e3f2d06 100644
--- a/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
+++ b/src/gallium/auxiliary/draw/draw_pipe_wide_point.c
@@ -197,7 +197,7 @@ static void widepoint_first_point( struct draw_stage *stage,
 
    if (draw->rasterizer->point_sprite) {
       /* find vertex shader texcoord outputs */
-      const struct draw_vertex_shader *vs = draw->vertex_shader;
+      const struct draw_vertex_shader *vs = draw->vs.vertex_shader;
       uint i, j = 0;
       for (i = 0; i < vs->info.num_outputs; i++) {
          if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_GENERIC) {
@@ -212,7 +212,7 @@ static void widepoint_first_point( struct draw_stage *stage,
    wide->psize_slot = -1;
    if (draw->rasterizer->point_size_per_vertex) {
       /* find PSIZ vertex output */
-      const struct draw_vertex_shader *vs = draw->vertex_shader;
+      const struct draw_vertex_shader *vs = draw->vs.vertex_shader;
       uint i;
       for (i = 0; i < vs->info.num_outputs; i++) {
          if (vs->info.output_semantic_name[i] == TGSI_SEMANTIC_PSIZE) {
diff --git a/src/gallium/auxiliary/draw/draw_private.h b/src/gallium/auxiliary/draw/draw_private.h
index cee58bbf73e..c095bf3d7b9 100644
--- a/src/gallium/auxiliary/draw/draw_private.h
+++ b/src/gallium/auxiliary/draw/draw_private.h
@@ -124,6 +124,7 @@ struct draw_context
    struct {
       struct {
          struct draw_pt_middle_end *fetch_emit;
+         struct draw_pt_middle_end *fetch_shade_emit;
          struct draw_pt_middle_end *general;
       } middle;
 
@@ -154,6 +155,7 @@ struct draw_context
          const void *constants;
       } user;
 
+      boolean test_fse;
    } pt;
 
    struct {
@@ -167,13 +169,26 @@ struct draw_context
    /* pipe state that we need: */
    const struct pipe_rasterizer_state *rasterizer;
    struct pipe_viewport_state viewport;
+   boolean identity_viewport;
 
-   struct draw_vertex_shader *vertex_shader;
+   struct {
+      struct draw_vertex_shader *vertex_shader;
+      uint num_vs_outputs;  /**< convenience, from vertex_shader */
 
-   boolean identity_viewport;
 
-   uint num_vs_outputs;  /**< convenience, from vertex_shader */
+      /** TGSI program interpreter runtime state */
+      struct tgsi_exec_machine machine;
+
+      /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private.
+       */
+      struct gallivm_cpu_engine *engine;   
+
 
+      struct translate *fetch;
+      struct translate_cache *fetch_cache;
+      struct translate *emit;
+      struct translate_cache *emit_cache;
+   } vs;
 
    /* Clip derived state:
     */
@@ -190,16 +205,15 @@ struct draw_context
 
    unsigned reduced_prim;
 
-   /** TGSI program interpreter runtime state */
-   struct tgsi_exec_machine machine;
-
-   /* This (and the tgsi_exec_machine struct) probably need to be moved somewhere private.
-    */
-   struct gallivm_cpu_engine *engine;   
    void *driver_private;
 };
 
 
+/*******************************************************************************
+ * Vertex shader code:
+ */
+boolean draw_vs_init( struct draw_context *draw );
+void draw_vs_destroy( struct draw_context *draw );
 
 
 
@@ -247,6 +261,12 @@ void draw_pipeline_run( struct draw_context *draw,
                         const ushort *elts,
                         unsigned count );
 
+void draw_pipeline_run_linear( struct draw_context *draw,
+                               unsigned prim,
+                               struct vertex_header *vertices,
+                               unsigned count,
+                               unsigned stride );
+
 
 
 void draw_pipeline_flush( struct draw_context *draw, 
diff --git a/src/gallium/auxiliary/draw/draw_pt.c b/src/gallium/auxiliary/draw/draw_pt.c
index c9c5d183135..75f44d503eb 100644
--- a/src/gallium/auxiliary/draw/draw_pt.c
+++ b/src/gallium/auxiliary/draw/draw_pt.c
@@ -64,7 +64,7 @@ draw_pt_arrays(struct draw_context *draw,
       opt |= PT_PIPELINE;
    }
 
-   if (!draw->bypass_clipping) {
+   if (!draw->bypass_clipping && !draw->pt.test_fse) {
       opt |= PT_CLIPTEST;
    }
 
@@ -72,16 +72,18 @@ draw_pt_arrays(struct draw_context *draw,
       opt |= PT_SHADE;
    }
 
-   if (opt)
-      middle = draw->pt.middle.general;
-   else
+
+   if (opt == 0) 
       middle = draw->pt.middle.fetch_emit;
+   else if (opt == PT_SHADE && draw->pt.test_fse)
+      middle = draw->pt.middle.fetch_shade_emit;
+   else
+      middle = draw->pt.middle.general;
 
 
    /* Pick the right frontend
     */
-   if (draw->pt.user.elts ||
-       count >= 256) {
+   if (draw->pt.user.elts || (opt & PT_PIPELINE)) {
       frontend = draw->pt.front.vcache;
    } else {
       frontend = draw->pt.front.varray;
@@ -102,6 +104,8 @@ draw_pt_arrays(struct draw_context *draw,
 
 boolean draw_pt_init( struct draw_context *draw )
 {
+   draw->pt.test_fse = GETENV("DRAW_FSE") != NULL;
+
    draw->pt.front.vcache = draw_pt_vcache( draw );
    if (!draw->pt.front.vcache)
       return FALSE;
@@ -114,6 +118,13 @@ boolean draw_pt_init( struct draw_context *draw )
    if (!draw->pt.middle.fetch_emit)
       return FALSE;
 
+   if (draw->pt.test_fse) {
+      draw->pt.middle.fetch_shade_emit = draw_pt_middle_fse( draw );
+      if (!draw->pt.middle.fetch_shade_emit)
+         return FALSE;
+   }
+
+
    draw->pt.middle.general = draw_pt_fetch_pipeline_or_emit( draw );
    if (!draw->pt.middle.general)
       return FALSE;
@@ -134,6 +145,11 @@ void draw_pt_destroy( struct draw_context *draw )
       draw->pt.middle.fetch_emit = NULL;
    }
 
+   if (draw->pt.middle.fetch_shade_emit) {
+      draw->pt.middle.fetch_shade_emit->destroy( draw->pt.middle.fetch_shade_emit );
+      draw->pt.middle.fetch_shade_emit = NULL;
+   }
+
    if (draw->pt.front.vcache) {
       draw->pt.front.vcache->destroy( draw->pt.front.vcache );
       draw->pt.front.vcache = NULL;
@@ -147,19 +163,6 @@ void draw_pt_destroy( struct draw_context *draw )
 
 
 
-static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {
-   PIPE_PRIM_POINTS,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES
-};
-
 
 /**
  * Draw vertex arrays
@@ -172,9 +175,10 @@ void
 draw_arrays(struct draw_context *draw, unsigned prim,
             unsigned start, unsigned count)
 {
-   if (reduced_prim[prim] != draw->reduced_prim) {
+   unsigned reduced_prim = draw_pt_reduced_prim(prim);
+   if (reduced_prim != draw->reduced_prim) {
       draw_do_flush( draw, DRAW_FLUSH_STATE_CHANGE );
-      draw->reduced_prim = reduced_prim[prim];
+      draw->reduced_prim = reduced_prim;
    }
 
    /* drawing done here: */
diff --git a/src/gallium/auxiliary/draw/draw_pt.h b/src/gallium/auxiliary/draw/draw_pt.h
index 2dec376ceee..e03816ebbc7 100644
--- a/src/gallium/auxiliary/draw/draw_pt.h
+++ b/src/gallium/auxiliary/draw/draw_pt.h
@@ -92,6 +92,10 @@ struct draw_pt_middle_end {
                 const ushort *draw_elts,
                 unsigned draw_count );
 
+   void (*run_linear)(struct draw_pt_middle_end *,
+                      unsigned start,
+                      unsigned count);
+
    void (*finish)( struct draw_pt_middle_end * );
    void (*destroy)( struct draw_pt_middle_end * );
 };
@@ -117,6 +121,7 @@ const void *draw_pt_elt_ptr( struct draw_context *draw,
 struct draw_pt_front_end *draw_pt_vcache( struct draw_context *draw );
 struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw);
 
+
 /* Middle-ends:
  *
  * Currently one general-purpose case which can do all possibilities,
@@ -128,6 +133,7 @@ struct draw_pt_front_end *draw_pt_varray(struct draw_context *draw);
  * vertex_elements.
  */
 struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw );
+struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw );
 struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit(struct draw_context *draw);
 
 
@@ -152,6 +158,13 @@ void draw_pt_emit( struct pt_emit *emit,
 		   const ushort *elts,
 		   unsigned count );
 
+void draw_pt_emit_linear( struct pt_emit *emit,
+                          const float (*vertex_data)[4],
+                          unsigned vertex_count,
+                          unsigned stride,
+                          unsigned start,
+                          unsigned count );
+
 void draw_pt_emit_destroy( struct pt_emit *emit );
 
 struct pt_emit *draw_pt_emit_create( struct draw_context *draw );
@@ -170,6 +183,11 @@ void draw_pt_fetch_run( struct pt_fetch *fetch,
 			unsigned count,
 			char *verts );
 
+void draw_pt_fetch_run_linear( struct pt_fetch *fetch,
+                               unsigned start,
+                               unsigned count,
+                               char *verts );
+
 void draw_pt_fetch_destroy( struct pt_fetch *fetch );
 
 struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw );
@@ -194,4 +212,11 @@ struct pt_post_vs *draw_pt_post_vs_create( struct draw_context *draw );
 void draw_pt_post_vs_destroy( struct pt_post_vs *pvs );
 
 
+/*******************************************************************************
+ * Utils: 
+ */
+void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr);
+unsigned draw_pt_reduced_prim(unsigned prim);
+
+
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_pt_decompose.h b/src/gallium/auxiliary/draw/draw_pt_decompose.h
new file mode 100644
index 00000000000..dccfde99dde
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_decompose.h
@@ -0,0 +1,153 @@
+
+
+static void FUNC( ARGS,
+                  unsigned count )
+{
+   LOCAL_VARS;
+
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      for (i = 0; i < count; i ++) {
+	 POINT( (i + 0) );
+      }
+      break;
+
+   case PIPE_PRIM_LINES:
+      for (i = 0; i+1 < count; i += 2) {
+         LINE( DRAW_PIPE_RESET_STIPPLE,
+               (i + 0),
+               (i + 1));
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      if (count >= 2) {
+         flags = DRAW_PIPE_RESET_STIPPLE;
+
+         for (i = 1; i < count; i++, flags = 0) {
+            LINE( flags,
+                  (i - 1),
+                  (i ));
+         }
+
+	 LINE( flags,
+               (i - 1),
+               (0 ));
+      }
+      break;
+
+   case PIPE_PRIM_LINE_STRIP:
+      flags = DRAW_PIPE_RESET_STIPPLE;
+      for (i = 1; i < count; i++, flags = 0) {
+         LINE( flags,
+               (i - 1),
+               (i ));
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLES:
+      for (i = 0; i+2 < count; i += 3) {
+         TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                   (i + 0),
+                   (i + 1),
+                   (i + 2 ));
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_STRIP:
+      if (flatfirst) {
+         for (i = 0; i+2 < count; i++) {
+            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                      (i + 0),
+                      (i + 1 + (i&1)),
+                      (i + 2 - (i&1)));
+         }
+      }
+      else {
+         for (i = 0; i+2 < count; i++) {
+            TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                      (i + 0 + (i&1)),
+                      (i + 1 - (i&1)),
+                      (i + 2 ));
+         }
+      }
+      break;
+
+   case PIPE_PRIM_TRIANGLE_FAN:
+      if (count >= 3) {
+         if (flatfirst) {
+            for (i = 0; i+2 < count; i++) {
+               TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                         (i + 1),
+                         (i + 2),
+                         (0 ));
+            }
+         }
+         else {
+            for (i = 0; i+2 < count; i++) {
+               TRIANGLE( DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                         (0),
+                         (i + 1),
+                         (i + 2 ));
+            }
+         }
+      }
+      break;
+
+
+   case PIPE_PRIM_QUADS:
+      for (i = 0; i+3 < count; i += 4) {
+         QUAD( (i + 0),
+               (i + 1),
+               (i + 2),
+               (i + 3));
+      }
+      break;
+
+   case PIPE_PRIM_QUAD_STRIP:
+      for (i = 0; i+3 < count; i += 2) {
+         QUAD( (i + 2),
+               (i + 0),
+               (i + 1),
+               (i + 3));
+      }
+      break;
+
+   case PIPE_PRIM_POLYGON:
+      {
+         /* These bitflags look a little odd because we submit the
+          * vertices as (1,2,0) to satisfy flatshade requirements.
+          */
+         const unsigned edge_first  = DRAW_PIPE_EDGE_FLAG_2;
+         const unsigned edge_middle = DRAW_PIPE_EDGE_FLAG_0;
+         const unsigned edge_last   = DRAW_PIPE_EDGE_FLAG_1;
+
+         flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
+
+	 for (i = 0; i+2 < count; i++, flags = edge_middle) {
+
+            if (i + 3 == count)
+               flags |= edge_last;
+
+	    TRIANGLE( flags,
+                      (i + 1),
+                      (i + 2),
+                      (0));
+	 }
+      }
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   FLUSH;
+}
+
+
+#undef TRIANGLE
+#undef QUAD
+#undef POINT
+#undef LINE
+#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_emit.c b/src/gallium/auxiliary/draw/draw_pt_emit.c
index ce3a153f647..cf87cde9965 100644
--- a/src/gallium/auxiliary/draw/draw_pt_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_emit.c
@@ -40,6 +40,9 @@ struct pt_emit {
    struct translate *translate;
 
    struct translate_cache *cache;
+   unsigned prim;
+
+   const struct vertex_info *vinfo;
 };
 
 void draw_pt_emit_prepare( struct pt_emit *emit,
@@ -51,8 +54,18 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
    struct translate_key hw_key;
    unsigned i;
    boolean ok;
+   
+   /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
+    */
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+
 
-   ok = draw->render->set_primitive(draw->render, prim);
+   /* XXX: may need to defensively reset this later on as clipping can
+    * clobber this state in the render backend.
+    */
+   emit->prim = prim;
+
+   ok = draw->render->set_primitive(draw->render, emit->prim);
    if (!ok) {
       assert(0);
       return;
@@ -60,7 +73,7 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
 
    /* Must do this after set_primitive() above:
     */
-   vinfo = draw->render->get_vertex_info(draw->render);
+   emit->vinfo = vinfo = draw->render->get_vertex_info(draw->render);
 
 
    /* Translate from pipeline vertices to hw vertices.
@@ -100,6 +113,7 @@ void draw_pt_emit_prepare( struct pt_emit *emit,
       case EMIT_4UB:
 	 output_format = PIPE_FORMAT_B8G8R8A8_UNORM;
 	 emit_sz = 4 * sizeof(ubyte);
+         break;
       default:
 	 assert(0);
 	 output_format = PIPE_FORMAT_NONE;
@@ -144,6 +158,14 @@ void draw_pt_emit( struct pt_emit *emit,
     */
    draw_do_flush( draw, DRAW_FLUSH_BACKEND );
 
+   /* XXX: and work out some way to coordinate the render primitive
+    * between vbuf.c and here...
+    */
+   if (!draw->render->set_primitive(draw->render, emit->prim)) {
+      assert(0);
+      return;
+   }
+
    hw_verts = render->allocate_vertices(render,
 					(ushort)translate->key.output_stride,
 					(ushort)vertex_count);
@@ -178,6 +200,72 @@ void draw_pt_emit( struct pt_emit *emit,
 }
 
 
+void draw_pt_emit_linear(struct pt_emit *emit,
+                         const float (*vertex_data)[4],
+                         unsigned vertex_count,
+                         unsigned stride,
+                         unsigned start,
+                         unsigned count)
+{
+   struct draw_context *draw = emit->draw;
+   struct translate *translate = emit->translate;
+   struct vbuf_render *render = draw->render;
+   void *hw_verts;
+
+#if 0
+   debug_printf("Linear emit\n");
+#endif
+   /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
+    */
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+
+   /* XXX: and work out some way to coordinate the render primitive
+    * between vbuf.c and here...
+    */
+   if (!draw->render->set_primitive(draw->render, emit->prim)) {
+      assert(0);
+      return;
+   }
+
+   hw_verts = render->allocate_vertices(render,
+					(ushort)translate->key.output_stride,
+					(ushort)count);
+   if (!hw_verts) {
+      assert(0);
+      return;
+   }
+
+   translate->set_buffer(translate, 0,
+			 vertex_data, stride);
+
+   translate->set_buffer(translate, 1,
+			 &draw->rasterizer->point_size,
+			 0);
+
+   translate->run(translate,
+                  0,
+                  vertex_count,
+                  hw_verts);
+
+   if (0) {
+      unsigned i;
+      for (i = 0; i < vertex_count; i++) {
+         debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i);
+         draw_dump_emitted_vertex( emit->vinfo, 
+                                   (const uint8_t *)hw_verts + 
+                                   translate->key.output_stride * i );
+      }
+   }
+
+
+   render->draw_arrays(render, start, count);
+
+   render->release_vertices(render,
+			    hw_verts,
+			    translate->key.output_stride,
+			    vertex_count);
+}
+
 struct pt_emit *draw_pt_emit_create( struct draw_context *draw )
 {
    struct pt_emit *emit = CALLOC_STRUCT(pt_emit);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch.c b/src/gallium/auxiliary/draw/draw_pt_fetch.c
index b96335b7895..07f4c991642 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch.c
@@ -166,6 +166,42 @@ void draw_pt_fetch_run( struct pt_fetch *fetch,
 }
 
 
+void draw_pt_fetch_run_linear( struct pt_fetch *fetch,
+                               unsigned start,
+                               unsigned count,
+                               char *verts )
+{
+   struct draw_context *draw = fetch->draw;
+   struct translate *translate = fetch->translate;
+   unsigned i;
+
+   for (i = 0; i < draw->pt.nr_vertex_buffers; i++) {
+      translate->set_buffer(translate,
+			    i,
+			    ((char *)draw->pt.user.vbuffer[i] +
+			     draw->pt.vertex_buffer[i].buffer_offset),
+			    draw->pt.vertex_buffer[i].pitch );
+   }
+
+   translate->run( translate,
+                   start,
+                   count,
+                   verts );
+
+   /* Edgeflags are hard to fit into a translate program, populate
+    * them separately if required.  In the setup above they are
+    * defaulted to one, so only need this if there is reason to change
+    * that default:
+    */
+   if (fetch->need_edgeflags) {
+      for (i = 0; i < count; i++) {
+         struct vertex_header *vh = (struct vertex_header *)(verts + i * fetch->vertex_size);
+         vh->edgeflag = draw_pt_get_edgeflag( draw, start + i );
+      }
+   }
+}
+
+
 struct pt_fetch *draw_pt_fetch_create( struct draw_context *draw )
 {
    struct pt_fetch *fetch = CALLOC_STRUCT(pt_fetch);
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
index 4ea7d4359ff..a1d041a74f5 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_emit.c
@@ -258,6 +258,59 @@ static void fetch_emit_run( struct draw_pt_middle_end *middle,
 }
 
 
+static void fetch_emit_run_linear( struct draw_pt_middle_end *middle,
+                                   unsigned start,
+                                   unsigned count )
+{
+   struct fetch_emit_middle_end *feme = (struct fetch_emit_middle_end *)middle;
+   struct draw_context *draw = feme->draw;
+   void *hw_verts;
+
+   /* XXX: need to flush to get prim_vbuf.c to release its allocation??
+    */
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+
+   hw_verts = draw->render->allocate_vertices( draw->render,
+                                               (ushort)feme->translate->key.output_stride,
+                                               (ushort)count );
+   if (!hw_verts) {
+      assert(0);
+      return;
+   }
+
+   /* Single routine to fetch vertices and emit HW verts.
+    */
+   feme->translate->run( feme->translate,
+                         start,
+                         count,
+                         hw_verts );
+
+   if (0) {
+      unsigned i;
+      for (i = 0; i < count; i++) {
+         debug_printf("\n\nvertex %d:\n", i);
+         draw_dump_emitted_vertex( feme->vinfo,
+                                   (const uint8_t *)hw_verts + feme->vinfo->size * 4 * i );
+      }
+   }
+
+   /* XXX: Draw arrays path to avoid re-emitting index list again and
+    * again.
+    */
+   draw->render->draw_arrays( draw->render,
+                              0, /*start*/
+                              count );
+
+   /* Done -- that was easy, wasn't it:
+    */
+   draw->render->release_vertices( draw->render,
+                                   hw_verts,
+                                   feme->translate->key.output_stride,
+                                   count );
+
+}
+
+
 
 static void fetch_emit_finish( struct draw_pt_middle_end *middle )
 {
@@ -287,10 +340,11 @@ struct draw_pt_middle_end *draw_pt_fetch_emit( struct draw_context *draw )
       return NULL;
    }
 
-   fetch_emit->base.prepare = fetch_emit_prepare;
-   fetch_emit->base.run     = fetch_emit_run;
-   fetch_emit->base.finish  = fetch_emit_finish;
-   fetch_emit->base.destroy = fetch_emit_destroy;
+   fetch_emit->base.prepare    = fetch_emit_prepare;
+   fetch_emit->base.run        = fetch_emit_run;
+   fetch_emit->base.run_linear = fetch_emit_run_linear;
+   fetch_emit->base.finish     = fetch_emit_finish;
+   fetch_emit->base.destroy    = fetch_emit_destroy;
 
    fetch_emit->draw = draw;
      
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
new file mode 100644
index 00000000000..729c7db9999
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_emit.c
@@ -0,0 +1,344 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_pt.h"
+#include "draw/draw_vs.h"
+
+#include "translate/translate.h"
+
+struct fetch_shade_emit;
+
+
+/* Prototype fetch, shade, emit-hw-verts all in one go.
+ */
+struct fetch_shade_emit {
+   struct draw_pt_middle_end base;
+   struct draw_context *draw;
+
+
+   /* Temporaries:
+    */
+   const float *constants;
+   unsigned pitch[PIPE_MAX_ATTRIBS];
+   const ubyte *src[PIPE_MAX_ATTRIBS];
+   unsigned prim;
+
+   struct draw_vs_varient_key key;
+   struct draw_vs_varient *active;
+
+
+   const struct vertex_info *vinfo;
+};
+
+
+
+			       
+static void fse_prepare( struct draw_pt_middle_end *middle,
+                         unsigned prim, 
+                         unsigned opt )
+{
+   struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
+   struct draw_context *draw = fse->draw;
+   unsigned num_vs_inputs = draw->vs.vertex_shader->info.num_inputs;
+   const struct vertex_info *vinfo;
+   unsigned i;
+   
+
+   if (!draw->render->set_primitive( draw->render, 
+                                     prim )) {
+      assert(0);
+      return;
+   }
+
+   /* Must do this after set_primitive() above:
+    */
+   fse->vinfo = vinfo = draw->render->get_vertex_info(draw->render);
+   
+
+
+   fse->key.output_stride = vinfo->size * 4;
+   fse->key.nr_outputs = vinfo->num_attribs;
+   fse->key.nr_inputs = num_vs_inputs;
+
+   fse->key.nr_elements = MAX2(fse->key.nr_outputs,     /* outputs - translate to hw format */
+                               fse->key.nr_inputs);     /* inputs - fetch from api format */
+
+   fse->key.viewport = !draw->identity_viewport;
+   fse->key.clip = !draw->bypass_clipping;
+   fse->key.pad = 0;
+
+   memset(fse->key.element, 0, 
+          fse->key.nr_elements * sizeof(fse->key.element[0]));
+
+   for (i = 0; i < num_vs_inputs; i++) {
+      const struct pipe_vertex_element *src = &draw->pt.vertex_element[i];
+      fse->key.element[i].in.format = src->src_format;
+
+      /* Consider ignoring these, ie make generated programs
+       * independent of this state:
+       */
+      fse->key.element[i].in.buffer = src->vertex_buffer_index;
+      fse->key.element[i].in.offset = src->src_offset;
+   }
+   
+
+   {
+      unsigned dst_offset = 0;
+
+      for (i = 0; i < vinfo->num_attribs; i++) {
+         unsigned emit_sz = 0;
+
+         switch (vinfo->emit[i]) {
+         case EMIT_4F:
+            emit_sz = 4 * sizeof(float);
+            break;
+         case EMIT_3F:
+            emit_sz = 3 * sizeof(float);
+            break;
+         case EMIT_2F:
+            emit_sz = 2 * sizeof(float);
+            break;
+         case EMIT_1F:
+            emit_sz = 1 * sizeof(float);
+            break;
+         case EMIT_1F_PSIZE:
+            emit_sz = 1 * sizeof(float);
+            break;
+         case EMIT_4UB:
+            emit_sz = 4 * sizeof(ubyte);
+            break;
+         default:
+            assert(0);
+            break;
+         }
+
+         /* The elements in the key correspond to vertex shader output
+          * numbers, not to positions in the hw vertex description --
+          * that's handled by the output_offset field.
+          */
+         fse->key.element[i].out.format = vinfo->emit[i];
+         fse->key.element[i].out.vs_output = vinfo->src_index[i];
+         fse->key.element[i].out.offset = dst_offset;
+      
+         dst_offset += emit_sz;
+         assert(fse->key.output_stride >= dst_offset);
+      }
+   }
+
+
+   /* Would normally look up a vertex shader and peruse its list of
+    * varients somehow.  We omitted that step and put all the
+    * hardcoded "shaders" into an array.  We're just making the
+    * assumption that this happens to be a matching shader...  ie
+    * you're running isosurf, aren't you?
+    */
+   fse->active = draw_vs_lookup_varient( draw->vs.vertex_shader, 
+                                         &fse->key );
+
+   if (!fse->active) {
+      assert(0);
+      return ;
+   }
+
+   /* Now set buffer pointers:
+    */
+   for (i = 0; i < num_vs_inputs; i++) {
+      unsigned buf = draw->pt.vertex_element[i].vertex_buffer_index;
+
+      fse->active->set_input( fse->active, 
+                              i, 
+                              
+                              ((const ubyte *) draw->pt.user.vbuffer[buf] + 
+                               draw->pt.vertex_buffer[buf].buffer_offset),
+                              
+                              draw->pt.vertex_buffer[buf].pitch );
+   }
+
+   fse->active->set_constants( fse->active,
+                               (const float (*)[4])draw->pt.user.constants );
+
+   fse->active->set_viewport( fse->active,
+                              &draw->viewport );
+
+   //return TRUE;
+}
+
+
+
+
+
+
+
+static void fse_run_linear( struct draw_pt_middle_end *middle, 
+                            unsigned start, 
+                            unsigned count )
+{
+   struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
+   struct draw_context *draw = fse->draw;
+   unsigned alloc_count = align(count, 4);
+   char *hw_verts;
+
+   /* XXX: need to flush to get prim_vbuf.c to release its allocation??
+    */
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+
+   hw_verts = draw->render->allocate_vertices( draw->render,
+                                               (ushort)fse->key.output_stride,
+                                               (ushort)alloc_count );
+
+   if (!hw_verts) {
+      assert(0);
+      return;
+   }
+
+   /* Single routine to fetch vertices, run shader and emit HW verts.
+    * Clipping is done elsewhere -- either by the API or on hardware,
+    * or for some other reason not required...
+    */
+   fse->active->run_linear( fse->active, 
+                            start, count,
+                            hw_verts );
+
+   /* Draw arrays path to avoid re-emitting index list again and
+    * again.
+    */
+   draw->render->draw_arrays( draw->render,
+                              0,
+                              count );
+   
+   if (0) {
+      unsigned i;
+      for (i = 0; i < count; i++) {
+         debug_printf("\n\n%s vertex %d: (stride %d, offset %d)\n", __FUNCTION__, i,
+                      fse->key.output_stride,
+                      fse->key.output_stride * i);
+
+         draw_dump_emitted_vertex( fse->vinfo, 
+                                   (const uint8_t *)hw_verts + fse->key.output_stride * i );
+      }
+   }
+
+
+   draw->render->release_vertices( draw->render, 
+				   hw_verts, 
+				   fse->key.output_stride, 
+				   count );
+}
+
+
+static void
+fse_run(struct draw_pt_middle_end *middle,
+        const unsigned *fetch_elts,
+        unsigned fetch_count,
+        const ushort *draw_elts,
+        unsigned draw_count )
+{
+   struct fetch_shade_emit *fse = (struct fetch_shade_emit *)middle;
+   struct draw_context *draw = fse->draw;
+   unsigned alloc_count = align(fetch_count, 4);
+   void *hw_verts;
+   
+   /* XXX: need to flush to get prim_vbuf.c to release its allocation?? 
+    */
+   draw_do_flush( draw, DRAW_FLUSH_BACKEND );
+
+   hw_verts = draw->render->allocate_vertices( draw->render,
+                                               (ushort)fse->key.output_stride,
+                                               (ushort)alloc_count );
+   if (!hw_verts) {
+      assert(0);
+      return;
+   }
+         
+					
+   /* Single routine to fetch vertices, run shader and emit HW verts.
+    */
+   fse->active->run_elts( fse->active, 
+                          fetch_elts,
+                          fetch_count,
+                          hw_verts );
+
+   draw->render->draw( draw->render, 
+                       draw_elts, 
+                       draw_count );
+
+   if (0) {
+      unsigned i;
+      for (i = 0; i < fetch_count; i++) {
+         debug_printf("\n\n%s vertex %d:\n", __FUNCTION__, i);
+         draw_dump_emitted_vertex( fse->vinfo, 
+                                   (const uint8_t *)hw_verts + 
+                                   fse->key.output_stride * i );
+      }
+   }
+
+
+   draw->render->release_vertices( draw->render, 
+                                   hw_verts, 
+                                   fse->key.output_stride, 
+                                   fetch_count );
+
+}
+
+
+static void fse_finish( struct draw_pt_middle_end *middle )
+{
+}
+
+
+static void
+fse_destroy( struct draw_pt_middle_end *middle ) 
+{
+   FREE(middle);
+}
+
+struct draw_pt_middle_end *draw_pt_middle_fse( struct draw_context *draw )
+{
+   struct fetch_shade_emit *fse = CALLOC_STRUCT(fetch_shade_emit);
+   if (!fse)
+      return NULL;
+
+   fse->base.prepare = fse_prepare;
+   fse->base.run = fse_run;
+   fse->base.run_linear = fse_run_linear;
+   fse->base.finish = fse_finish;
+   fse->base.destroy = fse_destroy;
+   fse->draw = draw;
+
+   return &fse->base;
+}
diff --git a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
index 4ec20493c4c..06718779a5b 100644
--- a/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
+++ b/src/gallium/auxiliary/draw/draw_pt_fetch_shade_pipeline.c
@@ -55,7 +55,7 @@ static void fetch_pipeline_prepare( struct draw_pt_middle_end *middle,
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *vs = draw->vertex_shader;
+   struct draw_vertex_shader *vs = draw->vs.vertex_shader;
 
    /* Add one to num_outputs because the pipeline occasionally tags on
     * an additional texcoord, eg for AA lines.
@@ -107,7 +107,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
 {
    struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
    struct draw_context *draw = fpme->draw;
-   struct draw_vertex_shader *shader = draw->vertex_shader;
+   struct draw_vertex_shader *shader = draw->vs.vertex_shader;
    unsigned opt = fpme->opt;
    unsigned alloc_count = align_int( fetch_count, 4 );
 
@@ -162,7 +162,7 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
                          fpme->vertex_size,
                          draw_elts,
                          draw_count );
-   } 
+   }
    else {
       draw_pt_emit( fpme->emit,
 		    (const float (*)[4])pipeline_verts->data,
@@ -177,6 +177,79 @@ static void fetch_pipeline_run( struct draw_pt_middle_end *middle,
 }
 
 
+static void fetch_pipeline_linear_run( struct draw_pt_middle_end *middle,
+                                       unsigned start,
+                                       unsigned count)
+{
+   struct fetch_pipeline_middle_end *fpme = (struct fetch_pipeline_middle_end *)middle;
+   struct draw_context *draw = fpme->draw;
+   struct draw_vertex_shader *shader = draw->vs.vertex_shader;
+   unsigned opt = fpme->opt;
+   unsigned alloc_count = align_int( count, 4 );
+
+   struct vertex_header *pipeline_verts =
+      (struct vertex_header *)MALLOC(fpme->vertex_size * alloc_count);
+
+   if (!pipeline_verts) {
+      /* Not much we can do here - just skip the rendering.
+       */
+      assert(0);
+      return;
+   }
+
+   /* Fetch into our vertex buffer
+    */
+   draw_pt_fetch_run_linear( fpme->fetch,
+                             start,
+                             count,
+                             (char *)pipeline_verts );
+
+   /* Run the shader, note that this overwrites the data[] parts of
+    * the pipeline verts.  If there is no shader, ie a bypass shader,
+    * then the inputs == outputs, and are already in the correct
+    * place.
+    */
+   if (opt & PT_SHADE)
+   {
+      shader->run_linear(shader,
+			 (const float (*)[4])pipeline_verts->data,
+			 (      float (*)[4])pipeline_verts->data,
+			 (const float (*)[4])draw->pt.user.constants,
+			 count,
+			 fpme->vertex_size,
+			 fpme->vertex_size);
+   }
+
+   if (draw_pt_post_vs_run( fpme->post_vs,
+			    pipeline_verts,
+			    count,
+			    fpme->vertex_size ))
+   {
+      opt |= PT_PIPELINE;
+   }
+
+   /* Do we need to run the pipeline?
+    */
+   if (opt & PT_PIPELINE) {
+      draw_pipeline_run_linear( fpme->draw,
+                                fpme->prim,
+                                pipeline_verts,
+                                count,
+                                fpme->vertex_size);
+   }
+   else {
+      draw_pt_emit_linear( fpme->emit,
+                           (const float (*)[4])pipeline_verts->data,
+                           count,
+                           fpme->vertex_size,
+                           0, /*start*/
+                           count );
+   }
+
+   FREE(pipeline_verts);
+}
+
+
 
 static void fetch_pipeline_finish( struct draw_pt_middle_end *middle )
 {
@@ -206,10 +279,11 @@ struct draw_pt_middle_end *draw_pt_fetch_pipeline_or_emit( struct draw_context *
    if (!fpme)
       goto fail;
 
-   fpme->base.prepare = fetch_pipeline_prepare;
-   fpme->base.run     = fetch_pipeline_run;
-   fpme->base.finish  = fetch_pipeline_finish;
-   fpme->base.destroy = fetch_pipeline_destroy;
+   fpme->base.prepare        = fetch_pipeline_prepare;
+   fpme->base.run            = fetch_pipeline_run;
+   fpme->base.run_linear     = fetch_pipeline_linear_run;
+   fpme->base.finish         = fetch_pipeline_finish;
+   fpme->base.destroy        = fetch_pipeline_destroy;
 
    fpme->draw = draw;
 
diff --git a/src/gallium/auxiliary/draw/draw_pt_util.c b/src/gallium/auxiliary/draw/draw_pt_util.c
new file mode 100644
index 00000000000..32c8a9632cb
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_util.c
@@ -0,0 +1,103 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_pt.h"
+
+void draw_pt_split_prim(unsigned prim, unsigned *first, unsigned *incr)
+{
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      *first = 1;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_LINES:
+      *first = 2;
+      *incr = 2;
+      break;
+   case PIPE_PRIM_LINE_STRIP:
+   case PIPE_PRIM_LINE_LOOP:
+      *first = 2;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_TRIANGLES:
+      *first = 3;
+      *incr = 3;
+      break;
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_TRIANGLE_FAN:
+   case PIPE_PRIM_POLYGON:
+      *first = 3;
+      *incr = 1;
+      break;
+   case PIPE_PRIM_QUADS:
+      *first = 4;
+      *incr = 4;
+      break;
+   case PIPE_PRIM_QUAD_STRIP:
+      *first = 4;
+      *incr = 2;
+      break;
+   default:
+      assert(0);
+      *first = 0;
+      *incr = 1;		/* set to one so that count % incr works */
+      break;
+   }
+}
+
+
+unsigned draw_pt_reduced_prim(unsigned prim)
+{
+   switch (prim) {
+   case PIPE_PRIM_POINTS:
+      return PIPE_PRIM_POINTS;
+   case PIPE_PRIM_LINES:
+   case PIPE_PRIM_LINE_STRIP:
+   case PIPE_PRIM_LINE_LOOP:
+      return PIPE_PRIM_LINES;
+   case PIPE_PRIM_TRIANGLES:
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_TRIANGLE_FAN:
+   case PIPE_PRIM_POLYGON:
+   case PIPE_PRIM_QUADS:
+   case PIPE_PRIM_QUAD_STRIP:
+      return PIPE_PRIM_TRIANGLES;
+   default:
+      assert(0);
+      return PIPE_PRIM_POINTS;
+   }
+}
+
+
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray.c b/src/gallium/auxiliary/draw/draw_pt_varray.c
index 355093f945e..260f28f2845 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray.c
+++ b/src/gallium/auxiliary/draw/draw_pt_varray.c
@@ -43,6 +43,8 @@ struct varray_frontend {
    unsigned draw_count;
    unsigned fetch_count;
 
+   unsigned fetch_start;
+
    struct draw_pt_middle_end *middle;
 
    unsigned input_prim;
@@ -56,6 +58,11 @@ static void varray_flush(struct varray_frontend *varray)
       debug_printf("FLUSH fc = %d, dc = %d\n",
                    varray->fetch_count,
                    varray->draw_count);
+      debug_printf("\telt0 = %d, eltx = %d, draw0 = %d, drawx = %d\n",
+                   varray->fetch_elts[0],
+                   varray->fetch_elts[varray->fetch_count-1],
+                   varray->draw_elts[0],
+                   varray->draw_elts[varray->draw_count-1]);
 #endif
       varray->middle->run(varray->middle,
                           varray->fetch_elts,
@@ -68,20 +75,43 @@ static void varray_flush(struct varray_frontend *varray)
    varray->draw_count = 0;
 }
 
-#if 0
-static void varray_check_flush(struct varray_frontend *varray)
+static void varray_flush_linear(struct varray_frontend *varray,
+                                unsigned start, unsigned count)
 {
-   if (varray->draw_count + 6 >= DRAW_MAX/* ||
-       varray->fetch_count + 4 >= FETCH_MAX*/) {
-      varray_flush(varray);
+   if (count) {
+#if 0
+      debug_printf("FLUSH LINEAR start = %d, count = %d\n",
+                   start,
+                   count);
+#endif
+      assert(varray->middle->run_linear);
+      varray->middle->run_linear(varray->middle, start, count);
    }
 }
+
+static INLINE void fetch_init(struct varray_frontend *varray,
+                              unsigned count)
+{
+   unsigned idx;
+#if 0
+      debug_printf("FETCH INIT c = %d, fs = %d\n",
+                   count,
+                   varray->fetch_start);
 #endif
+   for (idx = 0; idx < count; ++idx) {
+      varray->fetch_elts[idx] = varray->fetch_start + idx;
+   }
+   varray->fetch_start += idx;
+   varray->fetch_count = idx;
+}
+
+
+
 
 static INLINE void add_draw_el(struct varray_frontend *varray,
-                               int idx, ushort flags)
+                               int idx)
 {
-   varray->draw_elts[varray->draw_count++] = idx | flags;
+   varray->draw_elts[varray->draw_count++] = idx;
 }
 
 
@@ -90,106 +120,52 @@ static INLINE void varray_triangle( struct varray_frontend *varray,
                                     unsigned i1,
                                     unsigned i2 )
 {
-   add_draw_el(varray, i0, 0);
-   add_draw_el(varray, i1, 0);
-   add_draw_el(varray, i2, 0);
-}
-
-static INLINE void varray_triangle_flags( struct varray_frontend *varray,
-                                          ushort flags,
-                                          unsigned i0,
-                                          unsigned i1,
-                                          unsigned i2 )
-{
-   add_draw_el(varray, i0, flags);
-   add_draw_el(varray, i1, 0);
-   add_draw_el(varray, i2, 0);
+   add_draw_el(varray, i0);
+   add_draw_el(varray, i1);
+   add_draw_el(varray, i2);
 }
 
 static INLINE void varray_line( struct varray_frontend *varray,
                                 unsigned i0,
                                 unsigned i1 )
 {
-   add_draw_el(varray, i0, 0);
-   add_draw_el(varray, i1, 0);
-}
-
-
-static INLINE void varray_line_flags( struct varray_frontend *varray,
-                                      ushort flags,
-                                      unsigned i0,
-                                      unsigned i1 )
-{
-   add_draw_el(varray, i0, flags);
-   add_draw_el(varray, i1, 0);
+   add_draw_el(varray, i0);
+   add_draw_el(varray, i1);
 }
 
 
 static INLINE void varray_point( struct varray_frontend *varray,
                                  unsigned i0 )
 {
-   add_draw_el(varray, i0, 0);
-}
-
-static INLINE void varray_quad( struct varray_frontend *varray,
-                                unsigned i0,
-                                unsigned i1,
-                                unsigned i2,
-                                unsigned i3 )
-{
-   varray_triangle( varray, i0, i1, i3 );
-   varray_triangle( varray, i1, i2, i3 );
-}
-
-static INLINE void varray_ef_quad( struct varray_frontend *varray,
-                                   unsigned i0,
-                                   unsigned i1,
-                                   unsigned i2,
-                                   unsigned i3 )
-{
-   const unsigned omitEdge1 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_2;
-   const unsigned omitEdge2 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1;
-
-   varray_triangle_flags( varray,
-                          DRAW_PIPE_RESET_STIPPLE | omitEdge1,
-                          i0, i1, i3 );
-
-   varray_triangle_flags( varray,
-                          omitEdge2,
-                          i1, i2, i3 );
+   add_draw_el(varray, i0);
 }
 
-/* At least for now, we're back to using a template include file for
- * this.  The two paths aren't too different though - it may be
- * possible to reunify them.
- */
-#define TRIANGLE(vc,flags,i0,i1,i2) varray_triangle_flags(vc,flags,i0,i1,i2)
-#define QUAD(vc,i0,i1,i2,i3)        varray_ef_quad(vc,i0,i1,i2,i3)
-#define LINE(vc,flags,i0,i1)        varray_line_flags(vc,flags,i0,i1)
-#define POINT(vc,i0)                varray_point(vc,i0)
-#define FUNC varray_run_extras
-#include "draw_pt_varray_tmp.h"
 
-#define TRIANGLE(vc,flags,i0,i1,i2) varray_triangle(vc,i0,i1,i2)
-#define QUAD(vc,i0,i1,i2,i3)        varray_quad(vc,i0,i1,i2,i3)
-#define LINE(vc,flags,i0,i1)        varray_line(vc,i0,i1)
+#if 0
+#define TRIANGLE(flags,i0,i1,i2)       varray_triangle(varray,i0,i1,i2)
+#define LINE(flags,i0,i1)              varray_line(varray,i0,i1)
+#define POINT(i0)                      varray_point(varray,i0)
+#define FUNC varray_decompose
+#include "draw_pt_decompose.h"
+#else
+#define TRIANGLE(vc,i0,i1,i2)       varray_triangle(vc,i0,i1,i2)
+#define LINE(vc,i0,i1)              varray_line(vc,i0,i1)
 #define POINT(vc,i0)                varray_point(vc,i0)
 #define FUNC varray_run
-#include "draw_pt_varray_tmp.h"
-
-
+#include "draw_pt_varray_tmp_linear.h"
+#endif
 
-static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {
+static unsigned decompose_prim[PIPE_PRIM_POLYGON + 1] = {
    PIPE_PRIM_POINTS,
    PIPE_PRIM_LINES,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINES,
+   PIPE_PRIM_LINES,             /* decomposed LINELOOP */
+   PIPE_PRIM_LINE_STRIP,
    PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES
+   PIPE_PRIM_TRIANGLE_STRIP,
+   PIPE_PRIM_TRIANGLES,         /* decomposed TRI_FAN */
+   PIPE_PRIM_QUADS,
+   PIPE_PRIM_QUAD_STRIP,
+   PIPE_PRIM_TRIANGLES          /* decomposed POLYGON */
 };
 
 
@@ -201,17 +177,10 @@ static void varray_prepare(struct draw_pt_front_end *frontend,
 {
    struct varray_frontend *varray = (struct varray_frontend *)frontend;
 
-   if (opt & PT_PIPELINE)
-   {
-      varray->base.run = varray_run_extras;
-   } 
-   else 
-   {
-      varray->base.run = varray_run;
-   }
+   varray->base.run = varray_run;
 
    varray->input_prim = prim;
-   varray->output_prim = reduced_prim[prim];
+   varray->output_prim = decompose_prim[prim];
 
    varray->middle = middle;
    middle->prepare(middle, varray->output_prim, opt);
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
index b9a319b2535..6979f6b544f 100644
--- a/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
+++ b/src/gallium/auxiliary/draw/draw_pt_varray_tmp.h
@@ -10,32 +10,44 @@ static void FUNC(struct draw_pt_front_end *frontend,
 
    boolean flatfirst = (draw->rasterizer->flatshade &&
                         draw->rasterizer->flatshade_first);
-   unsigned i, flags;
+   unsigned i, j, flags;
+   unsigned first, incr;
+
+   varray->fetch_start = start;
+
+   draw_pt_split_prim(varray->input_prim, &first, &incr);
 
 #if 0
-   debug_printf("%s (%d) %d/%d\n", __FUNCTION__, draw->prim, start, count);
-#endif
-#if 0
-   debug_printf("INPUT PRIM = %d (start = %d, count = %d)\n", varray->input_prim,
+   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
+                varray->input_prim,
                 start, count);
 #endif
 
-   for (i = 0; i < count; ++i) {
-      varray->fetch_elts[i] = start + i;
-   }
-   varray->fetch_count = count;
-
    switch (varray->input_prim) {
    case PIPE_PRIM_POINTS:
-      for (i = 0; i < count; i ++) {
-         POINT(varray, i + 0);
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 0; i < end; i++) {
+            POINT(varray, i + 0);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
       }
       break;
 
    case PIPE_PRIM_LINES:
-      for (i = 0; i+1 < count; i += 2) {
-         LINE(varray, DRAW_PIPE_RESET_STIPPLE,
-              i + 0, i + 1);
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 0; i+1 < end; i += 2) {
+            LINE(varray, DRAW_PIPE_RESET_STIPPLE,
+                 i + 0, i + 1);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
       }
       break;
 
@@ -43,38 +55,81 @@ static void FUNC(struct draw_pt_front_end *frontend,
       if (count >= 2) {
          flags = DRAW_PIPE_RESET_STIPPLE;
 
-         for (i = 1; i < count; i++, flags = 0) {
-            LINE(varray, flags, i - 1, i);
+         for (j = 0; j + first <= count; j += i) {
+            unsigned end = MIN2(FETCH_MAX, count - j);
+            end -= (end % incr);
+            for (i = 1; i < end; i++, flags = 0) {
+               LINE(varray, flags, i - 1, i);
+            }
+            LINE(varray, flags, i - 1, 0);
+            i = end;
+            fetch_init(varray, end);
+            varray_flush(varray);
          }
-         LINE(varray, flags, i - 1, 0);
       }
       break;
 
    case PIPE_PRIM_LINE_STRIP:
       flags = DRAW_PIPE_RESET_STIPPLE;
-      for (i = 1; i < count; i++, flags = 0) {
-         LINE(varray, flags, i - 1, i);
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 1; i < end; i++, flags = 0) {
+            LINE(varray, flags, i - 1, i);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
       }
       break;
 
    case PIPE_PRIM_TRIANGLES:
-      for (i = 0; i+2 < count; i += 3) {
-         TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                  i + 0, i + 1, i + 2);
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 0; i+2 < end; i += 3) {
+            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                     i + 0, i + 1, i + 2);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
       }
       break;
 
    case PIPE_PRIM_TRIANGLE_STRIP:
       if (flatfirst) {
-         for (i = 0; i+2 < count; i++) {
-            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                     i + 0, i + 1 + (i&1), i + 2 - (i&1));
+         for (j = 0; j + first <= count; j += i) {
+            unsigned end = MIN2(FETCH_MAX, count - j);
+            end -= (end % incr);
+            for (i = 0; i+2 < end; i++) {
+               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                        i + 0, i + 1 + (i&1), i + 2 - (i&1));
+            }
+            i = end;
+            fetch_init(varray, end);
+            varray_flush(varray);
+            if (j + first + i <= count) {
+               varray->fetch_start -= 2;
+               i -= 2;
+            }
          }
       }
       else {
-         for (i = 0; i+2 < count; i++) {
-            TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
-                     i + 0 + (i&1), i + 1 - (i&1), i + 2);
+         for (j = 0; j + first <= count; j += i) {
+            unsigned end = MIN2(FETCH_MAX, count - j);
+            end -= (end  % incr);
+            for (i = 0; i + 2 < end; i++) {
+               TRIANGLE(varray, DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL,
+                        i + 0 + (i&1), i + 1 - (i&1), i + 2);
+            }
+            i = end;
+            fetch_init(varray, end);
+            varray_flush(varray);
+            if (j + first + i <= count) {
+               varray->fetch_start -= 2;
+               i -= 2;
+            }
          }
       }
       break;
@@ -83,51 +138,89 @@ static void FUNC(struct draw_pt_front_end *frontend,
       if (count >= 3) {
          if (flatfirst) {
             flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (i = 0; i+2 < count; i++) {
-               TRIANGLE(varray, flags, i + 1, i + 2, 0);
+            for (j = 0; j + first <= count; j += i) {
+               unsigned end = MIN2(FETCH_MAX, count - j);
+               end -= (end % incr);
+               for (i = 0; i+2 < end; i++) {
+                  TRIANGLE(varray, flags, i + 1, i + 2, 0);
+               }
+               i = end;
+               fetch_init(varray, end);
+               varray_flush(varray);
             }
          }
          else {
             flags = DRAW_PIPE_RESET_STIPPLE | DRAW_PIPE_EDGE_FLAG_ALL;
-            for (i = 0; i+2 < count; i++) {
-               TRIANGLE(varray, flags, 0, i + 1, i + 2);
+            for (j = 0; j + first <= count; j += i) {
+               unsigned end = MIN2(FETCH_MAX, count - j);
+               end -= (end % incr);
+               for (i = 0; i+2 < end; i++) {
+                  TRIANGLE(varray, flags, 0, i + 1, i + 2);
+               }
+               i = end;
+               fetch_init(varray, end);
+               varray_flush(varray);
             }
          }
       }
       break;
 
    case PIPE_PRIM_QUADS:
-      for (i = 0; i+3 < count; i += 4) {
-         QUAD(varray, i + 0, i + 1, i + 2, i + 3);
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 0; i+3 < end; i += 4) {
+            QUAD(varray, i + 0, i + 1, i + 2, i + 3);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
       }
       break;
 
    case PIPE_PRIM_QUAD_STRIP:
-      for (i = 0; i+3 < count; i += 2) {
-         QUAD(varray, i + 2, i + 0, i + 1, i + 3);
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 0; i+3 < end; i += 2) {
+            QUAD(varray, i + 2, i + 0, i + 1, i + 3);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
+         if (j + first + i <= count) {
+            varray->fetch_start -= 2;
+            i -= 2;
+         }
       }
       break;
 
    case PIPE_PRIM_POLYGON:
    {
-         /* These bitflags look a little odd because we submit the
-          * vertices as (1,2,0) to satisfy flatshade requirements.
-          */
-         const unsigned edge_first  = DRAW_PIPE_EDGE_FLAG_2;
-         const unsigned edge_middle = DRAW_PIPE_EDGE_FLAG_0;
-         const unsigned edge_last   = DRAW_PIPE_EDGE_FLAG_1;
-
-         flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
-
-	 for (i = 0; i+2 < count; i++, flags = edge_middle) {
+      /* These bitflags look a little odd because we submit the
+       * vertices as (1,2,0) to satisfy flatshade requirements.
+       */
+      const unsigned edge_first  = DRAW_PIPE_EDGE_FLAG_2;
+      const unsigned edge_middle = DRAW_PIPE_EDGE_FLAG_0;
+      const unsigned edge_last   = DRAW_PIPE_EDGE_FLAG_1;
+
+      flags = DRAW_PIPE_RESET_STIPPLE | edge_first | edge_middle;
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 0; i+2 < end; i++, flags = edge_middle) {
 
             if (i + 3 == count)
                flags |= edge_last;
 
-	    TRIANGLE(varray, flags, i + 1, i + 2, 0);
-	 }
+            TRIANGLE(varray, flags, i + 1, i + 2, 0);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
       }
-      break;
+   }
+   break;
 
    default:
       assert(0);
diff --git a/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
new file mode 100644
index 00000000000..114ed371a06
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_pt_varray_tmp_linear.h
@@ -0,0 +1,94 @@
+static unsigned trim( unsigned count, unsigned first, unsigned incr )
+{
+   return count - (count - first) % incr; 
+}
+
+static void FUNC(struct draw_pt_front_end *frontend,
+                 pt_elt_func get_elt,
+                 const void *elts,
+                 unsigned count)
+{
+   struct varray_frontend *varray = (struct varray_frontend *)frontend;
+   unsigned start = (unsigned)elts;
+
+   unsigned i, j;
+   unsigned first, incr;
+
+   varray->fetch_start = start;
+
+   draw_pt_split_prim(varray->input_prim, &first, &incr);
+   
+   /* Sanitize primitive length:
+    */
+   count = trim(count, first, incr); 
+   if (count < first)
+      return;
+
+#if 0
+   debug_printf("%s (%d) %d/%d\n", __FUNCTION__,
+                varray->input_prim,
+                start, count);
+#endif
+
+   switch (varray->input_prim) {
+   case PIPE_PRIM_POINTS:
+   case PIPE_PRIM_LINES:
+   case PIPE_PRIM_TRIANGLES:
+   case PIPE_PRIM_LINE_STRIP:
+   case PIPE_PRIM_TRIANGLE_STRIP:
+   case PIPE_PRIM_QUADS:
+   case PIPE_PRIM_QUAD_STRIP:
+      for (j = 0; j < count;) {
+         unsigned remaining = count - j;
+         unsigned nr = trim( MIN2(FETCH_MAX, remaining), first, incr );
+         varray_flush_linear(varray, start + j, nr);
+         j += nr;
+         if (nr != remaining) 
+            j -= (first - incr);
+      }
+      break;
+
+   case PIPE_PRIM_LINE_LOOP:
+      if (count >= 2) {
+         for (j = 0; j + first <= count; j += i) {
+            unsigned end = MIN2(FETCH_MAX, count - j);
+            end -= (end % incr);
+            for (i = 1; i < end; i++) {
+               LINE(varray, i - 1, i);
+            }
+            LINE(varray, i - 1, 0);
+            i = end;
+            fetch_init(varray, end);
+            varray_flush(varray);
+         }
+      }
+      break;
+
+
+   case PIPE_PRIM_POLYGON:
+   case PIPE_PRIM_TRIANGLE_FAN:
+      for (j = 0; j + first <= count; j += i) {
+         unsigned end = MIN2(FETCH_MAX, count - j);
+         end -= (end % incr);
+         for (i = 2; i < end; i++) {
+            TRIANGLE(varray, 0, i - 1, i);
+         }
+         i = end;
+         fetch_init(varray, end);
+         varray_flush(varray);
+      }
+      break;
+
+   default:
+      assert(0);
+      break;
+   }
+
+   varray_flush(varray);
+}
+
+#undef TRIANGLE
+#undef QUAD
+#undef POINT
+#undef LINE
+#undef FUNC
diff --git a/src/gallium/auxiliary/draw/draw_pt_vcache.c b/src/gallium/auxiliary/draw/draw_pt_vcache.c
index 6b3fb1406ba..96e02fbf3a9 100644
--- a/src/gallium/auxiliary/draw/draw_pt_vcache.c
+++ b/src/gallium/auxiliary/draw/draw_pt_vcache.c
@@ -171,15 +171,15 @@ static void vcache_ef_quad( struct vcache_frontend *vcache,
                             unsigned i2,
                             unsigned i3 )
 {
-   const unsigned omitEdge1 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_2;
-   const unsigned omitEdge2 = DRAW_PIPE_EDGE_FLAG_0 | DRAW_PIPE_EDGE_FLAG_1;
-
-   vcache_triangle_flags( vcache, 
-                          DRAW_PIPE_RESET_STIPPLE | omitEdge1, 
+   vcache_triangle_flags( vcache,
+                          ( DRAW_PIPE_RESET_STIPPLE |
+                            DRAW_PIPE_EDGE_FLAG_0 |
+                            DRAW_PIPE_EDGE_FLAG_2 ),
                           i0, i1, i3 );
 
-   vcache_triangle_flags( vcache, 
-                          omitEdge2, 
+   vcache_triangle_flags( vcache,
+                          ( DRAW_PIPE_EDGE_FLAG_0 |
+                            DRAW_PIPE_EDGE_FLAG_1 ),
                           i1, i2, i3 );
 }
 
@@ -204,19 +204,6 @@ static void vcache_ef_quad( struct vcache_frontend *vcache,
 
 
 
-static unsigned reduced_prim[PIPE_PRIM_POLYGON + 1] = {
-   PIPE_PRIM_POINTS,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_LINES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES,
-   PIPE_PRIM_TRIANGLES
-};
-
 
 
 static void vcache_prepare( struct draw_pt_front_end *frontend,
@@ -236,7 +223,7 @@ static void vcache_prepare( struct draw_pt_front_end *frontend,
    }
 
    vcache->input_prim = prim;
-   vcache->output_prim = reduced_prim[prim];
+   vcache->output_prim = draw_pt_reduced_prim(prim);
 
    vcache->middle = middle;
    middle->prepare( middle, vcache->output_prim, opt );
diff --git a/src/gallium/auxiliary/draw/draw_vertex.h b/src/gallium/auxiliary/draw/draw_vertex.h
index 6d8bac51384..16c65c43175 100644
--- a/src/gallium/auxiliary/draw/draw_vertex.h
+++ b/src/gallium/auxiliary/draw/draw_vertex.h
@@ -109,4 +109,25 @@ extern void draw_compute_vertex_size(struct vertex_info *vinfo);
 void draw_dump_emitted_vertex(const struct vertex_info *vinfo, 
                               const uint8_t *data);
 
+
+static INLINE unsigned draw_translate_vinfo_format(unsigned format )
+{
+   switch (format) {
+   case EMIT_1F:
+   case EMIT_1F_PSIZE:
+      return PIPE_FORMAT_R32_FLOAT;
+   case EMIT_2F:
+      return PIPE_FORMAT_R32G32_FLOAT;
+   case EMIT_3F:
+      return PIPE_FORMAT_R32G32B32_FLOAT;
+   case EMIT_4F:
+      return PIPE_FORMAT_R32G32B32A32_FLOAT;
+   case EMIT_4UB:
+      return PIPE_FORMAT_R8G8B8A8_UNORM;
+   default:
+      return PIPE_FORMAT_NONE;
+   }
+}
+
+
 #endif /* DRAW_VERTEX_H */
diff --git a/src/gallium/auxiliary/draw/draw_vs.c b/src/gallium/auxiliary/draw/draw_vs.c
index 03fe00a9510..9b899d404e1 100644
--- a/src/gallium/auxiliary/draw/draw_vs.c
+++ b/src/gallium/auxiliary/draw/draw_vs.c
@@ -36,6 +36,8 @@
 #include "draw_private.h"
 #include "draw_context.h"
 #include "draw_vs.h"
+#include "translate/translate.h"
+#include "translate/translate_cache.h"
 
 
 
@@ -66,13 +68,13 @@ draw_bind_vertex_shader(struct draw_context *draw,
    
    if (dvs) 
    {
-      draw->vertex_shader = dvs;
-      draw->num_vs_outputs = dvs->info.num_outputs;
+      draw->vs.vertex_shader = dvs;
+      draw->vs.num_vs_outputs = dvs->info.num_outputs;
       dvs->prepare( dvs, draw );
    }
    else {
-      draw->vertex_shader = NULL;
-      draw->num_vs_outputs = 0;
+      draw->vs.vertex_shader = NULL;
+      draw->vs.num_vs_outputs = 0;
    }
 }
 
@@ -83,3 +85,109 @@ draw_delete_vertex_shader(struct draw_context *draw,
 {
    dvs->delete( dvs );
 }
+
+
+
+boolean 
+draw_vs_init( struct draw_context *draw )
+{
+   tgsi_exec_machine_init(&draw->vs.machine);
+
+   /* FIXME: give this machine thing a proper constructor:
+    */
+   draw->vs.machine.Inputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
+   if (!draw->vs.machine.Inputs)
+      return FALSE;
+
+   draw->vs.machine.Outputs = align_malloc(PIPE_MAX_ATTRIBS * sizeof(struct tgsi_exec_vector), 16);
+   if (!draw->vs.machine.Outputs)
+      return FALSE;
+
+   draw->vs.emit_cache = translate_cache_create();
+   if (!draw->vs.emit_cache) 
+      return FALSE;
+      
+   draw->vs.fetch_cache = translate_cache_create();
+   if (!draw->vs.fetch_cache) 
+      return FALSE;
+      
+   return TRUE;
+}
+
+void
+draw_vs_destroy( struct draw_context *draw )
+{
+   if (draw->vs.machine.Inputs)
+      align_free(draw->vs.machine.Inputs);
+
+   if (draw->vs.machine.Outputs)
+      align_free(draw->vs.machine.Outputs);
+
+   if (draw->vs.fetch_cache)
+      translate_cache_destroy(draw->vs.fetch_cache);
+
+   if (draw->vs.emit_cache)
+      translate_cache_destroy(draw->vs.emit_cache);
+
+   tgsi_exec_machine_free_data(&draw->vs.machine);
+
+}
+
+
+struct draw_vs_varient *
+draw_vs_lookup_varient( struct draw_vertex_shader *vs,
+                        const struct draw_vs_varient_key *key )
+{
+   struct draw_vs_varient *varient;
+   unsigned i;
+
+   /* Lookup existing varient: 
+    */
+   for (i = 0; i < vs->nr_varients; i++)
+      if (draw_vs_varient_key_compare(key, &vs->varient[i]->key) == 0)
+         return vs->varient[i];
+   
+   /* Else have to create a new one: 
+    */
+   varient = vs->create_varient( vs, key );
+   if (varient == NULL)
+      return NULL;
+
+   /* Add it to our list: 
+    */
+   assert(vs->nr_varients < Elements(vs->varient));
+   vs->varient[vs->nr_varients++] = varient;
+
+   /* Done 
+    */
+   return varient;
+}
+
+
+struct translate *
+draw_vs_get_fetch( struct draw_context *draw,
+                   struct translate_key *key )
+{
+   if (!draw->vs.fetch ||
+       translate_key_compare(&draw->vs.fetch->key, key) != 0) 
+   {
+      translate_key_sanitize(key);
+      draw->vs.fetch = translate_cache_find(draw->vs.fetch_cache, key);
+   }
+   
+   return draw->vs.fetch;
+}
+
+struct translate *
+draw_vs_get_emit( struct draw_context *draw,
+                  struct translate_key *key )
+{
+   if (!draw->vs.emit ||
+       translate_key_compare(&draw->vs.emit->key, key) != 0) 
+   {
+      translate_key_sanitize(key);
+      draw->vs.emit = translate_cache_find(draw->vs.emit_cache, key);
+   }
+   
+   return draw->vs.emit;
+}
diff --git a/src/gallium/auxiliary/draw/draw_vs.h b/src/gallium/auxiliary/draw/draw_vs.h
index f9772b83b85..01171bc23d5 100644
--- a/src/gallium/auxiliary/draw/draw_vs.h
+++ b/src/gallium/auxiliary/draw/draw_vs.h
@@ -38,10 +38,84 @@
 struct draw_context;
 struct pipe_shader_state;
 
+struct draw_varient_input 
+{
+   enum pipe_format format;
+   unsigned buffer;
+   unsigned offset; 
+};
+
+struct draw_varient_output
+{
+   enum pipe_format format;     /* output format */
+   unsigned vs_output:8;        /* which vertex shader output is this? */
+   unsigned offset:24;          /* offset into output vertex */
+};
+
+struct draw_varient_element {
+   struct draw_varient_input in;
+   struct draw_varient_output out;
+};
+
+struct draw_vs_varient_key {
+   unsigned output_stride;
+   unsigned nr_elements:8;      /* max2(nr_inputs, nr_outputs) */
+   unsigned nr_inputs:8;
+   unsigned nr_outputs:8;
+   unsigned viewport:1;
+   unsigned clip:1;
+   unsigned pad:5;
+   struct draw_varient_element element[PIPE_MAX_ATTRIBS];
+};
+
+struct draw_vs_varient;
+
+typedef void (PIPE_CDECL *vsv_run_elts_func)( struct draw_vs_varient *,
+                                              const unsigned *elts,
+                                              unsigned count,
+                                              void *output_buffer);
+
+typedef void (PIPE_CDECL *vsv_run_linear_func)( struct draw_vs_varient *,
+                                                unsigned start,
+                                                unsigned count,
+                                                void *output_buffer);
+
+
+struct draw_vs_varient {
+   struct draw_vs_varient_key key;
+
+   struct draw_vertex_shader *vs;
+
+   void (*set_input)( struct draw_vs_varient *,
+                      unsigned i,
+                      const void *ptr,
+                      unsigned stride );
+
+   void (*set_constants)( struct draw_vs_varient *,
+                          const float (*constants)[4] );
+
+   void (*set_viewport)( struct draw_vs_varient *,
+                         const struct pipe_viewport_state * );
+
+   void (PIPE_CDECL *run_linear)( struct draw_vs_varient *shader,
+                                  unsigned start,
+                                  unsigned count,
+                                  void *output_buffer );
+
+   void (PIPE_CDECL *run_elts)( struct draw_vs_varient *shader,
+                                const unsigned *elts,
+                                unsigned count,
+                                void *output_buffer );
+
+   void (*destroy)( struct draw_vs_varient * );
+};
+
+
 /**
  * Private version of the compiled vertex_shader
  */
 struct draw_vertex_shader {
+   struct draw_context *draw;
 
    /* This member will disappear shortly:
     */
@@ -49,6 +123,14 @@ struct draw_vertex_shader {
 
    struct tgsi_shader_info info;
 
+   /* 
+    */
+   struct draw_vs_varient *varient[16];
+   unsigned nr_varients;
+   struct draw_vs_varient *(*create_varient)( struct draw_vertex_shader *shader,
+                                              const struct draw_vs_varient_key *key );
+
+
    void (*prepare)( struct draw_vertex_shader *shader,
 		    struct draw_context *draw );
 
@@ -68,6 +150,15 @@ struct draw_vertex_shader {
 };
 
 
+struct draw_vs_varient *
+draw_vs_lookup_varient( struct draw_vertex_shader *base,
+                        const struct draw_vs_varient_key *key );
+
+
+/********************************************************************************
+ * Internal functions:
+ */
+
 struct draw_vertex_shader *
 draw_create_vs_exec(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
@@ -81,7 +172,52 @@ draw_create_vs_llvm(struct draw_context *draw,
 		    const struct pipe_shader_state *templ);
 
 
+
+struct draw_vs_varient_key;
+struct draw_vertex_shader;
+
+struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
+                                                 const struct draw_vs_varient_key *key );
+
+
+
+/********************************************************************************
+ * Helpers for vs implementations that don't do their own fetch/emit varients.
+ * Means these can be shared between shaders.
+ */
+struct translate;
+struct translate_key;
+
+struct translate *draw_vs_get_fetch( struct draw_context *draw,
+                                     struct translate_key *key );
+
+
+struct translate *draw_vs_get_emit( struct draw_context *draw,
+                                    struct translate_key *key );
+
+struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
+                                                 const struct draw_vs_varient_key *key );
+
+
+
+static INLINE int draw_vs_varient_keysize( const struct draw_vs_varient_key *key )
+{
+   return 2 * sizeof(int) + key->nr_elements * sizeof(struct draw_varient_element);
+}
+
+static INLINE int draw_vs_varient_key_compare( const struct draw_vs_varient_key *a,
+                                         const struct draw_vs_varient_key *b )
+{
+   int keysize = draw_vs_varient_keysize(a);
+   return memcmp(a, b, keysize);
+}
+
+
+
+
+
 #define MAX_TGSI_VERTICES 4
    
 
+
 #endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.c b/src/gallium/auxiliary/draw/draw_vs_aos.c
new file mode 100644
index 00000000000..d3770b2c53b
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.c
@@ -0,0 +1,2266 @@
+/*
+ * Mesa 3-D graphics library
+ * Version:  6.3
+ *
+ * Copyright (C) 1999-2004  Brian Paul   All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
+ * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+/**
+ * Translate tgsi vertex programs to x86/x87/SSE/SSE2 machine code
+ * using the rtasm runtime assembler.  Based on the old
+ * t_vb_arb_program_sse.c
+ */
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "tgsi/util/tgsi_dump.h"
+
+#include "draw_vs.h"
+#include "draw_vs_aos.h"
+
+#include "rtasm/rtasm_x86sse.h"
+
+#ifdef PIPE_ARCH_X86
+#define DISASSEM 0
+
+static const char *files[] =
+{
+   "NULL",
+   "CONST",
+   "IN",
+   "OUT",
+   "TEMP",
+   "SAMP",
+   "ADDR",
+   "IMM",
+   "INTERNAL",
+};
+
+static INLINE boolean eq( struct x86_reg a,
+			    struct x86_reg b )
+{
+   return (a.file == b.file &&
+	   a.idx == b.idx &&
+	   a.mod == b.mod &&
+	   a.disp == b.disp);
+}
+      
+
+static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
+                                  unsigned file,
+				  unsigned idx )
+{
+   struct x86_reg ptr = cp->machine_EDX;
+
+   switch (file) {
+   case TGSI_FILE_INPUT:
+      return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
+
+   case TGSI_FILE_OUTPUT:
+      return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
+
+   case TGSI_FILE_TEMPORARY:
+      return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
+
+   case TGSI_FILE_IMMEDIATE:
+      return x86_make_disp(ptr, Offset(struct aos_machine, immediate[idx]));
+
+   case TGSI_FILE_CONSTANT:       
+      return x86_make_disp(ptr, Offset(struct aos_machine, constant[idx]));
+
+   case AOS_FILE_INTERNAL:
+      return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
+
+   default:
+      ERROR(cp, "unknown reg file");
+      return x86_make_reg(0,0);
+   }
+}
+		
+
+
+#define X87_CW_EXCEPTION_INV_OP       (1<<0)
+#define X87_CW_EXCEPTION_DENORM_OP    (1<<1)
+#define X87_CW_EXCEPTION_ZERO_DIVIDE  (1<<2)
+#define X87_CW_EXCEPTION_OVERFLOW     (1<<3)
+#define X87_CW_EXCEPTION_UNDERFLOW    (1<<4)
+#define X87_CW_EXCEPTION_PRECISION    (1<<5)
+#define X87_CW_PRECISION_SINGLE       (0<<8)
+#define X87_CW_PRECISION_RESERVED     (1<<8)
+#define X87_CW_PRECISION_DOUBLE       (2<<8)
+#define X87_CW_PRECISION_DOUBLE_EXT   (3<<8)
+#define X87_CW_PRECISION_MASK         (3<<8)
+#define X87_CW_ROUND_NEAREST          (0<<10)
+#define X87_CW_ROUND_DOWN             (1<<10)
+#define X87_CW_ROUND_UP               (2<<10)
+#define X87_CW_ROUND_ZERO             (3<<10)
+#define X87_CW_ROUND_MASK             (3<<10)
+#define X87_CW_INFINITY               (1<<12)
+
+static void do_populate_lut( struct shine_tab *tab,
+                             float unclamped_exponent )
+{
+   const float epsilon = 1.0F / 256.0F;    
+   float exponent = CLAMP(unclamped_exponent, -(128.0F - epsilon), (128.0F - epsilon));
+   unsigned i;
+
+   tab->exponent = unclamped_exponent; /* for later comparison */
+   
+   tab->values[0] = 0;
+   if (exponent == 0) {
+      for (i = 1; i < 258; i++) {
+         tab->values[i] = 1.0;
+      }      
+   }
+   else {
+      for (i = 1; i < 258; i++) {
+         tab->values[i] = powf((float)i * epsilon, exponent);
+      }
+   }
+}
+
+static void init_internals( struct aos_machine *machine )
+{
+   unsigned i;
+   float inv = 1.0f/255.0f;
+   float f255 = 255.0f;
+
+   ASSIGN_4V(machine->internal[IMM_SWZ],       1.0f,  -1.0f,  0.0f, 1.0f);
+   *(unsigned *)&machine->internal[IMM_SWZ][3] = 0xffffffff;
+
+   ASSIGN_4V(machine->internal[IMM_ONES],      1.0f,  1.0f,  1.0f,  1.0f);
+   ASSIGN_4V(machine->internal[IMM_NEGS],     -1.0f, -1.0f, -1.0f, -1.0f);
+   ASSIGN_4V(machine->internal[IMM_IDENTITY],  0.0f,  0.0f,  0.0f,  1.0f);
+   ASSIGN_4V(machine->internal[IMM_INV_255],   inv,   inv,   inv,   inv);
+   ASSIGN_4V(machine->internal[IMM_255],       f255,  f255,  f255,  f255);
+   ASSIGN_4V(machine->internal[IMM_RSQ],       -.5f,  1.5f,  0.0f,  0.0f);
+
+
+   machine->fpu_rnd_nearest = (X87_CW_EXCEPTION_INV_OP |
+                               X87_CW_EXCEPTION_DENORM_OP |
+                               X87_CW_EXCEPTION_ZERO_DIVIDE |
+                               X87_CW_EXCEPTION_OVERFLOW |
+                               X87_CW_EXCEPTION_UNDERFLOW |
+                               X87_CW_EXCEPTION_PRECISION |
+                               (1<<6) |
+                               X87_CW_ROUND_NEAREST |
+                               X87_CW_PRECISION_DOUBLE_EXT);
+
+   assert(machine->fpu_rnd_nearest == 0x37f);
+                               
+   machine->fpu_rnd_neg_inf = (X87_CW_EXCEPTION_INV_OP |
+                               X87_CW_EXCEPTION_DENORM_OP |
+                               X87_CW_EXCEPTION_ZERO_DIVIDE |
+                               X87_CW_EXCEPTION_OVERFLOW |
+                               X87_CW_EXCEPTION_UNDERFLOW |
+                               X87_CW_EXCEPTION_PRECISION |
+                               (1<<6) |
+                               X87_CW_ROUND_DOWN |
+                               X87_CW_PRECISION_DOUBLE_EXT);
+
+   for (i = 0; i < MAX_SHINE_TAB; i++)
+      do_populate_lut( &machine->shine_tab[i], 1.0f );
+}
+
+
+static void spill( struct aos_compilation *cp, unsigned idx )
+{
+   if (!cp->xmm[idx].dirty ||
+       (cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
+        cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
+        cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
+      ERROR(cp, "invalid spill");
+      return;
+   }
+   else {
+      struct x86_reg oldval = get_reg_ptr(cp,
+                                          cp->xmm[idx].file,
+                                          cp->xmm[idx].idx);
+     
+      if (0) debug_printf("\nspill %s[%d]", 
+                          files[cp->xmm[idx].file],
+                          cp->xmm[idx].idx);
+ 
+      assert(cp->xmm[idx].dirty);
+      sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
+      cp->xmm[idx].dirty = 0;
+   }
+}
+
+
+static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
+                                        struct x86_reg reg )
+{
+   if (reg.file != file_XMM ||
+       cp->xmm[reg.idx].file != TGSI_FILE_NULL)
+   {
+      struct x86_reg tmp = aos_get_xmm_reg(cp);
+      sse_movaps(cp->func, tmp, reg);
+      reg = tmp;
+   }
+
+   cp->xmm[reg.idx].last_used = cp->insn_counter;
+   return reg;
+}
+
+static struct x86_reg get_xmm( struct aos_compilation *cp,
+                               struct x86_reg reg )
+{
+   if (reg.file != file_XMM) 
+   {
+      struct x86_reg tmp = aos_get_xmm_reg(cp);
+      sse_movaps(cp->func, tmp, reg);
+      reg = tmp;
+   }
+
+   cp->xmm[reg.idx].last_used = cp->insn_counter;
+   return reg;
+}
+
+
+/* Allocate an empty xmm register, either as a temporary or later to
+ * "adopt" as a shader reg.
+ */
+struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
+{
+   unsigned i;
+   unsigned oldest = 0;
+   boolean found = FALSE;
+
+   for (i = 0; i < 8; i++) 
+      if (cp->xmm[i].last_used != cp->insn_counter &&
+          cp->xmm[i].file == TGSI_FILE_NULL) {
+	 oldest = i;
+         found = TRUE;
+      }
+
+   if (!found) {
+      for (i = 0; i < 8; i++) 
+         if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
+            oldest = i;
+   }
+
+   /* Need to write out the old value?
+    */
+   if (cp->xmm[oldest].dirty) 
+      spill(cp, oldest);
+
+   assert(cp->xmm[oldest].last_used != cp->insn_counter);
+
+   cp->xmm[oldest].file = TGSI_FILE_NULL;
+   cp->xmm[oldest].idx = 0;
+   cp->xmm[oldest].dirty = 0;
+   cp->xmm[oldest].last_used = cp->insn_counter;
+   return x86_make_reg(file_XMM, oldest);
+}
+
+void aos_release_xmm_reg( struct aos_compilation *cp,
+                          unsigned idx )
+{
+   cp->xmm[idx].file = TGSI_FILE_NULL;
+   cp->xmm[idx].idx = 0;
+   cp->xmm[idx].dirty = 0;
+   cp->xmm[idx].last_used = 0;
+}
+
+
+
+     
+/* Mark an xmm reg as holding the current copy of a shader reg.
+ */
+void aos_adopt_xmm_reg( struct aos_compilation *cp,
+                        struct x86_reg reg,
+                        unsigned file,
+                        unsigned idx,
+                        unsigned dirty )
+{
+   unsigned i;
+
+   if (reg.file != file_XMM) {
+      assert(0);
+      return;
+   }
+
+
+   /* If any xmm reg thinks it holds this shader reg, break the
+    * illusion.
+    */
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].file == file && 
+          cp->xmm[i].idx == idx) 
+      {
+         /* If an xmm reg is already holding this shader reg, take into account its
+          * dirty flag...
+          */
+         dirty |= cp->xmm[i].dirty;
+         aos_release_xmm_reg(cp, i);
+      }
+   }
+
+   cp->xmm[reg.idx].file = file;
+   cp->xmm[reg.idx].idx = idx;
+   cp->xmm[reg.idx].dirty = dirty;
+   cp->xmm[reg.idx].last_used = cp->insn_counter;
+}
+
+
+/* Return a pointer to the in-memory copy of the reg, making sure it is uptodate.
+ */
+static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp, 
+                                              unsigned file,
+                                              unsigned idx )
+{
+   unsigned i;
+
+   /* Ensure the in-memory copy of this reg is up-to-date
+    */
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].file == file && 
+          cp->xmm[i].idx == idx &&
+          cp->xmm[i].dirty) {
+         spill(cp, i);
+      }
+   }
+
+   return get_reg_ptr( cp, file, idx );
+}
+
+
+/* As above, but return a pointer.  Note - this pointer may alias
+ * those returned by get_arg_ptr().
+ */
+static struct x86_reg get_dst_ptr( struct aos_compilation *cp, 
+                                   const struct tgsi_full_dst_register *dst )
+{
+   unsigned file = dst->DstRegister.File;
+   unsigned idx = dst->DstRegister.Index;
+   unsigned i;
+   
+
+   /* Ensure in-memory copy of this reg is up-to-date and invalidate
+    * any xmm copies.
+    */
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].file == file &&
+          cp->xmm[i].idx == idx)
+      {
+         if (cp->xmm[i].dirty) 
+            spill(cp, i);
+         
+         aos_release_xmm_reg(cp, i);
+      }
+   }
+
+   return get_reg_ptr( cp, file, idx );
+}
+
+
+
+
+
+/* Return an XMM reg if the argument is resident, otherwise return a
+ * base+offset pointer to the saved value.
+ */
+struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
+                                   unsigned file,
+                                   unsigned idx )
+{
+   unsigned i;
+
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].file == file &&
+	  cp->xmm[i].idx  == idx) 
+      {
+	 cp->xmm[i].last_used = cp->insn_counter;
+	 return x86_make_reg(file_XMM, i);
+      }
+   }
+
+   /* If not found in the XMM register file, return an indirect
+    * reference to the in-memory copy:
+    */
+   return get_reg_ptr( cp, file, idx );
+}
+
+
+
+static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp, 
+                                              unsigned file,
+                                              unsigned idx )
+{
+   struct x86_reg reg = get_xmm( cp,
+                                 aos_get_shader_reg( cp, file, idx ) );
+
+   aos_adopt_xmm_reg( cp,
+                      reg,
+                      file,
+                      idx,
+                      FALSE );
+   
+   return reg;
+}
+
+
+
+struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
+                                     unsigned imm )
+{
+   return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
+}
+
+
+struct x86_reg aos_get_internal( struct aos_compilation *cp,
+                                 unsigned imm )
+{
+   return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
+}
+
+
+
+
+
+/* Emulate pshufd insn in regular SSE, if necessary:
+ */
+static void emit_pshufd( struct aos_compilation *cp,
+			 struct x86_reg dst,
+			 struct x86_reg arg0,
+			 ubyte shuf )
+{
+   if (cp->have_sse2) {
+      sse2_pshufd(cp->func, dst, arg0, shuf);
+   }
+   else {
+      if (!eq(dst, arg0)) 
+	 sse_movaps(cp->func, dst, arg0);
+
+      sse_shufps(cp->func, dst, dst, shuf);
+   }
+}
+
+/* load masks (pack into negs??)
+ * pshufd - shuffle according to writemask
+ * and - result, mask
+ * nand - dest, mask
+ * or - dest, result
+ */
+static boolean mask_write( struct aos_compilation *cp,
+                           struct x86_reg dst,
+                           struct x86_reg result,
+                           unsigned mask )
+{
+   struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
+   struct x86_reg tmp = aos_get_xmm_reg(cp);
+   
+   emit_pshufd(cp, tmp, imm_swz, 
+               SHUF((mask & 1) ? 2 : 3,
+                    (mask & 2) ? 2 : 3,
+                    (mask & 4) ? 2 : 3,
+                    (mask & 8) ? 2 : 3));
+
+   sse_andps(cp->func, dst, tmp);
+   sse_andnps(cp->func, tmp, result);
+   sse_orps(cp->func, dst, tmp);
+
+   aos_release_xmm_reg(cp, tmp.idx);
+   return TRUE;
+}
+
+
+
+
+/* Helper for writemask:
+ */
+static boolean emit_shuf_copy2( struct aos_compilation *cp,
+				  struct x86_reg dst,
+				  struct x86_reg arg0,
+				  struct x86_reg arg1,
+				  ubyte shuf )
+{
+   struct x86_reg tmp = aos_get_xmm_reg(cp);
+
+   emit_pshufd(cp, dst, arg1, shuf);
+   emit_pshufd(cp, tmp, arg0, shuf);
+   sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
+   emit_pshufd(cp, dst, dst, shuf);
+
+   aos_release_xmm_reg(cp, tmp.idx);
+   return TRUE;
+}
+
+
+
+#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
+
+
+/* Locate a source register and perform any required (simple) swizzle.  
+ * 
+ * Just fail on complex swizzles at this point.
+ */
+static struct x86_reg fetch_src( struct aos_compilation *cp, 
+                                 const struct tgsi_full_src_register *src ) 
+{
+   struct x86_reg arg0 = aos_get_shader_reg(cp, 
+                                            src->SrcRegister.File, 
+                                            src->SrcRegister.Index);
+   unsigned i;
+   unsigned swz = 0;
+   unsigned negs = 0;
+   unsigned abs = 0;
+
+   for (i = 0; i < 4; i++) {
+      unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, i );
+      unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
+
+      switch (swizzle) {
+      case TGSI_EXTSWIZZLE_ZERO:
+      case TGSI_EXTSWIZZLE_ONE:
+         ERROR(cp, "not supporting full swizzles yet in tgsi_aos_sse2");
+         break;
+
+      default:
+         swz |= (swizzle & 0x3) << (i * 2);
+         break;
+      }
+
+      switch (neg) {
+      case TGSI_UTIL_SIGN_TOGGLE:
+         negs |= (1<<i);
+         break;
+         
+      case TGSI_UTIL_SIGN_KEEP:
+         break;
+
+      case TGSI_UTIL_SIGN_CLEAR:
+         abs |= (1<<i);
+         break;
+
+      default:
+         ERROR(cp, "unsupported sign-mode");
+         break;
+      }
+   }
+
+   if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
+      struct x86_reg dst = aos_get_xmm_reg(cp);
+
+      if (swz != SSE_SWIZZLE_NOOP) {
+         emit_pshufd(cp, dst, arg0, swz);
+         arg0 = dst;
+      }
+
+      if (negs && negs != 0xf) {
+         struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
+         struct x86_reg tmp = aos_get_xmm_reg(cp);
+
+         /* Load 1,-1,0,0
+          * Use neg as arg to pshufd
+          * Multiply
+          */
+         emit_pshufd(cp, tmp, imm_swz, 
+                     SHUF((negs & 1) ? 1 : 0,
+                          (negs & 2) ? 1 : 0,
+                          (negs & 4) ? 1 : 0,
+                          (negs & 8) ? 1 : 0));
+         sse_mulps(cp->func, dst, arg0);
+
+         aos_release_xmm_reg(cp, tmp.idx);
+         arg0 = dst;
+      }
+      else if (negs) {
+         struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
+         sse_mulps(cp->func, dst, imm_negs);
+         arg0 = dst;
+      }
+
+
+      if (abs && abs != 0xf) {
+         ERROR(cp, "unsupported partial abs");
+      }
+      else if (abs) {
+         struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
+         struct x86_reg tmp = aos_get_xmm_reg(cp);
+
+         sse_movaps(cp->func, tmp, arg0);
+         sse_mulps(cp->func, tmp, neg);
+         sse_maxps(cp->func, dst, arg0);
+
+         aos_release_xmm_reg(cp, tmp.idx);
+         arg0 = dst;
+      }
+   }
+      
+   return arg0;
+}
+
+static void x87_fld_src( struct aos_compilation *cp, 
+                         const struct tgsi_full_src_register *src,
+                         unsigned channel ) 
+{
+   struct x86_reg arg0 = aos_get_shader_reg_ptr(cp, 
+                                                src->SrcRegister.File, 
+                                                src->SrcRegister.Index);
+
+   unsigned swizzle = tgsi_util_get_full_src_register_extswizzle( src, channel );
+   unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
+
+   switch (swizzle) {
+   case TGSI_EXTSWIZZLE_ZERO:
+      x87_fldz( cp->func );
+      break;
+
+   case TGSI_EXTSWIZZLE_ONE:
+      x87_fld1( cp->func );
+      break;
+
+   default:
+      x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
+      break;
+   }
+   
+
+   switch (neg) {
+   case TGSI_UTIL_SIGN_TOGGLE:
+      /* Flip the sign:
+       */
+      x87_fchs( cp->func );
+      break;
+         
+   case TGSI_UTIL_SIGN_KEEP:
+      break;
+
+   case TGSI_UTIL_SIGN_CLEAR:
+      x87_fabs( cp->func );
+      break;
+
+   case TGSI_UTIL_SIGN_SET:
+      x87_fabs( cp->func );
+      x87_fchs( cp->func );
+      break;
+
+   default:
+      ERROR(cp, "unsupported sign-mode");
+      break;
+   }
+}
+
+
+
+
+
+
+/* Used to implement write masking.  This and most of the other instructions
+ * here would be easier to implement if there had been a translation
+ * to a 2 argument format (dst/arg0, arg1) at the shader level before
+ * attempting to translate to x86/sse code.
+ */
+static void store_dest( struct aos_compilation *cp, 
+                        const struct tgsi_full_dst_register *reg,
+                        struct x86_reg result )
+{
+   struct x86_reg dst;
+
+   switch (reg->DstRegister.WriteMask) {
+   case 0:
+      return;
+   
+   case TGSI_WRITEMASK_XYZW:
+      aos_adopt_xmm_reg(cp, 
+                        get_xmm_writable(cp, result), 
+                        reg->DstRegister.File,
+                        reg->DstRegister.Index,
+                        TRUE);
+      return;
+   default: 
+      break;
+   }
+
+   dst = aos_get_shader_reg_xmm(cp, 
+                                reg->DstRegister.File,
+                                reg->DstRegister.Index);
+
+   switch (reg->DstRegister.WriteMask) {
+   case TGSI_WRITEMASK_X:
+      sse_movss(cp->func, dst, get_xmm(cp, result));
+      break;
+      
+   case TGSI_WRITEMASK_ZW:
+      sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
+      break;
+
+   case TGSI_WRITEMASK_XY: 
+      result = get_xmm_writable(cp, result);
+      sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
+      dst = result;
+      break;
+
+   case TGSI_WRITEMASK_YZW: 
+      result = get_xmm_writable(cp, result);
+      sse_movss(cp->func, result, dst);
+      dst = result;
+      break;
+
+   default:
+      mask_write(cp, dst, result, reg->DstRegister.WriteMask);
+      break;
+   }
+
+   aos_adopt_xmm_reg(cp, 
+                     dst, 
+                     reg->DstRegister.File,
+                     reg->DstRegister.Index,
+                     TRUE);
+
+}
+
+static void inject_scalar( struct aos_compilation *cp,
+                           struct x86_reg dst,
+                           struct x86_reg result,
+                           unsigned swizzle )
+{
+   sse_shufps(cp->func, dst, dst, swizzle);
+   sse_movss(cp->func, dst, result);
+   sse_shufps(cp->func, dst, dst, swizzle);
+}
+
+
+static void store_scalar_dest( struct aos_compilation *cp, 
+                               const struct tgsi_full_dst_register *reg,
+                               struct x86_reg result )
+{
+   unsigned writemask = reg->DstRegister.WriteMask;
+   struct x86_reg dst;
+
+   if (writemask != TGSI_WRITEMASK_X &&
+       writemask != TGSI_WRITEMASK_Y &&
+       writemask != TGSI_WRITEMASK_Z &&
+       writemask != TGSI_WRITEMASK_W &&
+       writemask != 0) 
+   {
+      result = get_xmm_writable(cp, result); /* already true, right? */
+      sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
+      store_dest(cp, reg, result);
+      return;
+   }
+
+   result = get_xmm(cp, result);
+   dst = aos_get_shader_reg_xmm(cp, 
+                                reg->DstRegister.File,
+                                reg->DstRegister.Index);
+
+
+
+   switch (reg->DstRegister.WriteMask) {
+   case TGSI_WRITEMASK_X:
+      sse_movss(cp->func, dst, result);
+      break;
+
+   case TGSI_WRITEMASK_Y:
+      inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
+      break;
+
+   case TGSI_WRITEMASK_Z:
+      inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
+      break;
+
+   case TGSI_WRITEMASK_W:
+      inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
+      break;
+
+   default:
+      break;
+   }
+
+   aos_adopt_xmm_reg(cp, 
+                     dst, 
+                     reg->DstRegister.File,
+                     reg->DstRegister.Index,
+                     TRUE);
+}
+   
+
+
+static void x87_fst_or_nop( struct x86_function *func,
+                            unsigned writemask,
+                            unsigned channel,
+                            struct x86_reg ptr )
+{
+   assert(ptr.file == file_REG32);
+   if (writemask & (1<<channel)) 
+      x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
+}
+
+static void x87_fstp_or_pop( struct x86_function *func,
+                             unsigned writemask,
+                             unsigned channel,
+                             struct x86_reg ptr )
+{
+   assert(ptr.file == file_REG32);
+   if (writemask & (1<<channel)) 
+      x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
+   else
+      x87_fstp( func, x86_make_reg( file_x87, 0 ));
+}
+
+
+
+/* 
+ */
+static void x87_fstp_dest4( struct aos_compilation *cp,
+                            const struct tgsi_full_dst_register *dst )
+{
+   struct x86_reg ptr = get_dst_ptr(cp, dst); 
+   unsigned writemask = dst->DstRegister.WriteMask;
+
+   x87_fst_or_nop(cp->func, writemask, 0, ptr);
+   x87_fst_or_nop(cp->func, writemask, 1, ptr);
+   x87_fst_or_nop(cp->func, writemask, 2, ptr);
+   x87_fstp_or_pop(cp->func, writemask, 3, ptr);
+}
+
+/* Save current x87 state and put it into single precision mode.
+ */
+static void save_fpu_state( struct aos_compilation *cp )
+{
+   x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX, 
+                                       Offset(struct aos_machine, fpu_restore)));
+}
+
+static void restore_fpu_state( struct aos_compilation *cp )
+{
+   x87_fnclex(cp->func);
+   x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
+                                      Offset(struct aos_machine, fpu_restore)));
+}
+
+static void set_fpu_round_neg_inf( struct aos_compilation *cp )
+{
+   if (cp->fpucntl != FPU_RND_NEG) {
+      cp->fpucntl = FPU_RND_NEG;
+      x87_fnclex(cp->func);
+      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
+                                         Offset(struct aos_machine, fpu_rnd_neg_inf)));
+   }
+}
+
+static void set_fpu_round_nearest( struct aos_compilation *cp )
+{
+   if (cp->fpucntl != FPU_RND_NEAREST) {
+      cp->fpucntl = FPU_RND_NEAREST;
+      x87_fnclex(cp->func);
+      x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX, 
+                                         Offset(struct aos_machine, fpu_rnd_nearest)));
+   }
+}
+
+
+static void x87_emit_ex2( struct aos_compilation *cp )
+{
+   struct x86_reg st0 = x86_make_reg(file_x87, 0);
+   struct x86_reg st1 = x86_make_reg(file_x87, 1);
+   int stack = cp->func->x87_stack;
+
+//   set_fpu_round_neg_inf( cp );
+
+   x87_fld(cp->func, st0);      /* a a */
+   x87_fprndint( cp->func );	/* int(a) a*/
+   x87_fsubr(cp->func, st1, st0);    /* int(a) frc(a) */
+   x87_fxch(cp->func, st1);     /* frc(a) int(a) */
+   x87_f2xm1(cp->func);         /* (2^frc(a))-1 int(a) */
+   x87_fld1(cp->func);          /* 1 (2^frc(a))-1 int(a) */
+   x87_faddp(cp->func, st1);	/* 2^frac(a) int(a)  */
+   x87_fscale(cp->func);	/* (2^frac(a)*2^int(int(a))) int(a) */
+                                /* 2^a int(a) */
+   x87_fstp(cp->func, st1);     /* 2^a */
+
+   assert( stack == cp->func->x87_stack);
+      
+}
+
+static void PIPE_CDECL print_reg( const char *msg,
+                                  const float *reg )
+{
+   debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
+}
+
+static void emit_print( struct aos_compilation *cp,
+                        const char *message, /* must point to a static string! */
+                        unsigned file,
+                        unsigned idx )
+{
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
+   unsigned i;
+
+   /* There shouldn't be anything on the x87 stack.  Can add this
+    * capacity later if need be.
+    */
+   assert(cp->func->x87_stack == 0);
+
+   /* For absolute correctness, need to spill/invalidate all XMM regs
+    * too.  We're obviously not concerned about performance on this
+    * debug path, so here goes:
+    */
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].dirty) 
+         spill(cp, i);
+
+      aos_release_xmm_reg(cp, i);
+   }
+
+   /* Push caller-save (ie scratch) regs.  
+    */
+   x86_cdecl_caller_push_regs( cp->func );
+
+
+   /* Push the arguments:
+    */
+   x86_lea( cp->func, ecx, arg );
+   x86_push( cp->func, ecx );
+   x86_push_imm32( cp->func, (int)message );
+
+   /* Call the helper.  Could call debug_printf directly, but
+    * print_reg is a nice place to put a breakpoint if need be.
+    */
+   x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
+   x86_call( cp->func, ecx );
+   x86_pop( cp->func, ecx );
+   x86_pop( cp->func, ecx );
+
+   /* Pop caller-save regs 
+    */
+   x86_cdecl_caller_pop_regs( cp->func );
+
+   /* Done... 
+    */
+}
+
+/**
+ * The traditional instructions.  All operate on internal registers
+ * and ignore write masks and swizzling issues.
+ */
+
+static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
+   struct x86_reg tmp = aos_get_xmm_reg(cp);
+
+   sse_movaps(cp->func, tmp, arg0);
+   sse_mulps(cp->func, tmp, neg);
+   sse_maxps(cp->func, tmp, arg0);
+   
+   store_dest(cp, &op->FullDstRegisters[0], tmp);
+   return TRUE;
+}
+
+static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_addps(cp->func, dst, arg1);
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
+   x87_fcos(cp->func);
+   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
+   return TRUE;
+}
+
+
+/* The dotproduct instructions don't really do that well in sse:
+ */
+static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg tmp = aos_get_xmm_reg(cp); 
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_mulps(cp->func, dst, arg1);
+   /* Now the hard bit: sum the first 3 values:
+    */ 
+   sse_movhlps(cp->func, tmp, dst);
+   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
+   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
+   sse_addss(cp->func, dst, tmp);
+   
+   aos_release_xmm_reg(cp, tmp.idx);
+   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+
+
+static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg tmp = aos_get_xmm_reg(cp);      
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_mulps(cp->func, dst, arg1);
+   
+   /* Now the hard bit: sum the values:
+    */ 
+   sse_movhlps(cp->func, tmp, dst);
+   sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
+   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
+   sse_addss(cp->func, dst, tmp);
+
+   aos_release_xmm_reg(cp, tmp.idx);
+   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg tmp = aos_get_xmm_reg(cp);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_mulps(cp->func, dst, arg1);
+
+   /* Now the hard bit: sum the values (from DP3):
+    */ 
+   sse_movhlps(cp->func, tmp, dst);
+   sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
+   emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
+   sse_addss(cp->func, dst, tmp);
+   emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
+   sse_addss(cp->func, dst, tmp);
+
+   aos_release_xmm_reg(cp, tmp.idx);
+   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+    struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+    struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+    struct x86_reg dst = aos_get_xmm_reg(cp);
+    struct x86_reg tmp = aos_get_xmm_reg(cp);
+    struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
+
+/*    dst[0] = 1.0     * 1.0F; */
+/*    dst[1] = arg0[1] * arg1[1]; */
+/*    dst[2] = arg0[2] * 1.0; */
+/*    dst[3] = 1.0     * arg1[3]; */
+
+    emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
+    emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
+    sse_mulps(cp->func, dst, tmp);
+
+    aos_release_xmm_reg(cp, tmp.idx);
+    store_dest(cp, &op->FullDstRegisters[0], dst);
+    return TRUE;
+}
+
+static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   x87_fld1(cp->func);		/* 1 */
+   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0 1 */
+   x87_fyl2x(cp->func);	/* log2(a0) */
+   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
+   return TRUE;
+}
+
+
+static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
+   x87_emit_ex2(cp);
+   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
+   return TRUE;
+}
+
+
+static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
+   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
+   int i;
+
+   set_fpu_round_neg_inf( cp );
+
+   /* Load all sources first to avoid aliasing
+    */
+   for (i = 3; i >= 0; i--) {
+      if (writemask & (1<<i)) {
+         x87_fld_src(cp, &op->FullSrcRegisters[0], i);   
+      }
+   }
+
+   for (i = 0; i < 4; i++) {
+      if (writemask & (1<<i)) {
+         x87_fprndint( cp->func );   
+         x87_fstp(cp->func, x86_make_disp(dst, i*4));
+      }
+   }
+
+   return TRUE;
+}
+
+
+static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
+   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
+   int i;
+
+   set_fpu_round_nearest( cp );
+
+   /* Load all sources first to avoid aliasing
+    */
+   for (i = 3; i >= 0; i--) {
+      if (writemask & (1<<i)) {
+         x87_fld_src(cp, &op->FullSrcRegisters[0], i);   
+      }
+   }
+
+   for (i = 0; i < 4; i++) {
+      if (writemask & (1<<i)) {
+         x87_fprndint( cp->func );   
+         x87_fstp(cp->func, x86_make_disp(dst, i*4));
+      }
+   }
+
+   return TRUE;
+}
+
+
+static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
+   struct x86_reg st0 = x86_make_reg(file_x87, 0);
+   struct x86_reg st1 = x86_make_reg(file_x87, 1);
+   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
+   int i;
+
+   set_fpu_round_neg_inf( cp );
+
+   /* suck all the source values onto the stack before writing out any
+    * dst, which may alias...
+    */
+   for (i = 3; i >= 0; i--) {
+      if (writemask & (1<<i)) {
+         x87_fld_src(cp, &op->FullSrcRegisters[0], i);   
+      }
+   }
+
+   for (i = 0; i < 4; i++) {
+      if (writemask & (1<<i)) {
+         x87_fld(cp->func, st0);     /* a a */
+         x87_fprndint( cp->func );   /* flr(a) a */
+         x87_fsubp(cp->func, st1);  /* frc(a) */
+         x87_fstp(cp->func, x86_make_disp(dst, i*4));
+      }
+   }
+
+   return TRUE;
+}
+
+static PIPE_CDECL void do_lit( struct aos_machine *machine,
+                               float *result,
+                               const float *in,
+                               unsigned count )
+{
+   if (in[0] > 0) 
+   {
+      if (in[1] <= 0.0) 
+      {
+         result[0] = 1.0F;
+         result[1] = in[0];
+         result[2] = 1.0;
+         result[3] = 1.0F;
+      }
+      else
+      {
+         const float epsilon = 1.0F / 256.0F;    
+         float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
+         result[0] = 1.0F;
+         result[1] = in[0];
+         result[2] = powf(in[1], exponent);
+         result[3] = 1.0;
+      }
+   }
+   else 
+   {
+      result[0] = 1.0F;
+      result[1] = 0.0;
+      result[2] = 0.0;
+      result[3] = 1.0F;
+   }
+}
+
+
+static PIPE_CDECL void do_lit_lut( struct aos_machine *machine,
+                                   float *result,
+                                   const float *in,
+                                   unsigned count )
+{
+   if (in[0] > 0) 
+   {
+      if (in[1] <= 0.0) 
+      {
+         result[0] = 1.0F;
+         result[1] = in[0];
+         result[2] = 1.0;
+         result[3] = 1.0F;
+         return;
+      }
+      
+      if (machine->lit_info[count].shine_tab->exponent != in[3]) {
+         machine->lit_info[count].func = do_lit;
+         goto no_luck;
+      }
+
+      if (in[1] <= 1.0)
+      {
+         const float *tab = machine->lit_info[count].shine_tab->values;
+         float f = in[1] * 256;
+         int k = (int)f;
+         float frac = f - (float)k;
+         
+         result[0] = 1.0F;
+         result[1] = in[0];
+         result[2] = tab[k] + frac*(tab[k+1]-tab[k]);
+         result[3] = 1.0;
+         return;
+      }
+      
+   no_luck:
+      {
+         const float epsilon = 1.0F / 256.0F;    
+         float exponent = CLAMP(in[3], -(128.0F - epsilon), (128.0F - epsilon));
+         result[0] = 1.0F;
+         result[1] = in[0];
+         result[2] = powf(in[1], exponent);
+         result[3] = 1.0;
+      }
+   }
+   else 
+   {
+      result[0] = 1.0F;
+      result[1] = 0.0;
+      result[2] = 0.0;
+      result[3] = 1.0F;
+   }
+}
+
+
+
+static void PIPE_CDECL populate_lut( struct aos_machine *machine,
+                                     float *result,
+                                     const float *in,
+                                     unsigned count )
+{
+   unsigned i, tab;
+
+   /* Search for an existing table for this value.  Note that without
+    * static analysis we don't really know if in[3] will be constant,
+    * but it usually is...
+    */
+   for (tab = 0; tab < 4; tab++) {
+      if (machine->shine_tab[tab].exponent == in[3]) {
+         goto found;
+      }
+   }
+
+   for (tab = 0, i = 1; i < 4; i++) {
+      if (machine->shine_tab[i].last_used < machine->shine_tab[tab].last_used)
+         tab = i;
+   }
+
+   if (machine->shine_tab[tab].last_used == machine->now) {
+      /* No unused tables (this is not a ffvertex program...).  Just
+       * call pow each time:
+       */
+      machine->lit_info[count].func = do_lit;
+      machine->lit_info[count].func( machine, result, in, count );
+      return;
+   }
+   else {
+      do_populate_lut( &machine->shine_tab[tab], in[3] );
+   }
+
+ found:
+   machine->shine_tab[tab].last_used = machine->now;
+   machine->lit_info[count].shine_tab = &machine->shine_tab[tab];
+   machine->lit_info[count].func = do_lit_lut;
+   machine->lit_info[count].func( machine, result, in, count );
+}
+
+
+
+
+
+static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
+   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
+   unsigned lit_count = cp->lit_count++;
+   struct x86_reg result, arg0;
+   unsigned i;
+
+#if 1
+   /* For absolute correctness, need to spill/invalidate all XMM regs
+    * too.  
+    */
+   for (i = 0; i < 8; i++) {
+      if (cp->xmm[i].dirty) 
+         spill(cp, i);
+      aos_release_xmm_reg(cp, i);
+   }
+#endif
+
+   if (writemask != TGSI_WRITEMASK_XYZW) 
+      result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
+   else 
+      result = get_dst_ptr(cp, &op->FullDstRegisters[0]);    
+
+   
+   arg0 = fetch_src( cp, &op->FullSrcRegisters[0] );
+   if (arg0.file == file_XMM) {
+      struct x86_reg tmp = x86_make_disp(cp->machine_EDX, 
+                                         Offset(struct aos_machine, tmp[1]));
+      sse_movaps( cp->func, tmp, arg0 );
+      arg0 = tmp;
+   }
+                  
+      
+
+   /* Push caller-save (ie scratch) regs.  
+    */
+   x86_cdecl_caller_push_regs( cp->func );
+
+   /* Push the arguments:
+    */
+   x86_push_imm32( cp->func, lit_count );
+
+   x86_lea( cp->func, ecx, arg0 );
+   x86_push( cp->func, ecx );
+
+   x86_lea( cp->func, ecx, result );
+   x86_push( cp->func, ecx );
+
+   x86_push( cp->func, cp->machine_EDX );
+
+   if (lit_count < MAX_LIT_INFO) {
+      x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX, 
+                                             Offset(struct aos_machine, lit_info) + 
+                                             lit_count * sizeof(struct lit_info) + 
+                                             Offset(struct lit_info, func)));
+   }
+   else {
+      x86_mov_reg_imm( cp->func, ecx, (int)do_lit );
+   }
+
+   x86_call( cp->func, ecx );
+            
+   x86_pop( cp->func, ecx );    /* fixme... */
+   x86_pop( cp->func, ecx );
+   x86_pop( cp->func, ecx );
+   x86_pop( cp->func, ecx );
+
+   x86_cdecl_caller_pop_regs( cp->func );
+
+   if (writemask != TGSI_WRITEMASK_XYZW) {
+      store_dest( cp, 
+                  &op->FullDstRegisters[0],
+                  get_xmm_writable( cp, result ) );
+   }
+
+   return TRUE;
+}
+
+   
+static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg dst = get_dst_ptr(cp, &op->FullDstRegisters[0]); 
+   unsigned writemask = op->FullDstRegisters[0].DstRegister.WriteMask;
+
+   if (writemask & TGSI_WRITEMASK_YZ) {
+      struct x86_reg st1 = x86_make_reg(file_x87, 1);
+      struct x86_reg st2 = x86_make_reg(file_x87, 2);
+
+      /* a1' = a1 <= 0 ? 1 : a1;  
+       */
+      x87_fldz(cp->func);                           /* 1 0  */
+#if 1
+      x87_fld1(cp->func);                           /* 1 0  */
+#else
+      /* Correct but slow due to fp exceptions generated in fyl2x - fix me.
+       */
+      x87_fldz(cp->func);                           /* 1 0  */
+#endif
+      x87_fld_src(cp, &op->FullSrcRegisters[0], 1); /* a1 1 0  */
+      x87_fcomi(cp->func, st2);	                    /* a1 1 0  */
+      x87_fcmovb(cp->func, st1);                    /* a1' 1 0  */
+      x87_fstp(cp->func, st1);                      /* a1' 0  */
+      x87_fstp(cp->func, st1);                      /* a1'  */
+
+      x87_fld_src(cp, &op->FullSrcRegisters[0], 3); /* a3 a1'  */
+      x87_fxch(cp->func, st1);                      /* a1' a3  */
+      
+
+      /* Compute pow(a1, a3)
+       */
+      x87_fyl2x(cp->func);	/* a3*log2(a1)      */
+      x87_emit_ex2( cp );       /* 2^(a3*log2(a1))   */
+
+
+      /* a0' = max2(a0, 0):
+       */
+      x87_fldz(cp->func);                           /* 0 r2 */
+      x87_fld_src(cp, &op->FullSrcRegisters[0], 0); /* a0 0 r2 */
+      x87_fcomi(cp->func, st1);	
+      x87_fcmovb(cp->func, st1);                    /* a0' 0 r2 */
+
+      x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
+
+      x87_fcomi(cp->func, st1);  /* a0' 0 r2 */
+      x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
+
+      x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
+      x87_fpop(cp->func);       /* r2 */
+      x87_fpop(cp->func);
+   }
+
+   if (writemask & TGSI_WRITEMASK_XW) {
+      x87_fld1(cp->func);
+      x87_fst_or_nop(cp->func, writemask, 0, dst);
+      x87_fstp_or_pop(cp->func, writemask, 3, dst);
+   }
+
+   return TRUE;
+}
+
+
+
+static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_maxps(cp->func, dst, arg1);
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+
+static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_minps(cp->func, dst, arg1);
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   /* potentially nothing to do */
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_mulps(cp->func, dst, arg1);
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+
+static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg arg2 = fetch_src(cp, &op->FullSrcRegisters[2]);
+
+   /* If we can't clobber old contents of arg0, get a temporary & copy
+    * it there, then clobber it...
+    */
+   arg0 = get_xmm_writable(cp, arg0);
+
+   sse_mulps(cp->func, arg0, arg1);
+   sse_addps(cp->func, arg0, arg2);
+   store_dest(cp, &op->FullDstRegisters[0], arg0);
+   return TRUE;
+}
+
+/* Really not sufficient -- need to check for conditions that could
+ * generate inf/nan values, which will slow things down hugely.
+ */
+static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   x87_fld_src(cp, &op->FullSrcRegisters[1], 0);  /* a1.x */
+   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);	/* a0.x a1.x */
+   x87_fyl2x(cp->func);	                                /* a1*log2(a0) */
+
+   x87_emit_ex2( cp );		/* 2^(a1*log2(a0)) */
+
+   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
+   return TRUE;
+}
+
+
+static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg dst = aos_get_xmm_reg(cp);
+
+   if (cp->have_sse2) {
+      sse2_rcpss(cp->func, dst, arg0);
+      /* extend precision here...
+       */
+   }
+   else {
+      struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
+      sse_movss(cp->func, dst, ones);
+      sse_divss(cp->func, dst, arg0);
+   }
+
+   store_scalar_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+
+/* Although rsqrtps() and rcpps() are low precision on some/all SSE
+ * implementations, it is possible to improve its precision at
+ * fairly low cost, using a newton/raphson step, as below:
+ * 
+ * x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
+ * x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
+ * or:
+ *   x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
+ * 
+ *
+ * See: http://softwarecommunity.intel.com/articles/eng/1818.htm
+ */
+static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+
+   if (0) {
+      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+      struct x86_reg r = aos_get_xmm_reg(cp);
+      sse_rsqrtss(cp->func, r, arg0);
+      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
+      return TRUE;
+   }
+   else {
+      struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+      struct x86_reg r = aos_get_xmm_reg(cp);
+
+      struct x86_reg neg_half       = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
+      struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
+      struct x86_reg src            = get_xmm_writable( cp, arg0 );
+      
+      sse_rsqrtss( cp->func, r, src  );             /* rsqrtss(a) */
+      sse_mulss(   cp->func, src, neg_half  );      /* -.5 * a */
+      sse_mulss(   cp->func, src,  r );             /* -.5 * a * r */
+      sse_mulss(   cp->func, src,  r );             /* -.5 * a * r * r */
+      sse_addss(   cp->func, src, one_point_five ); /* 1.5 - .5 * a * r * r */
+      sse_mulss(   cp->func, r,  src );             /* r * (1.5 - .5 * a * r * r) */
+
+      store_scalar_dest(cp, &op->FullDstRegisters[0], r);
+      return TRUE;
+   }
+}
+
+
+static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
+   sse_andps(cp->func, dst, ones);
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   x87_fld_src(cp, &op->FullSrcRegisters[0], 0);
+   x87_fsin(cp->func);
+   x87_fstp_dest4(cp, &op->FullDstRegisters[0]);
+   return TRUE;
+}
+
+
+
+static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+   
+   sse_cmpps(cp->func, dst, arg1, cc_LessThan);
+   sse_andps(cp->func, dst, ones);
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg dst = get_xmm_writable(cp, arg0);
+
+   sse_subps(cp->func, dst, arg1);
+
+   store_dest(cp, &op->FullDstRegisters[0], dst);
+   return TRUE;
+}
+
+
+static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op ) 
+{
+   struct x86_reg arg0 = fetch_src(cp, &op->FullSrcRegisters[0]);
+   struct x86_reg arg1 = fetch_src(cp, &op->FullSrcRegisters[1]);
+   struct x86_reg tmp0 = aos_get_xmm_reg(cp);
+   struct x86_reg tmp1 = aos_get_xmm_reg(cp);
+
+   emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
+   sse_mulps(cp->func, tmp1, arg0);
+   emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
+   sse_mulps(cp->func, tmp0, arg1);
+   sse_subps(cp->func, tmp1, tmp0);
+   sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
+
+/*    dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
+/*    dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
+/*    dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
+/*    dst[3] is undef */
+
+
+   aos_release_xmm_reg(cp, tmp0.idx);
+   store_dest(cp, &op->FullDstRegisters[0], tmp1);
+   return TRUE;
+}
+
+
+
+static boolean
+emit_instruction( struct aos_compilation *cp,
+                  struct tgsi_full_instruction *inst )
+{
+   x87_assert_stack_empty(cp->func);
+
+   switch( inst->Instruction.Opcode ) {
+   case TGSI_OPCODE_MOV:
+      return emit_MOV( cp, inst );
+
+   case TGSI_OPCODE_LIT:
+      return emit_LIT(cp, inst);
+
+   case TGSI_OPCODE_RCP:
+      return emit_RCP(cp, inst);
+
+   case TGSI_OPCODE_RSQ:
+      return emit_RSQ(cp, inst);
+
+   case TGSI_OPCODE_EXP:
+      /*return emit_EXP(cp, inst);*/
+      return FALSE;
+
+   case TGSI_OPCODE_LOG:
+      /*return emit_LOG(cp, inst);*/
+      return FALSE;
+
+   case TGSI_OPCODE_MUL:
+      return emit_MUL(cp, inst);
+
+   case TGSI_OPCODE_ADD:
+      return emit_ADD(cp, inst);
+
+   case TGSI_OPCODE_DP3:
+      return emit_DP3(cp, inst);
+
+   case TGSI_OPCODE_DP4:
+      return emit_DP4(cp, inst);
+
+   case TGSI_OPCODE_DST:
+      return emit_DST(cp, inst);
+
+   case TGSI_OPCODE_MIN:
+      return emit_MIN(cp, inst);
+
+   case TGSI_OPCODE_MAX:
+      return emit_MAX(cp, inst);
+
+   case TGSI_OPCODE_SLT:
+      return emit_SLT(cp, inst);
+
+   case TGSI_OPCODE_SGE:
+      return emit_SGE(cp, inst);
+
+   case TGSI_OPCODE_MAD:
+      return emit_MAD(cp, inst);
+
+   case TGSI_OPCODE_SUB:
+      return emit_SUB(cp, inst);
+ 
+   case TGSI_OPCODE_LERP:
+//      return emit_LERP(cp, inst);
+      return FALSE;
+
+   case TGSI_OPCODE_FRAC:
+      return emit_FRC(cp, inst);
+
+   case TGSI_OPCODE_CLAMP:
+//      return emit_CLAMP(cp, inst);
+      return FALSE;
+
+   case TGSI_OPCODE_FLOOR:
+      return emit_FLR(cp, inst);
+
+   case TGSI_OPCODE_ROUND:
+      return emit_RND(cp, inst);
+
+   case TGSI_OPCODE_EXPBASE2:
+      return emit_EX2(cp, inst);
+
+   case TGSI_OPCODE_LOGBASE2:
+      return emit_LG2(cp, inst);
+
+   case TGSI_OPCODE_POWER:
+      return emit_POW(cp, inst);
+
+   case TGSI_OPCODE_CROSSPRODUCT:
+      return emit_XPD(cp, inst);
+
+   case TGSI_OPCODE_ABS:
+      return emit_ABS(cp, inst);
+
+   case TGSI_OPCODE_DPH:
+      return emit_DPH(cp, inst);
+
+   case TGSI_OPCODE_COS:
+      return emit_COS(cp, inst);
+
+   case TGSI_OPCODE_SIN:
+      return emit_SIN(cp, inst);
+
+   case TGSI_OPCODE_END:
+      return TRUE;
+
+   default:
+      return FALSE;
+   }
+}
+
+
+static boolean emit_viewport( struct aos_compilation *cp )
+{
+   struct x86_reg pos = aos_get_shader_reg_xmm(cp, 
+                                               TGSI_FILE_OUTPUT, 
+                                               0);
+
+   struct x86_reg scale = x86_make_disp(cp->machine_EDX, 
+                                        Offset(struct aos_machine, scale));
+
+   struct x86_reg translate = x86_make_disp(cp->machine_EDX, 
+                                        Offset(struct aos_machine, translate));
+
+   sse_mulps(cp->func, pos, scale);
+   sse_addps(cp->func, pos, translate);
+
+   aos_adopt_xmm_reg( cp,
+                      pos,
+                      TGSI_FILE_OUTPUT,
+                      0,
+                      TRUE );
+   return TRUE;
+}
+
+
+/* This is useful to be able to see the results on softpipe.  Doesn't
+ * do proper clipping, just assumes the backend can do it during
+ * rasterization -- for debug only...
+ */
+static boolean emit_rhw_viewport( struct aos_compilation *cp )
+{
+   struct x86_reg tmp = aos_get_xmm_reg(cp);
+   struct x86_reg pos = aos_get_shader_reg_xmm(cp, 
+                                               TGSI_FILE_OUTPUT, 
+                                               0);
+
+   struct x86_reg scale = x86_make_disp(cp->machine_EDX, 
+                                        Offset(struct aos_machine, scale));
+
+   struct x86_reg translate = x86_make_disp(cp->machine_EDX, 
+                                        Offset(struct aos_machine, translate));
+
+
+
+   emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
+   sse2_rcpss(cp->func, tmp, tmp);
+   sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
+   
+   sse_mulps(cp->func, pos, scale);
+   sse_mulps(cp->func, pos, tmp);
+   sse_addps(cp->func, pos, translate);
+
+   /* Set pos[3] = w 
+    */
+   mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
+
+   aos_adopt_xmm_reg( cp,
+                      pos,
+                      TGSI_FILE_OUTPUT,
+                      0,
+                      TRUE );
+   return TRUE;
+}
+
+
+static boolean note_immediate( struct aos_compilation *cp,
+                               struct tgsi_full_immediate *imm )
+{
+   unsigned pos = cp->num_immediates++;
+   unsigned j;
+
+   for (j = 0; j < imm->Immediate.Size; j++) {
+      cp->vaos->machine->immediate[pos][j] = imm->u.ImmediateFloat32[j].Float;
+   }
+
+   return TRUE;
+}
+
+
+
+
+static void find_last_write_outputs( struct aos_compilation *cp )
+{
+   struct tgsi_parse_context parse;
+   unsigned this_instruction = 0;
+   unsigned i;
+
+   tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
+
+   while (!tgsi_parse_end_of_tokens( &parse )) {
+      
+      tgsi_parse_token( &parse );
+
+      if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION) 
+         continue;
+
+      for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
+         if (parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.File ==
+             TGSI_FILE_OUTPUT) 
+         {
+            unsigned idx = parse.FullToken.FullInstruction.FullDstRegisters[i].DstRegister.Index;
+            cp->output_last_write[idx] = this_instruction;
+         }
+      }
+
+      this_instruction++;
+   }
+
+   tgsi_parse_free( &parse );
+}
+
+
+#define ARG_VARIENT    1
+#define ARG_START_ELTS 2
+#define ARG_COUNT      3
+#define ARG_OUTBUF     4
+
+
+static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
+                                     boolean linear )
+{ 
+   struct tgsi_parse_context parse;
+   struct aos_compilation cp;
+   unsigned fixup, label;
+
+   tgsi_parse_init( &parse, varient->base.vs->state.tokens );
+
+   memset(&cp, 0, sizeof(cp));
+
+   cp.insn_counter = 1;
+   cp.vaos = varient;
+   cp.have_sse2 = 1;
+   cp.func = &varient->func[ linear ? 0 : 1 ];
+
+   cp.tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
+   cp.idx_EBX      = x86_make_reg(file_REG32, reg_BX);
+   cp.outbuf_ECX    = x86_make_reg(file_REG32, reg_CX);
+   cp.machine_EDX   = x86_make_reg(file_REG32, reg_DX);
+   cp.count_ESI     = x86_make_reg(file_REG32, reg_SI);
+
+   x86_init_func(cp.func);
+
+   find_last_write_outputs(&cp);
+
+   x86_push(cp.func, cp.idx_EBX);
+   x86_push(cp.func, cp.count_ESI);
+
+
+   /* Load arguments into regs:
+    */
+   x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_VARIENT));
+   x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
+   x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
+   x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
+
+
+   /* Compare count to zero and possibly bail.
+    */
+   x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
+   x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
+   fixup = x86_jcc_forward(cp.func, cc_E);
+
+   /* Dig out the machine pointer from inside the varient arg 
+    */
+   x86_mov(cp.func, cp.machine_EDX, 
+           x86_make_disp(cp.machine_EDX,
+                         Offset( struct draw_vs_varient_aos_sse, machine )));
+
+   save_fpu_state( &cp );
+   set_fpu_round_nearest( &cp );
+
+   /* Note address for loop jump 
+    */
+   label = x86_get_label(cp.func);
+   {
+      /* Fetch inputs...  TODO:  fetch lazily...
+       */
+      if (!aos_fetch_inputs( &cp, linear ))
+         goto fail;
+
+      /* Emit the shader:
+       */
+      while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error ) 
+      {
+         tgsi_parse_token( &parse );
+
+         switch (parse.FullToken.Token.Type) {
+         case TGSI_TOKEN_TYPE_IMMEDIATE:
+            if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
+               goto fail;
+            break;
+
+         case TGSI_TOKEN_TYPE_INSTRUCTION:
+            if (DISASSEM)
+               tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
+
+            if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
+               goto fail;
+            break;
+         }
+
+         x87_assert_stack_empty(cp.func);
+         cp.insn_counter++;
+
+         if (DISASSEM)
+            debug_printf("\n");
+      }
+
+   
+      {
+         unsigned i;
+         for (i = 0; i < 8; i++) {
+            if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
+               cp.xmm[i].file = TGSI_FILE_NULL;
+               cp.xmm[i].dirty = 0;
+            }
+         }
+      }
+
+      if (cp.error)
+         goto fail;
+
+      if (cp.vaos->base.key.clip) {
+         /* not really handling clipping, just do the rhw so we can
+          * see the results...
+          */
+         emit_rhw_viewport(&cp); 
+      }
+      else if (cp.vaos->base.key.viewport) {
+         emit_viewport(&cp);
+      }
+
+      /* Emit output...  TODO: do this eagerly after the last write to a
+       * given output.
+       */
+      if (!aos_emit_outputs( &cp ))
+         goto fail;
+
+
+      /* Next vertex:
+       */
+      x86_lea(cp.func, 
+              cp.outbuf_ECX, 
+              x86_make_disp(cp.outbuf_ECX, 
+                            cp.vaos->base.key.output_stride));
+
+      /* Incr index
+       */   
+      if (linear) {
+         x86_inc(cp.func, cp.idx_EBX);
+      } 
+      else {
+         x86_lea(cp.func, cp.idx_EBX, x86_make_disp(cp.idx_EBX, 4));
+      }
+
+   }
+   /* decr count, loop if not zero
+    */
+   x86_dec(cp.func, cp.count_ESI);
+   x86_jcc(cp.func, cc_NZ, label);
+
+   restore_fpu_state(&cp);
+
+   /* Land forward jump here:
+    */
+   x86_fixup_fwd_jump(cp.func, fixup);
+
+   /* Exit mmx state?
+    */
+   if (cp.func->need_emms)
+      mmx_emms(cp.func);
+
+   x86_pop(cp.func, cp.count_ESI);
+   x86_pop(cp.func, cp.idx_EBX);
+
+   x87_assert_stack_empty(cp.func);
+   x86_ret(cp.func);
+
+   tgsi_parse_free( &parse );
+   return !cp.error;
+
+ fail:
+   tgsi_parse_free( &parse );
+   return FALSE;
+}
+
+
+
+static void vaos_set_buffer( struct draw_vs_varient *varient,
+                             unsigned buf,
+                             const void *ptr,
+                             unsigned stride )
+{
+   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+   unsigned i;
+
+   for (i = 0; i < vaos->base.key.nr_inputs; i++) {
+      if (vaos->base.key.element[i].in.buffer == buf) {
+         vaos->machine->attrib[i].input_ptr = ((char *)ptr +
+                                               vaos->base.key.element[i].in.offset);
+         vaos->machine->attrib[i].input_stride = stride;
+      }
+   }
+}
+
+
+static void vaos_destroy( struct draw_vs_varient *varient )
+{
+   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+
+   if (vaos->machine)
+      align_free( vaos->machine );
+
+   x86_release_func( &vaos->func[0] );
+   x86_release_func( &vaos->func[1] );
+
+   FREE(vaos);
+}
+
+static void vaos_run_elts( struct draw_vs_varient *varient,
+                           const unsigned *elts,
+                           unsigned count,
+                           void *output_buffer )
+{
+   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+
+   vaos->machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
+   vaos->gen_run_elts( varient,
+                       elts,
+                       count,
+                       output_buffer );
+}
+
+static void vaos_run_linear( struct draw_vs_varient *varient,
+                             unsigned start,
+                             unsigned count,
+                             void *output_buffer )
+{
+   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+
+   vaos->machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
+   vaos->gen_run_linear( varient,
+                         start,
+                         count,
+                         output_buffer );
+}
+
+
+static void vaos_set_constants( struct draw_vs_varient *varient,
+                                const float (*constants)[4] )
+{
+   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+
+   memcpy(vaos->machine->constant,
+          constants,
+          (vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1) * 4 * sizeof(float));
+
+#if 0
+   unsigned i;
+   for (i =0; i < vaos->base.vs->info.file_max[TGSI_FILE_CONSTANT] + 1; i++)
+      debug_printf("state %d: %f %f %f %f\n",
+                   i, 
+                   constants[i][0],
+                   constants[i][1],
+                   constants[i][2],
+                   constants[i][3]);
+#endif
+
+   {
+      unsigned i;
+      for (i = 0; i < MAX_LIT_INFO; i++) {
+         vaos->machine->lit_info[i].func = populate_lut;
+         vaos->machine->now++;
+      }
+   }
+}
+
+
+static void vaos_set_viewport( struct draw_vs_varient *varient,
+                               const struct pipe_viewport_state *viewport )
+{
+   struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
+
+   memcpy(vaos->machine->scale, viewport->scale, 4 * sizeof(float));
+   memcpy(vaos->machine->translate, viewport->translate, 4 * sizeof(float));
+}
+
+
+
+static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
+                                                 const struct draw_vs_varient_key *key )
+{
+   struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
+
+   if (!vaos)
+      goto fail;
+   
+   vaos->base.key = *key;
+   vaos->base.vs = vs;
+   vaos->base.set_input = vaos_set_buffer;
+   vaos->base.set_constants = vaos_set_constants;
+   vaos->base.set_viewport = vaos_set_viewport;
+   vaos->base.destroy = vaos_destroy;
+   vaos->base.run_linear = vaos_run_linear;
+   vaos->base.run_elts = vaos_run_elts;
+
+   vaos->draw = vs->draw;
+   vaos->machine = align_malloc( sizeof(struct aos_machine), 16 );
+   if (!vaos->machine)
+      goto fail;
+   
+   memset(vaos->machine, 0, sizeof(struct aos_machine));
+   init_internals(vaos->machine);
+
+   tgsi_dump(vs->state.tokens, 0);
+
+   if (!build_vertex_program( vaos, TRUE ))
+      goto fail;
+
+   if (!build_vertex_program( vaos, FALSE ))
+      goto fail;
+
+   vaos->gen_run_linear = (vsv_run_linear_func)x86_get_func(&vaos->func[0]);
+   if (!vaos->gen_run_linear)
+      goto fail;
+
+   vaos->gen_run_elts = (vsv_run_elts_func)x86_get_func(&vaos->func[1]);
+   if (!vaos->gen_run_elts)
+      goto fail;
+
+   return &vaos->base;
+
+ fail:
+   if (vaos->machine)
+      align_free( vaos->machine );
+
+   if (vaos)
+      x86_release_func( &vaos->func[0] );
+
+   if (vaos)
+      x86_release_func( &vaos->func[1] );
+
+   FREE(vaos);
+   
+   return NULL;
+}
+
+
+struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
+                                                 const struct draw_vs_varient_key *key )
+{
+   struct draw_vs_varient *varient = varient_aos_sse( vs, key );
+
+   if (varient == NULL) {
+      assert(0);
+      varient = draw_vs_varient_generic( vs, key );
+   }
+
+   return varient;
+}
+
+
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos.h b/src/gallium/auxiliary/draw/draw_vs_aos.h
new file mode 100644
index 00000000000..b47413ff43c
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_aos.h
@@ -0,0 +1,222 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+/* Authors:  Keith Whitwell <keith@tungstengraphics.com>
+ */
+
+#ifndef DRAW_VS_AOS_H
+#define DRAW_VS_AOS_H
+
+
+struct tgsi_token;
+struct x86_function;
+
+#include "pipe/p_state.h"
+#include "rtasm/rtasm_x86sse.h"
+
+
+
+
+
+#define X    0
+#define Y    1
+#define Z    2
+#define W    3
+
+#define MAX_INPUTS     PIPE_MAX_ATTRIBS
+#define MAX_OUTPUTS    PIPE_MAX_ATTRIBS
+#define MAX_TEMPS      PIPE_MAX_ATTRIBS /* say */
+#define MAX_CONSTANTS  PIPE_MAX_ATTRIBS /* say */
+#define MAX_IMMEDIATES PIPE_MAX_ATTRIBS /* say */
+#define MAX_INTERNALS  8
+
+#define AOS_FILE_INTERNAL TGSI_FILE_COUNT
+
+#define FPU_RND_NEG    1
+#define FPU_RND_NEAREST 2
+
+struct aos_machine;
+typedef void PIPE_CDECL (*lit_func)( struct aos_machine *,
+                                    float *result,
+                                    const float *in,
+                                    unsigned count );
+struct shine_tab {
+   float exponent;
+   float values[258];
+   unsigned last_used;
+};
+
+struct lit_info {
+   lit_func func;
+   struct shine_tab *shine_tab;
+};
+
+#define MAX_SHINE_TAB    4
+#define MAX_LIT_INFO     16
+
+/* This is the temporary storage used by all the aos_sse vs varients.
+ * Create one per context and reuse by passing a pointer in at
+ * vs_varient creation??
+ */
+struct aos_machine {
+   float input    [MAX_INPUTS    ][4];
+   float output   [MAX_OUTPUTS   ][4];
+   float temp     [MAX_TEMPS     ][4];
+   float constant [MAX_CONSTANTS ][4]; /* fixme -- should just be a pointer */
+   float immediate[MAX_IMMEDIATES][4]; /* fixme -- should just be a pointer */
+   float internal [MAX_INTERNALS ][4];
+
+   float scale[4];              /* viewport */
+   float translate[4];          /* viewport */
+
+   float tmp[2][4];             /* scratch space for LIT */
+
+   struct shine_tab shine_tab[MAX_SHINE_TAB];
+   struct lit_info  lit_info[MAX_LIT_INFO];
+   unsigned now;
+   
+
+   ushort fpu_rnd_nearest;
+   ushort fpu_rnd_neg_inf;
+   ushort fpu_restore;
+   ushort fpucntl;              /* one of FPU_* above */
+
+   struct {
+      const void *input_ptr;
+      unsigned input_stride;
+
+      unsigned output_offset;
+   } attrib[PIPE_MAX_ATTRIBS];
+};
+
+
+
+
+struct aos_compilation {
+   struct x86_function *func;
+   struct draw_vs_varient_aos_sse *vaos;
+
+   unsigned insn_counter;
+   unsigned num_immediates;
+   unsigned count;
+   unsigned lit_count;
+
+   struct {
+      unsigned idx:16;
+      unsigned file:8;
+      unsigned dirty:8;
+      unsigned last_used;
+   } xmm[8];
+
+
+   boolean input_fetched[PIPE_MAX_ATTRIBS];
+   unsigned output_last_write[PIPE_MAX_ATTRIBS];
+
+   boolean have_sse2;
+   boolean error;
+   short fpucntl;
+
+   /* these are actually known values, but putting them in a struct
+    * like this is helpful to keep them in sync across the file.
+    */
+   struct x86_reg tmp_EAX;
+   struct x86_reg idx_EBX;     /* either start+i or &elt[i] */
+   struct x86_reg outbuf_ECX;
+   struct x86_reg machine_EDX;
+   struct x86_reg count_ESI;    /* decrements to zero */
+};
+
+struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp );
+void aos_release_xmm_reg( struct aos_compilation *cp, unsigned idx );
+
+void aos_adopt_xmm_reg( struct aos_compilation *cp,
+                        struct x86_reg reg,
+                        unsigned file,
+                        unsigned idx,
+                        unsigned dirty );
+
+struct x86_reg aos_get_shader_reg( struct aos_compilation *cp, 
+                                   unsigned file,
+                                   unsigned idx );
+
+boolean aos_fetch_inputs( struct aos_compilation *cp,
+                          boolean linear );
+
+boolean aos_emit_outputs( struct aos_compilation *cp );
+
+
+#define IMM_ONES     0              /* 1, 1,1,1 */
+#define IMM_SWZ      1              /* 1,-1,0, 0xffffffff */
+#define IMM_IDENTITY 2              /* 0, 0,0,1 */
+#define IMM_INV_255  3              /* 1/255, 1/255, 1/255, 1/255 */
+#define IMM_255      4              /* 255, 255, 255, 255 */
+#define IMM_NEGS     5              /* -1,-1,-1,-1 */
+#define IMM_RSQ      6              /* -.5,1.5,_,_ */
+#define IMM_PSIZE    7              /* not really an immediate - updated each run */
+
+struct x86_reg aos_get_internal( struct aos_compilation *cp,
+                                 unsigned imm );
+struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
+                                     unsigned imm );
+
+
+#define ERROR(cp, msg)                                                  \
+do {                                                                    \
+   debug_printf("%s: x86 translation failed: %s\n", __FUNCTION__, msg); \
+   cp->error = 1;                                                       \
+   assert(0);                                                           \
+} while (0)
+
+
+
+
+
+
+struct draw_vs_varient_aos_sse {
+   struct draw_vs_varient base;
+   struct draw_context *draw;
+
+#if 0
+   struct {
+      const void *ptr;
+      unsigned stride;
+   } attrib[PIPE_MAX_ATTRIBS];
+#endif
+
+   struct aos_machine *machine; /* XXX: temporarily unshared */
+
+   vsv_run_linear_func gen_run_linear;
+   vsv_run_elts_func gen_run_elts;
+
+
+   struct x86_function func[2];
+};
+
+
+
+#endif 
+
diff --git a/src/gallium/auxiliary/draw/draw_vs_aos_io.c b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
new file mode 100644
index 00000000000..836110f382d
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_aos_io.c
@@ -0,0 +1,326 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+
+#include "pipe/p_util.h"
+#include "pipe/p_shader_tokens.h"
+#include "tgsi/util/tgsi_parse.h"
+#include "tgsi/util/tgsi_util.h"
+#include "tgsi/exec/tgsi_exec.h"
+#include "draw_vs.h"
+#include "draw_vs_aos.h"
+#include "draw_vertex.h"
+
+#include "rtasm/rtasm_x86sse.h"
+
+#ifdef PIPE_ARCH_X86
+
+/* Note - don't yet have to worry about interacting with the code in
+ * draw_vs_aos.c as there is no intermingling of generated code...
+ * That may have to change, we'll see.
+ */
+static void emit_load_R32G32B32A32( struct aos_compilation *cp, 			   
+				    struct x86_reg data,
+				    struct x86_reg src_ptr )
+{
+   sse_movups(cp->func, data, src_ptr);
+}
+
+static void emit_load_R32G32B32( struct aos_compilation *cp, 			   
+				 struct x86_reg data,
+				 struct x86_reg src_ptr )
+{
+   sse_movss(cp->func, data, x86_make_disp(src_ptr, 8));
+   sse_shufps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ), SHUF(X,Y,Z,W) );
+   sse_shufps(cp->func, data, data, SHUF(Y,Z,X,W) );
+   sse_movlps(cp->func, data, src_ptr);
+}
+
+static void emit_load_R32G32( struct aos_compilation *cp, 
+			   struct x86_reg data,
+			   struct x86_reg src_ptr )
+{
+   sse_movups(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
+   sse_movlps(cp->func, data, src_ptr);
+}
+
+
+static void emit_load_R32( struct aos_compilation *cp, 
+			   struct x86_reg data,
+			   struct x86_reg src_ptr )
+{
+   sse_movss(cp->func, data, src_ptr);
+   sse_orps(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ) );
+}
+
+
+static void emit_load_R8G8B8A8_UNORM( struct aos_compilation *cp,
+				       struct x86_reg data,
+				       struct x86_reg src_ptr )
+{
+   sse_movss(cp->func, data, src_ptr);
+   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
+   sse2_punpcklbw(cp->func, data, aos_get_internal_xmm( cp, IMM_IDENTITY ));
+   sse2_cvtdq2ps(cp->func, data, data);
+   sse_mulps(cp->func, data, aos_get_internal(cp, IMM_INV_255));
+}
+
+
+
+static void get_src_ptr( struct x86_function *func,
+                         struct x86_reg src,
+                         struct x86_reg machine,
+                         struct x86_reg elt,
+                         unsigned a )
+{
+   struct x86_reg input_ptr = 
+      x86_make_disp(machine, 
+		    Offset(struct aos_machine, attrib[a].input_ptr));
+
+   struct x86_reg input_stride = 
+      x86_make_disp(machine, 
+		    Offset(struct aos_machine, attrib[a].input_stride));
+
+   /* Calculate pointer to current attrib:
+    */
+   x86_mov(func, src, input_stride);
+   x86_imul(func, src, elt);
+   x86_add(func, src, input_ptr);
+}
+
+
+/* Extended swizzles?  Maybe later.
+ */  
+static void emit_swizzle( struct aos_compilation *cp,
+			  struct x86_reg dest,
+			  struct x86_reg src,
+			  unsigned shuffle )
+{
+   sse_shufps(cp->func, dest, src, shuffle);
+}
+
+
+static boolean load_input( struct aos_compilation *cp,
+                           unsigned idx,
+                           boolean linear )
+{
+   unsigned format = cp->vaos->base.key.element[idx].in.format;
+   struct x86_reg src = cp->tmp_EAX;
+   struct x86_reg dataXMM = aos_get_xmm_reg(cp);
+
+   /* Figure out source pointer address:
+    */
+   get_src_ptr(cp->func, 
+               src, 
+               cp->machine_EDX, 
+               linear ? cp->idx_EBX : x86_deref(cp->idx_EBX),
+               idx);
+
+   src = x86_deref(src);
+
+   aos_adopt_xmm_reg( cp,
+                      dataXMM,
+                      TGSI_FILE_INPUT,
+                      idx,
+                      TRUE );
+
+   switch (format) {
+   case PIPE_FORMAT_R32_FLOAT:
+      emit_load_R32(cp, dataXMM, src);
+      break;
+   case PIPE_FORMAT_R32G32_FLOAT:
+      emit_load_R32G32(cp, dataXMM, src);
+      break;
+   case PIPE_FORMAT_R32G32B32_FLOAT:
+      emit_load_R32G32B32(cp, dataXMM, src);
+      break;
+   case PIPE_FORMAT_R32G32B32A32_FLOAT:
+      emit_load_R32G32B32A32(cp, dataXMM, src);
+      break;
+   case PIPE_FORMAT_B8G8R8A8_UNORM:
+      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
+      emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
+      break;
+   case PIPE_FORMAT_R8G8B8A8_UNORM:
+      emit_load_R8G8B8A8_UNORM(cp, dataXMM, src);
+      break;
+   default:
+      ERROR(cp, "unhandled input format");
+      return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+boolean aos_fetch_inputs( struct aos_compilation *cp, boolean linear )
+{
+   unsigned i;
+   
+   for (i = 0; i < cp->vaos->base.key.nr_inputs; i++) {
+      if (!load_input( cp, i, linear ))
+         return FALSE;
+      cp->insn_counter++;
+      debug_printf("\n");
+   }
+
+   return TRUE;
+}
+
+
+
+
+
+
+
+static void emit_store_R32G32B32A32( struct aos_compilation *cp, 			   
+				     struct x86_reg dst_ptr,
+				     struct x86_reg dataXMM )
+{
+   sse_movups(cp->func, dst_ptr, dataXMM);
+}
+
+static void emit_store_R32G32B32( struct aos_compilation *cp, 
+				  struct x86_reg dst_ptr,
+				  struct x86_reg dataXMM )
+{
+   sse_movlps(cp->func, dst_ptr, dataXMM);
+   sse_shufps(cp->func, dataXMM, dataXMM, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
+   sse_movss(cp->func, x86_make_disp(dst_ptr,8), dataXMM);
+}
+
+static void emit_store_R32G32( struct aos_compilation *cp, 
+			       struct x86_reg dst_ptr,
+			       struct x86_reg dataXMM )
+{
+   sse_movlps(cp->func, dst_ptr, dataXMM);
+}
+
+static void emit_store_R32( struct aos_compilation *cp, 
+			    struct x86_reg dst_ptr,
+			    struct x86_reg dataXMM )
+{
+   sse_movss(cp->func, dst_ptr, dataXMM);
+}
+
+
+
+static void emit_store_R8G8B8A8_UNORM( struct aos_compilation *cp,
+				       struct x86_reg dst_ptr,
+				       struct x86_reg dataXMM )
+{
+   sse_mulps(cp->func, dataXMM, aos_get_internal(cp, IMM_255));
+   sse2_cvtps2dq(cp->func, dataXMM, dataXMM);
+   sse2_packssdw(cp->func, dataXMM, dataXMM);
+   sse2_packuswb(cp->func, dataXMM, dataXMM);
+   sse_movss(cp->func, dst_ptr, dataXMM);
+}
+
+
+
+
+
+static boolean emit_output( struct aos_compilation *cp,
+                            struct x86_reg ptr,
+                            struct x86_reg dataXMM, 
+                            unsigned format )
+{
+   switch (format) {
+   case EMIT_1F:
+   case EMIT_1F_PSIZE:
+      emit_store_R32(cp, ptr, dataXMM);
+      break;
+   case EMIT_2F:
+      emit_store_R32G32(cp, ptr, dataXMM);
+      break;
+   case EMIT_3F:
+      emit_store_R32G32B32(cp, ptr, dataXMM);
+      break;
+   case EMIT_4F:
+      emit_store_R32G32B32A32(cp, ptr, dataXMM);
+      break;
+   case EMIT_4UB:
+      if (1) {
+         emit_swizzle(cp, dataXMM, dataXMM, SHUF(Z,Y,X,W));
+         emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
+      }
+      else {
+         emit_store_R8G8B8A8_UNORM(cp, ptr, dataXMM);
+      }
+      break;
+   default:
+      ERROR(cp, "unhandled output format");
+      return FALSE;
+   }
+
+   return TRUE;
+}
+
+
+
+boolean aos_emit_outputs( struct aos_compilation *cp )
+{
+   unsigned i;
+   
+   for (i = 0; i < cp->vaos->base.key.nr_outputs; i++) {
+      unsigned format = cp->vaos->base.key.element[i].out.format;
+      unsigned offset = cp->vaos->base.key.element[i].out.offset;
+      unsigned vs_output = cp->vaos->base.key.element[i].out.vs_output;
+
+      struct x86_reg data;
+
+      if (format == EMIT_1F_PSIZE) {
+         data = aos_get_internal_xmm( cp, IMM_PSIZE );
+      }
+      else {
+         data = aos_get_shader_reg( cp, 
+                                    TGSI_FILE_OUTPUT,
+                                    vs_output );
+      }
+
+      if (data.file != file_XMM) {
+         struct x86_reg tmp = aos_get_xmm_reg( cp );
+         sse_movups(cp->func, tmp, data);
+         data = tmp;
+      }
+      
+      if (!emit_output( cp, 
+                        x86_make_disp( cp->outbuf_ECX, offset ),
+                        data, 
+                        format ))
+         return FALSE;
+
+      aos_release_xmm_reg( cp, data.idx );
+
+      cp->insn_counter++;
+      debug_printf("\n");
+   }
+
+   return TRUE;
+}
+
+#endif
diff --git a/src/gallium/auxiliary/draw/draw_vs_exec.c b/src/gallium/auxiliary/draw/draw_vs_exec.c
index 7a02f6334b1..4501877efcf 100644
--- a/src/gallium/auxiliary/draw/draw_vs_exec.c
+++ b/src/gallium/auxiliary/draw/draw_vs_exec.c
@@ -179,10 +179,12 @@ draw_create_vs_exec(struct draw_context *draw,
 
    tgsi_scan_shader(state->tokens, &vs->base.info);
 
+   vs->base.draw = draw;
    vs->base.prepare = vs_exec_prepare;
    vs->base.run_linear = vs_exec_run_linear;
    vs->base.delete = vs_exec_delete;
-   vs->machine = &draw->machine;
+   vs->base.create_varient = draw_vs_varient_generic;
+   vs->machine = &draw->vs.machine;
 
    return &vs->base;
 }
diff --git a/src/gallium/auxiliary/draw/draw_vs_llvm.c b/src/gallium/auxiliary/draw/draw_vs_llvm.c
index 171da51dd56..621472ec7c8 100644
--- a/src/gallium/auxiliary/draw/draw_vs_llvm.c
+++ b/src/gallium/auxiliary/draw/draw_vs_llvm.c
@@ -114,7 +114,9 @@ draw_create_vs_llvm(struct draw_context *draw,
 
    tgsi_scan_shader(vs->base.state.tokens, &vs->base.info);
 
+   vs->base.draw = draw;
    vs->base.prepare = vs_llvm_prepare;
+   vs->base.create_varient = draw_vs_varient_generic;
    vs->base.run_linear = vs_llvm_run_linear;
    vs->base.delete = vs_llvm_delete;
    vs->machine = &draw->machine;
diff --git a/src/gallium/auxiliary/draw/draw_vs_sse.c b/src/gallium/auxiliary/draw/draw_vs_sse.c
index e3f4e67472a..7781782ae84 100644
--- a/src/gallium/auxiliary/draw/draw_vs_sse.c
+++ b/src/gallium/auxiliary/draw/draw_vs_sse.c
@@ -47,9 +47,7 @@
 #include "tgsi/util/tgsi_parse.h"
 
 #define SSE_MAX_VERTICES 4
-#define SSE_SWIZZLES 1
 
-#if SSE_SWIZZLES
 typedef void (XSTDCALL *codegen_function) (
    const struct tgsi_exec_vector *input, /* 1 */
    struct tgsi_exec_vector *output, /* 2 */
@@ -62,14 +60,6 @@ typedef void (XSTDCALL *codegen_function) (
    float (*aos_output)[4],      /* 9 */
    uint num_outputs,            /* 10 */
    uint output_stride );        /* 11 */
-#else
-typedef void (XSTDCALL *codegen_function) (
-   const struct tgsi_exec_vector *input,
-   struct tgsi_exec_vector *output,
-   float (*constant)[4],
-   struct tgsi_exec_vector *temporary,
-   float (*immediates)[4] );
-#endif
 
 struct draw_sse_vertex_shader {
    struct draw_vertex_shader base;
@@ -111,7 +101,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
    for (i = 0; i < count; i += MAX_TGSI_VERTICES) {
       unsigned int max_vertices = MIN2(MAX_TGSI_VERTICES, count - i);
 
-#if SSE_SWIZZLES
       /* run compiled shader
        */
       shader->func(machine->Inputs,
@@ -128,43 +117,6 @@ vs_sse_run_linear( struct draw_vertex_shader *base,
 
       input = (const float (*)[4])((const char *)input + input_stride * max_vertices);
       output = (float (*)[4])((char *)output + output_stride * max_vertices);
-#else
-      unsigned int j, slot;
-
-      /* Swizzle inputs.  
-       */
-      for (j = 0; j < max_vertices; j++) {
-         for (slot = 0; slot < base->info.num_inputs; slot++) {
-            machine->Inputs[slot].xyzw[0].f[j] = input[slot][0];
-            machine->Inputs[slot].xyzw[1].f[j] = input[slot][1];
-            machine->Inputs[slot].xyzw[2].f[j] = input[slot][2];
-            machine->Inputs[slot].xyzw[3].f[j] = input[slot][3];
-         } 
-
-	 input = (const float (*)[4])((const char *)input + input_stride);
-      }
-
-      /* run compiled shader
-       */
-      shader->func(machine->Inputs,
-		   machine->Outputs,
-		   (float (*)[4])constants,
-		   machine->Temps,
-		   shader->immediates);
-
-      /* Unswizzle all output results.  
-       */
-      for (j = 0; j < max_vertices; j++) {
-         for (slot = 0; slot < base->info.num_outputs; slot++) {
-            output[slot][0] = machine->Outputs[slot].xyzw[0].f[j];
-            output[slot][1] = machine->Outputs[slot].xyzw[1].f[j];
-            output[slot][2] = machine->Outputs[slot].xyzw[2].f[j];
-            output[slot][3] = machine->Outputs[slot].xyzw[3].f[j];
-         } 
-
-	 output = (float (*)[4])((char *)output + output_stride);
-      }
-#endif
    }
 }
 
@@ -203,15 +155,18 @@ draw_create_vs_sse(struct draw_context *draw,
 
    tgsi_scan_shader(templ->tokens, &vs->base.info);
 
+   vs->base.draw = draw;
+   vs->base.create_varient = draw_vs_varient_aos_sse;
+//   vs->base.create_varient = draw_vs_varient_generic;
    vs->base.prepare = vs_sse_prepare;
    vs->base.run_linear = vs_sse_run_linear;
    vs->base.delete = vs_sse_delete;
-   vs->machine = &draw->machine;
+   vs->machine = &draw->vs.machine;
    
    x86_init_func( &vs->sse2_program );
 
    if (!tgsi_emit_sse2( (struct tgsi_token *) vs->base.state.tokens,
-			&vs->sse2_program, vs->immediates, SSE_SWIZZLES )) 
+			&vs->sse2_program, vs->immediates, TRUE )) 
       goto fail;
       
    vs->func = (codegen_function) x86_get_func( &vs->sse2_program );
diff --git a/src/gallium/auxiliary/draw/draw_vs_varient.c b/src/gallium/auxiliary/draw/draw_vs_varient.c
new file mode 100644
index 00000000000..119a3a04b56
--- /dev/null
+++ b/src/gallium/auxiliary/draw/draw_vs_varient.c
@@ -0,0 +1,326 @@
+/**************************************************************************
+ * 
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ * 
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ * 
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ * 
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ * 
+ **************************************************************************/
+
+ /*
+  * Authors:
+  *   Keith Whitwell <keith@tungstengraphics.com>
+  */
+
+#include "pipe/p_util.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vbuf.h"
+#include "draw/draw_vertex.h"
+#include "draw/draw_vs.h"
+#include "translate/translate.h"
+#include "translate/translate_cache.h"
+
+/* A first pass at incorporating vertex fetch/emit functionality into 
+ */
+struct draw_vs_varient_generic {
+   struct draw_vs_varient base;
+
+   struct pipe_viewport_state viewport;
+
+   struct draw_vertex_shader *shader;
+   struct draw_context *draw;
+   
+   /* Basic plan is to run these two translate functions before/after
+    * the vertex shader's existing run_linear() routine to simulate
+    * the inclusion of this functionality into the shader...  
+    * 
+    * Next will look at actually including it.
+    */
+   struct translate *fetch;
+   struct translate *emit;
+
+   const float (*constants)[4];
+};
+
+
+
+
+static void vsvg_set_constants( struct draw_vs_varient *varient,
+                                const float (*constants)[4] )
+{
+   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
+
+   vsvg->constants = constants;
+}
+
+
+static void vsvg_set_input( struct draw_vs_varient *varient,
+                            unsigned buffer,
+                            const void *ptr,
+                            unsigned stride )
+{
+   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
+
+   vsvg->fetch->set_buffer(vsvg->fetch, 
+                           buffer, 
+                           ptr, 
+                           stride);
+}
+
+
+/* Mainly for debug at this stage:
+ */
+static void do_rhw_viewport( struct draw_vs_varient_generic *vsvg,
+                             unsigned count,
+                             void *output_buffer )
+{
+   char *ptr = (char *)output_buffer;
+   const float *scale = vsvg->viewport.scale;
+   const float *trans = vsvg->viewport.translate;
+   unsigned stride = vsvg->base.key.output_stride;
+   unsigned j;
+
+   for (j = 0; j < count; j++, ptr += stride) {
+      float *data = (float *)ptr;
+      float w = 1.0f / data[3];
+
+      data[0] = data[0] * w * scale[0] + trans[0];
+      data[1] = data[1] * w * scale[1] + trans[1];
+      data[2] = data[2] * w * scale[2] + trans[2];
+      data[3] = w;
+   }
+}
+
+static void do_viewport( struct draw_vs_varient_generic *vsvg,
+                             unsigned count,
+                             void *output_buffer )
+{
+   char *ptr = (char *)output_buffer;
+   const float *scale = vsvg->viewport.scale;
+   const float *trans = vsvg->viewport.translate;
+   unsigned stride = vsvg->base.key.output_stride;
+   unsigned j;
+
+   for (j = 0; j < count; j++, ptr += stride) {
+      float *data = (float *)ptr;
+
+      data[0] = data[0] * scale[0] + trans[0];
+      data[1] = data[1] * scale[1] + trans[1];
+      data[2] = data[2] * scale[2] + trans[2];
+   }
+}
+                         
+
+static void vsvg_run_elts( struct draw_vs_varient *varient,
+                           const unsigned *elts,
+                           unsigned count,
+                           void *output_buffer)
+{
+   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
+			
+   /* Want to do this in small batches for cache locality?
+    */
+   
+   vsvg->fetch->run_elts( vsvg->fetch, 
+                          elts,
+                          count,
+                          output_buffer );
+
+   //if (!vsvg->base.vs->is_passthrough) 
+   {
+      vsvg->base.vs->run_linear( vsvg->base.vs, 
+                                 output_buffer,
+                                 output_buffer,
+                                 vsvg->constants,
+                                 count,
+                                 vsvg->base.key.output_stride, 
+                                 vsvg->base.key.output_stride);
+
+
+      if (vsvg->base.key.clip) {
+         /* not really handling clipping, just do the rhw so we can
+          * see the results...
+          */
+         do_rhw_viewport( vsvg,
+                          count,
+                          output_buffer );
+      }
+      else if (vsvg->base.key.viewport) {
+         do_viewport( vsvg,
+                      count,
+                      output_buffer );
+      }
+
+
+      //if (!vsvg->already_in_emit_format)
+
+      vsvg->emit->set_buffer( vsvg->emit,
+                              0, 
+                              output_buffer,
+                              vsvg->base.key.output_stride );
+
+
+      vsvg->emit->run( vsvg->emit,
+                       0, count,
+                       output_buffer );
+   }
+}
+
+
+static void vsvg_run_linear( struct draw_vs_varient *varient,
+                                   unsigned start,
+                                   unsigned count,
+                                   void *output_buffer )
+{
+   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
+	
+   //debug_printf("%s %d %d\n", __FUNCTION__, start, count);
+   
+				
+   vsvg->fetch->run( vsvg->fetch, 
+                     start,
+                     count,
+                     output_buffer );
+
+   //if (!vsvg->base.vs->is_passthrough) 
+   {
+      vsvg->base.vs->run_linear( vsvg->base.vs, 
+                                 output_buffer,
+                                 output_buffer,
+                                 vsvg->constants,
+                                 count,
+                                 vsvg->base.key.output_stride, 
+                                 vsvg->base.key.output_stride);
+
+      if (vsvg->base.key.clip) {
+         /* not really handling clipping, just do the rhw so we can
+          * see the results...
+          */
+         do_rhw_viewport( vsvg,
+                          count,
+                          output_buffer );
+      }
+      else if (vsvg->base.key.viewport) {
+         do_viewport( vsvg,
+                      count,
+                      output_buffer );
+      }
+
+      //if (!vsvg->already_in_emit_format)
+      vsvg->emit->set_buffer( vsvg->emit,
+                              0, 
+                              output_buffer,
+                              vsvg->base.key.output_stride );
+
+      vsvg->emit->set_buffer( vsvg->emit, 
+                              1,
+                              &vsvg->draw->rasterizer->point_size,
+                              0);
+
+      vsvg->emit->run( vsvg->emit,
+                       0, count,
+                       output_buffer );
+   }
+}
+
+
+
+
+static void vsvg_set_viewport( struct draw_vs_varient *varient,
+                               const struct pipe_viewport_state *viewport )
+{
+   struct draw_vs_varient_generic *vsvg = (struct draw_vs_varient_generic *)varient;
+
+   vsvg->viewport = *viewport;
+}
+
+static void vsvg_destroy( struct draw_vs_varient *varient )
+{
+   FREE(varient);
+}
+
+
+struct draw_vs_varient *draw_vs_varient_generic( struct draw_vertex_shader *vs,
+                                                 const struct draw_vs_varient_key *key )
+{
+   unsigned i;
+   struct translate_key fetch, emit;
+
+   struct draw_vs_varient_generic *vsvg = CALLOC_STRUCT( draw_vs_varient_generic );
+   if (vsvg == NULL)
+      return NULL;
+
+   vsvg->base.key = *key;
+   vsvg->base.vs = vs;
+   vsvg->base.set_input     = vsvg_set_input;
+   vsvg->base.set_constants = vsvg_set_constants;
+   vsvg->base.set_viewport  = vsvg_set_viewport;
+   vsvg->base.run_elts      = vsvg_run_elts;
+   vsvg->base.run_linear    = vsvg_run_linear;
+   vsvg->base.destroy       = vsvg_destroy;
+
+
+
+   /* Build free-standing fetch and emit functions:
+    */
+   fetch.nr_elements = key->nr_inputs;
+   fetch.output_stride = 0;
+   for (i = 0; i < key->nr_inputs; i++) {
+      fetch.element[i].input_format = key->element[i].in.format;
+      fetch.element[i].input_buffer = key->element[i].in.buffer;
+      fetch.element[i].input_offset = key->element[i].in.offset;
+      fetch.element[i].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+      fetch.element[i].output_offset = fetch.output_stride;
+      fetch.output_stride += 4 * sizeof(float);
+   }
+
+
+   emit.nr_elements = key->nr_outputs;
+   emit.output_stride = key->output_stride;
+   for (i = 0; i < key->nr_outputs; i++) {
+      if (key->element[i].out.format != EMIT_1F_PSIZE)
+      {      
+         emit.element[i].input_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+         emit.element[i].input_buffer = 0;
+         emit.element[i].input_offset = key->element[i].out.vs_output * 4 * sizeof(float);
+         emit.element[i].output_format = draw_translate_vinfo_format(key->element[i].out.format);
+         emit.element[i].output_offset = key->element[i].out.offset;
+      }
+      else {
+         emit.element[i].input_format = PIPE_FORMAT_R32_FLOAT;
+         emit.element[i].input_buffer = 1;
+         emit.element[i].input_offset = 0;
+         emit.element[i].output_format = PIPE_FORMAT_R32_FLOAT;
+         emit.element[i].output_offset = key->element[i].out.offset;
+      }
+   }
+
+   vsvg->fetch = draw_vs_get_fetch( vs->draw, &fetch );
+   vsvg->emit = draw_vs_get_emit( vs->draw, &emit );
+
+   return &vsvg->base;
+}
+
+
+
+
+
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
index 4e036d9032c..2415b0156ba 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.c
@@ -36,11 +36,8 @@
 
 #define DUMP_SSE  0
 
-#if DUMP_SSE
 
-static void
-_print_reg(
-   struct x86_reg reg )
+void x86_print_reg( struct x86_reg reg )
 {
    if (reg.mod != mod_REG) 
       debug_printf( "[" );
@@ -77,6 +74,7 @@ _print_reg(
       debug_printf( "]" );
 }
 
+#if DUMP_SSE
 
 #define DUMP_START() debug_printf( "\n" )
 #define DUMP_END() debug_printf( "\n" )
@@ -87,7 +85,7 @@ _print_reg(
       foo++;                                    \
    if  (*foo)                                   \
       foo++;                                    \
-   debug_printf( "\n% 15s ", foo );             \
+   debug_printf( "\n% 4x% 15s ", p->csr - p->store, foo );             \
 } while (0)
 
 #define DUMP_I( I ) do {                        \
@@ -97,27 +95,27 @@ _print_reg(
 
 #define DUMP_R( R0 ) do {                       \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
 } while( 0 )
 
 #define DUMP_RR( R0, R1 ) do {                  \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
    debug_printf( ", " );                        \
-   _print_reg( R1 );                            \
+   x86_print_reg( R1 );                            \
 } while( 0 )
 
 #define DUMP_RI( R0, I ) do {                   \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
    debug_printf( ", %u", I );                   \
 } while( 0 )
 
 #define DUMP_RRI( R0, R1, I ) do {              \
    DUMP();                                      \
-   _print_reg( R0 );                            \
+   x86_print_reg( R0 );                            \
    debug_printf( ", " );                        \
-   _print_reg( R1 );                            \
+   x86_print_reg( R1 );                            \
    debug_printf( ", %u", I );                   \
 } while( 0 )
 
@@ -220,6 +218,8 @@ static void emit_3ub( struct x86_function *p, unsigned char b0, unsigned char b1
 
 /* Build a modRM byte + possible displacement.  No treatment of SIB
  * indexing.  BZZT - no way to encode an absolute address.
+ *
+ * This is the "/r" field in the x86 manuals...
  */
 static void emit_modrm( struct x86_function *p, 
 			struct x86_reg reg, 
@@ -258,7 +258,8 @@ static void emit_modrm( struct x86_function *p,
    }
 }
 
-
+/* Emits the "/0".."/7" specialized versions of the modrm ("/r") bytes.
+ */
 static void emit_modrm_noreg( struct x86_function *p,
 			      unsigned op,
 			      struct x86_reg regmem )
@@ -367,8 +368,7 @@ void x86_jcc( struct x86_function *p,
    DUMP_I(cc);
    
    if (offset < 0) {
-      int amt = p->csr - p->store;
-      assert(amt > -offset);
+      assert(p->csr - p->store > -offset);
    }
 
    if (offset <= 127 && offset >= -128) {
@@ -445,6 +445,16 @@ void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm )
    emit_1i(p, imm);
 }
 
+void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm )
+{
+   DUMP_RI( dst, imm );
+   assert(dst.mod == mod_REG);
+   emit_1ub(p, 0x80);
+   emit_modrm_noreg(p, 0, dst);
+   emit_1ub(p, imm);
+}
+
+
 void x86_push( struct x86_function *p,
 	       struct x86_reg reg )
 {
@@ -461,6 +471,17 @@ void x86_push( struct x86_function *p,
    p->stack_offset += 4;
 }
 
+void x86_push_imm32( struct x86_function *p,
+                     int imm32 )
+{
+   DUMP_I( imm32 );
+   emit_1ub(p, 0x68);
+   emit_1i(p,  imm32);
+
+   p->stack_offset += 4;
+}
+
+
 void x86_pop( struct x86_function *p,
 	      struct x86_reg reg )
 {
@@ -988,6 +1009,24 @@ void sse2_movd( struct x86_function *p,
 /***********************************************************************
  * x87 instructions
  */
+static void note_x87_pop( struct x86_function *p )
+{
+   p->x87_stack--;
+   assert(p->x87_stack >= 0);
+}
+
+static void note_x87_push( struct x86_function *p )
+{
+   p->x87_stack++;
+   assert(p->x87_stack <= 7);
+}
+
+void x87_assert_stack_empty( struct x86_function *p )
+{
+   assert (p->x87_stack == 0);
+}
+
+
 void x87_fist( struct x86_function *p, struct x86_reg dst )
 {
    DUMP_R( dst );
@@ -1000,6 +1039,7 @@ void x87_fistp( struct x86_function *p, struct x86_reg dst )
    DUMP_R( dst );
    emit_1ub(p, 0xdb);
    emit_modrm_noreg(p, 3, dst);
+   note_x87_pop(p);
 }
 
 void x87_fild( struct x86_function *p, struct x86_reg arg )
@@ -1007,12 +1047,14 @@ void x87_fild( struct x86_function *p, struct x86_reg arg )
    DUMP_R( arg );
    emit_1ub(p, 0xdf);
    emit_modrm_noreg(p, 0, arg);
+   note_x87_push(p);
 }
 
 void x87_fldz( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xee);
+   note_x87_push(p);
 }
 
 
@@ -1029,18 +1071,21 @@ void x87_fld1( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xe8);
+   note_x87_push(p);
 }
 
 void x87_fldl2e( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xea);
+   note_x87_push(p);
 }
 
 void x87_fldln2( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xed);
+   note_x87_push(p);
 }
 
 void x87_fwait( struct x86_function *p )
@@ -1061,6 +1106,49 @@ void x87_fclex( struct x86_function *p )
    x87_fnclex(p);
 }
 
+void x87_fcmovb( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xda, 0xc0+arg.idx);
+}
+
+void x87_fcmove( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xda, 0xc8+arg.idx);
+}
+
+void x87_fcmovbe( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xda, 0xd0+arg.idx);
+}
+
+void x87_fcmovnb( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdb, 0xc0+arg.idx);
+}
+
+void x87_fcmovne( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdb, 0xc8+arg.idx);
+}
+
+void x87_fcmovnbe( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   assert(arg.file == file_x87);
+   emit_2ub(p, 0xdb, 0xd0+arg.idx);
+}
+
+
 
 static void x87_arith_op( struct x86_function *p, struct x86_reg dst, struct x86_reg arg,
 			  unsigned char dst0ub0,
@@ -1148,6 +1236,7 @@ void x87_fmulp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc8+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fsubp( struct x86_function *p, struct x86_reg dst )
@@ -1156,6 +1245,7 @@ void x87_fsubp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe8+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
@@ -1164,6 +1254,7 @@ void x87_fsubrp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xe0+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_faddp( struct x86_function *p, struct x86_reg dst )
@@ -1172,6 +1263,7 @@ void x87_faddp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xc0+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fdivp( struct x86_function *p, struct x86_reg dst )
@@ -1180,6 +1272,7 @@ void x87_fdivp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf8+dst.idx);
+   note_x87_pop(p);
 }
 
 void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
@@ -1188,6 +1281,13 @@ void x87_fdivrp( struct x86_function *p, struct x86_reg dst )
    assert(dst.file == file_x87);
    assert(dst.idx >= 1);
    emit_2ub(p, 0xde, 0xf0+dst.idx);
+   note_x87_pop(p);
+}
+
+void x87_ftst( struct x86_function *p )
+{
+   DUMP();
+   emit_2ub(p, 0xd9, 0xe4);
 }
 
 void x87_fucom( struct x86_function *p, struct x86_reg arg )
@@ -1202,12 +1302,15 @@ void x87_fucomp( struct x86_function *p, struct x86_reg arg )
    DUMP_R( arg );
    assert(arg.file == file_x87);
    emit_2ub(p, 0xdd, 0xe8+arg.idx);
+   note_x87_pop(p);
 }
 
 void x87_fucompp( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xda, 0xe9);
+   note_x87_pop(p);             /* pop twice */
+   note_x87_pop(p);             /* pop twice */
 }
 
 void x87_fxch( struct x86_function *p, struct x86_reg arg )
@@ -1289,6 +1392,7 @@ void x87_fyl2x( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xf1);
+   note_x87_pop(p);
 }
 
 /* st1 = st1 * log2(st0 + 1.0);
@@ -1300,6 +1404,7 @@ void x87_fyl2xp1( struct x86_function *p )
 {
    DUMP();
    emit_2ub(p, 0xd9, 0xf9);
+   note_x87_pop(p);
 }
 
 
@@ -1312,6 +1417,7 @@ void x87_fld( struct x86_function *p, struct x86_reg arg )
       emit_1ub(p, 0xd9);
       emit_modrm_noreg(p, 0, arg);
    }
+   note_x87_push(p);
 }
 
 void x87_fst( struct x86_function *p, struct x86_reg dst )
@@ -1334,8 +1440,15 @@ void x87_fstp( struct x86_function *p, struct x86_reg dst )
       emit_1ub(p, 0xd9);
       emit_modrm_noreg(p, 3, dst);
    }
+   note_x87_pop(p);
 }
 
+void x87_fpop( struct x86_function *p )
+{
+   x87_fstp( p, x86_make_reg( file_x87, 0 ));
+}
+
+
 void x87_fcom( struct x86_function *p, struct x86_reg dst )
 {
    DUMP_R( dst );
@@ -1347,6 +1460,7 @@ void x87_fcom( struct x86_function *p, struct x86_reg dst )
    }
 }
 
+
 void x87_fcomp( struct x86_function *p, struct x86_reg dst )
 {
    DUMP_R( dst );
@@ -1356,6 +1470,20 @@ void x87_fcomp( struct x86_function *p, struct x86_reg dst )
       emit_1ub(p, 0xd8);
       emit_modrm_noreg(p, 3, dst);
    }
+   note_x87_pop(p);
+}
+
+void x87_fcomi( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   emit_2ub(p, 0xdb, 0xf0+arg.idx);
+}
+
+void x87_fcomip( struct x86_function *p, struct x86_reg arg )
+{
+   DUMP_R( arg );
+   emit_2ub(p, 0xdb, 0xf0+arg.idx);
+   note_x87_pop(p);
 }
 
 
@@ -1374,6 +1502,17 @@ void x87_fnstsw( struct x86_function *p, struct x86_reg dst )
 }
 
 
+void x87_fnstcw( struct x86_function *p, struct x86_reg dst )
+{
+   DUMP_R( dst );
+   assert(dst.file == file_REG32);
+
+   emit_1ub(p, 0x9b);           /* WAIT -- needed? */
+   emit_1ub(p, 0xd9);
+   emit_modrm_noreg(p, 7, dst);
+}
+
+
 
 
 /***********************************************************************
@@ -1442,6 +1581,21 @@ void mmx_movq( struct x86_function *p,
  */
 
 
+void x86_cdecl_caller_push_regs( struct x86_function *p )
+{
+   x86_push(p, x86_make_reg(file_REG32, reg_AX));
+   x86_push(p, x86_make_reg(file_REG32, reg_CX));
+   x86_push(p, x86_make_reg(file_REG32, reg_DX));
+}
+
+void x86_cdecl_caller_pop_regs( struct x86_function *p )
+{
+   x86_pop(p, x86_make_reg(file_REG32, reg_DX));
+   x86_pop(p, x86_make_reg(file_REG32, reg_CX));
+   x86_pop(p, x86_make_reg(file_REG32, reg_AX));
+}
+
+
 /* Retreive a reference to one of the function arguments, taking into
  * account any push/pop activity:
  */
diff --git a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
index eacaeeaf6fe..63e812fac92 100644
--- a/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
+++ b/src/gallium/auxiliary/rtasm/rtasm_x86sse.h
@@ -41,10 +41,12 @@ struct x86_function {
    unsigned size;
    unsigned char *store;
    unsigned char *csr;
-   unsigned stack_offset;
-   int need_emms;
+
+   unsigned stack_offset:16;
+   unsigned need_emms:8;
+   int x87_stack:8;
+
    unsigned char error_overflow[4];
-   const char *fn;
 };
 
 enum x86_reg_file {
@@ -107,6 +109,9 @@ void x86_init_func_size( struct x86_function *p, unsigned code_size );
 void x86_release_func( struct x86_function *p );
 void (*x86_get_func( struct x86_function *p ))( void );
 
+/* Debugging:
+ */
+void x86_print_reg( struct x86_reg reg );
 
 
 /* Create and manipulate registers and regmem values:
@@ -150,6 +155,7 @@ void x86_call( struct x86_function *p, struct x86_reg reg);
  * I load the immediate into general purpose register and use it.
  */
 void x86_mov_reg_imm( struct x86_function *p, struct x86_reg dst, int imm );
+void x86_add_reg_imm8( struct x86_function *p, struct x86_reg dst, ubyte imm );
 
 
 /* Macro for sse_shufps() and sse2_pshufd():
@@ -220,6 +226,7 @@ void x86_imul( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_or( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_pop( struct x86_function *p, struct x86_reg reg );
 void x86_push( struct x86_function *p, struct x86_reg reg );
+void x86_push_imm32( struct x86_function *p, int imm );
 void x86_ret( struct x86_function *p );
 void x86_retw( struct x86_function *p, unsigned short imm );
 void x86_sub( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
@@ -227,13 +234,27 @@ void x86_test( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_xor( struct x86_function *p, struct x86_reg dst, struct x86_reg src );
 void x86_sahf( struct x86_function *p );
 
+
+void x86_cdecl_caller_push_regs( struct x86_function *p );
+void x86_cdecl_caller_pop_regs( struct x86_function *p );
+
+void x87_assert_stack_empty( struct x86_function *p );
+
 void x87_f2xm1( struct x86_function *p );
 void x87_fabs( struct x86_function *p );
 void x87_fadd( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_faddp( struct x86_function *p, struct x86_reg dst );
 void x87_fchs( struct x86_function *p );
 void x87_fclex( struct x86_function *p );
+void x87_fcmovb( struct x86_function *p, struct x86_reg src );
+void x87_fcmovbe( struct x86_function *p, struct x86_reg src );
+void x87_fcmove( struct x86_function *p, struct x86_reg src );
+void x87_fcmovnb( struct x86_function *p, struct x86_reg src );
+void x87_fcmovnbe( struct x86_function *p, struct x86_reg src );
+void x87_fcmovne( struct x86_function *p, struct x86_reg src );
 void x87_fcom( struct x86_function *p, struct x86_reg dst );
+void x87_fcomi( struct x86_function *p, struct x86_reg dst );
+void x87_fcomip( struct x86_function *p, struct x86_reg dst );
 void x87_fcomp( struct x86_function *p, struct x86_reg dst );
 void x87_fcos( struct x86_function *p );
 void x87_fdiv( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
@@ -253,6 +274,7 @@ void x87_fmul( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_fmulp( struct x86_function *p, struct x86_reg dst );
 void x87_fnclex( struct x86_function *p );
 void x87_fprndint( struct x86_function *p );
+void x87_fpop( struct x86_function *p );
 void x87_fscale( struct x86_function *p );
 void x87_fsin( struct x86_function *p );
 void x87_fsincos( struct x86_function *p );
@@ -263,11 +285,13 @@ void x87_fsub( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_fsubp( struct x86_function *p, struct x86_reg dst );
 void x87_fsubr( struct x86_function *p, struct x86_reg dst, struct x86_reg arg );
 void x87_fsubrp( struct x86_function *p, struct x86_reg dst );
+void x87_ftst( struct x86_function *p );
 void x87_fxch( struct x86_function *p, struct x86_reg dst );
 void x87_fxtract( struct x86_function *p );
 void x87_fyl2x( struct x86_function *p );
 void x87_fyl2xp1( struct x86_function *p );
 void x87_fwait( struct x86_function *p );
+void x87_fnstcw( struct x86_function *p, struct x86_reg dst );
 void x87_fnstsw( struct x86_function *p, struct x86_reg dst );
 void x87_fucompp( struct x86_function *p );
 void x87_fucomp( struct x86_function *p, struct x86_reg arg );
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
index 4c65ffd7807..648afa2a51b 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.c
@@ -539,9 +539,9 @@ static const char *TGSI_MODULATES[] =
    "MODULATE_EIGHTH"
 };
 
-static void
-dump_declaration_short(
-   struct tgsi_full_declaration  *decl )
+void
+tgsi_dump_declaration(
+   const struct tgsi_full_declaration  *decl )
 {
    TXT( "\nDCL " );
    ENM( decl->Declaration.File, TGSI_FILES_SHORT );
@@ -672,9 +672,9 @@ dump_declaration_verbose(
    }
 }
 
-static void
-dump_immediate_short(
-   struct tgsi_full_immediate *imm )
+void
+tgsi_dump_immediate(
+   const struct tgsi_full_immediate *imm )
 {
    unsigned i;
 
@@ -727,9 +727,9 @@ dump_immediate_verbose(
    }
 }
 
-static void
-dump_instruction_short(
-   struct tgsi_full_instruction  *inst,
+void
+tgsi_dump_instruction(
+   const struct tgsi_full_instruction  *inst,
    unsigned                      instno )
 {
    unsigned i;
@@ -1281,17 +1281,17 @@ tgsi_dump(
 
       switch( parse.FullToken.Token.Type ) {
       case TGSI_TOKEN_TYPE_DECLARATION:
-         dump_declaration_short(
+         tgsi_dump_declaration(
             &parse.FullToken.FullDeclaration );
          break;
 
       case TGSI_TOKEN_TYPE_IMMEDIATE:
-         dump_immediate_short(
+         tgsi_dump_immediate(
             &parse.FullToken.FullImmediate );
          break;
 
       case TGSI_TOKEN_TYPE_INSTRUCTION:
-         dump_instruction_short(
+         tgsi_dump_instruction(
             &parse.FullToken.FullInstruction,
             instno );
          instno++;
diff --git a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
index beb0155d56c..ca83bdef206 100644
--- a/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
+++ b/src/gallium/auxiliary/tgsi/util/tgsi_dump.h
@@ -14,6 +14,24 @@ tgsi_dump(
    const struct tgsi_token *tokens,
    unsigned                flags );
 
+struct tgsi_full_immediate;
+struct tgsi_full_instruction;
+struct tgsi_full_declaration;
+
+void
+tgsi_dump_immediate(
+   const struct tgsi_full_immediate *imm );
+
+void
+tgsi_dump_instruction(
+   const struct tgsi_full_instruction  *inst,
+   unsigned                      instno );
+
+void
+tgsi_dump_declaration(
+   const struct tgsi_full_declaration  *decl );
+
+
 #if defined __cplusplus
 }
 #endif
diff --git a/src/gallium/auxiliary/translate/translate.h b/src/gallium/auxiliary/translate/translate.h
index b8210af50cf..c3b754a902b 100644
--- a/src/gallium/auxiliary/translate/translate.h
+++ b/src/gallium/auxiliary/translate/translate.h
@@ -71,15 +71,15 @@ struct translate {
 		       const void *ptr,
 		       unsigned stride );
 
-   void (*run_elts)( struct translate *,
-		     const unsigned *elts,
-		     unsigned count,
-		     void *output_buffer);
-
-   void (*run)( struct translate *,
-		unsigned start,
-		unsigned count,
-		void *output_buffer);
+   void (PIPE_CDECL *run_elts)( struct translate *,
+                                const unsigned *elts,
+                                unsigned count,
+                                void *output_buffer);
+
+   void (PIPE_CDECL *run)( struct translate *,
+                           unsigned start,
+                           unsigned count,
+                           void *output_buffer);
 };
 
 
diff --git a/src/gallium/auxiliary/translate/translate_generic.c b/src/gallium/auxiliary/translate/translate_generic.c
index 402780ee539..a25d94f2ca4 100644
--- a/src/gallium/auxiliary/translate/translate_generic.c
+++ b/src/gallium/auxiliary/translate/translate_generic.c
@@ -541,10 +541,10 @@ static emit_func get_emit_func( enum pipe_format format )
 /**
  * Fetch vertex attributes for 'count' vertices.
  */
-static void generic_run_elts( struct translate *translate,
-			      const unsigned *elts,
-			      unsigned count,
-			      void *output_buffer )
+static void PIPE_CDECL generic_run_elts( struct translate *translate,
+                                         const unsigned *elts,
+                                         unsigned count,
+                                         void *output_buffer )
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
@@ -580,10 +580,10 @@ static void generic_run_elts( struct translate *translate,
 
 
 
-static void generic_run( struct translate *translate,
-			 unsigned start,
-			 unsigned count,
-			 void *output_buffer )
+static void PIPE_CDECL generic_run( struct translate *translate,
+                                    unsigned start,
+                                    unsigned count,
+                                    void *output_buffer )
 {
    struct translate_generic *tg = translate_generic(translate);
    char *vert = output_buffer;
diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
index a54ac5a82ff..2fc8b9d3d03 100644
--- a/src/gallium/auxiliary/translate/translate_sse.c
+++ b/src/gallium/auxiliary/translate/translate_sse.c
@@ -45,22 +45,16 @@
 #define W    3
 
 
-#ifdef WIN32
-#define RTASM __cdecl
-#else
-#define RTASM
-#endif
-
-typedef void (RTASM *run_func)( struct translate *translate,
-                                unsigned start,
-                                unsigned count,
-                                void *output_buffer );
-
-typedef void (RTASM *run_elts_func)( struct translate *translate,
-                                     const unsigned *elts,
+typedef void (PIPE_CDECL *run_func)( struct translate *translate,
+                                     unsigned start,
                                      unsigned count,
                                      void *output_buffer );
 
+typedef void (PIPE_CDECL *run_elts_func)( struct translate *translate,
+                                          const unsigned *elts,
+                                          unsigned count,
+                                          void *output_buffer );
+
 
 
 struct translate_sse {
@@ -472,13 +466,7 @@ static boolean build_vertex_emit( struct translate_sse *p,
    x86_lea(p->func, vertexECX, x86_make_disp(vertexECX, p->translate.key.output_stride));
 
    /* Incr index
-    */   /* Emit code for each of the attributes.  Currently routes
-    * everything through SSE registers, even when it might be more
-    * efficient to stick with regular old x86.  No optimization or
-    * other tricks - enough new ground to cover here just getting
-    * things working.
-    */
-
+    */ 
    if (linear) {
       x86_inc(p->func, idxEBX);
    }