From 89498d01531cd515c769e570bf799c39fbafc8fb Mon Sep 17 00:00:00 2001
From: Keith Whitwell <keithw@vmware.com>
Date: Wed, 7 Oct 2009 22:36:43 +0100
Subject: llvmpipe: import experimental softpipe rasterizer code, wip binning
 code

WIP, doesn't build or run.

Rasterizer code is based on Nick Capens' devmaster posts and the
Larrabee articles, but currently matches neither the performance
nor the correctness of either...
---
 src/gallium/drivers/llvmpipe/Makefile             |    2 -
 src/gallium/drivers/llvmpipe/SConscript           |    2 -
 src/gallium/drivers/llvmpipe/lp_context.c         |   26 +-
 src/gallium/drivers/llvmpipe/lp_context.h         |    5 +-
 src/gallium/drivers/llvmpipe/lp_prim_setup.c      |  190 ---
 src/gallium/drivers/llvmpipe/lp_prim_setup.h      |   85 --
 src/gallium/drivers/llvmpipe/lp_prim_vbuf.c       |  105 +-
 src/gallium/drivers/llvmpipe/lp_prim_vbuf.h       |    4 +-
 src/gallium/drivers/llvmpipe/lp_rasterizer.c      |  157 +++
 src/gallium/drivers/llvmpipe/lp_rasterizer.h      |  112 ++
 src/gallium/drivers/llvmpipe/lp_setup.c           | 1432 +--------------------
 src/gallium/drivers/llvmpipe/lp_setup.h           |   17 +-
 src/gallium/drivers/llvmpipe/lp_setup_context.h   |  140 ++
 src/gallium/drivers/llvmpipe/lp_setup_rasterize.c |    7 +
 src/gallium/drivers/llvmpipe/lp_setup_tri.c       |  755 +++++++++++
 src/gallium/drivers/llvmpipe/lp_state_derived.c   |   25 +-
 src/gallium/drivers/llvmpipe/lp_tile_cache.c      |  353 -----
 src/gallium/drivers/llvmpipe/lp_tile_cache.h      |   71 -
 18 files changed, 1276 insertions(+), 2212 deletions(-)
 delete mode 100644 src/gallium/drivers/llvmpipe/lp_prim_setup.c
 delete mode 100644 src/gallium/drivers/llvmpipe/lp_prim_setup.h
 create mode 100644 src/gallium/drivers/llvmpipe/lp_rasterizer.c
 create mode 100644 src/gallium/drivers/llvmpipe/lp_rasterizer.h
 create mode 100644 src/gallium/drivers/llvmpipe/lp_setup_context.h
 create mode 100644 src/gallium/drivers/llvmpipe/lp_setup_rasterize.c
 create mode 100644 src/gallium/drivers/llvmpipe/lp_setup_tri.c
 delete mode 100644 src/gallium/drivers/llvmpipe/lp_tile_cache.c
 delete mode 100644 src/gallium/drivers/llvmpipe/lp_tile_cache.h

(limited to 'src')

diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 21aff1967a1..8f05e5a6fd1 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -32,7 +32,6 @@ C_SOURCES = \
 	lp_draw_arrays.c \
 	lp_flush.c \
 	lp_jit.c \
-	lp_prim_setup.c \
 	lp_prim_vbuf.c \
 	lp_setup.c \
 	lp_query.c \
@@ -51,7 +50,6 @@ C_SOURCES = \
 	lp_tex_sample_c.c \
 	lp_tex_sample_llvm.c \
 	lp_texture.c \
-	lp_tile_cache.c \
 	lp_tile_soa.c
 
 include ../../Makefile.template
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 13cd465838a..344b2463377 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -45,7 +45,6 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_draw_arrays.c',
 		'lp_flush.c',
 		'lp_jit.c',
-		'lp_prim_setup.c',
 		'lp_prim_vbuf.c',
 		'lp_setup.c',
 		'lp_query.c',
@@ -64,7 +63,6 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_tex_sample_c.c',
 		'lp_tex_sample_llvm.c',
 		'lp_texture.c',
-		'lp_tile_cache.c',
 		'lp_tile_soa.c',
 	])
 
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 202cb8ef439..57e71f3e986 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -31,13 +31,13 @@
  */
 
 #include "draw/draw_context.h"
+#include "draw/draw_vbuf.h"
 #include "pipe/p_defines.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "lp_clear.h"
 #include "lp_context.h"
 #include "lp_flush.h"
-#include "lp_prim_setup.h"
 #include "lp_prim_vbuf.h"
 #include "lp_state.h"
 #include "lp_surface.h"
@@ -264,21 +264,21 @@ llvmpipe_create( struct pipe_screen *screen )
                          (struct tgsi_sampler **)
                             llvmpipe->tgsi.vert_samplers_list);
 
-   llvmpipe->setup = lp_draw_render_stage(llvmpipe);
-   if (!llvmpipe->setup)
-      goto fail;
-
    if (debug_get_bool_option( "LP_NO_RAST", FALSE ))
       llvmpipe->no_rast = TRUE;
 
-   if (debug_get_bool_option( "LP_NO_VBUF", FALSE )) {
-      /* Deprecated path -- vbuf is the intended interface to the draw module:
-       */
-      draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->setup);
-   }
-   else {
-      lp_init_vbuf(llvmpipe);
-   }
+   llvmpipe->vbuf_backend = lp_create_vbuf_backend(llvmpipe);
+   if (!llvmpipe->vbuf_backend)
+      goto fail;
+
+   llvmpipe->vbuf = draw_vbuf_stage(llvmpipe->draw, llvmpipe->vbuf_backend);
+   if (!llvmpipe->vbuf)
+      goto fail;
+
+   draw_set_rasterize_stage(llvmpipe->draw, llvmpipe->vbuf);
+   draw_set_render(llvmpipe->draw, llvmpipe->vbuf_backend);
+
+
 
    /* plug in AA line/point stages */
    draw_install_aaline_stage(llvmpipe->draw, &llvmpipe->pipe);
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 8d5a0d4f1fc..0b77ae58d50 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -124,9 +124,10 @@ struct llvmpipe_context {
 
    /** The primitive drawing context */
    struct draw_context *draw;
-   struct draw_stage *setup;
+
+   /** Draw module backend */
+   struct vbuf_render *vbuf_backend;
    struct draw_stage *vbuf;
-   struct llvmpipe_vbuf_render *vbuf_render;
 
    boolean dirty_render_cache;
    
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.c b/src/gallium/drivers/llvmpipe/lp_prim_setup.c
deleted file mode 100644
index b14f8fb99d9..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_prim_setup.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * \brief A draw stage that drives our triangle setup routines from
- * within the draw pipeline.  One of two ways to drive setup, the
- * other being in lp_prim_vbuf.c.
- *
- * \author  Keith Whitwell <keith@tungstengraphics.com>
- * \author  Brian Paul
- */
-
-
-#include "lp_context.h"
-#include "lp_setup.h"
-#include "lp_state.h"
-#include "lp_prim_setup.h"
-#include "draw/draw_pipe.h"
-#include "draw/draw_vertex.h"
-#include "util/u_memory.h"
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_stage {
-   struct draw_stage stage; /**< This must be first (base class) */
-
-   struct setup_context *setup;
-};
-
-
-
-/**
- * Basically a cast wrapper.
- */
-static INLINE struct setup_stage *setup_stage( struct draw_stage *stage )
-{
-   return (struct setup_stage *)stage;
-}
-
-
-typedef const float (*cptrf4)[4];
-
-static void
-do_tri(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-   
-   llvmpipe_setup_tri( setup->setup,
-              (cptrf4)prim->v[0]->data,
-              (cptrf4)prim->v[1]->data,
-              (cptrf4)prim->v[2]->data );
-}
-
-static void
-do_line(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   llvmpipe_setup_line( setup->setup,
-               (cptrf4)prim->v[0]->data,
-               (cptrf4)prim->v[1]->data );
-}
-
-static void
-do_point(struct draw_stage *stage, struct prim_header *prim)
-{
-   struct setup_stage *setup = setup_stage( stage );
-
-   llvmpipe_setup_point( setup->setup,
-                (cptrf4)prim->v[0]->data );
-}
-
-
-
-
-static void setup_begin( struct draw_stage *stage )
-{
-   struct setup_stage *setup = setup_stage(stage);
-
-   llvmpipe_setup_prepare( setup->setup );
-
-   stage->point = do_point;
-   stage->line = do_line;
-   stage->tri = do_tri;
-}
-
-
-static void setup_first_point( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->point( stage, header );
-}
-
-static void setup_first_line( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->line( stage, header );
-}
-
-
-static void setup_first_tri( struct draw_stage *stage,
-			       struct prim_header *header )
-{
-   setup_begin(stage);
-   stage->tri( stage, header );
-}
-
-
-
-static void setup_flush( struct draw_stage *stage,
-			 unsigned flags )
-{
-   stage->point = setup_first_point;
-   stage->line = setup_first_line;
-   stage->tri = setup_first_tri;
-}
-
-
-static void reset_stipple_counter( struct draw_stage *stage )
-{
-}
-
-
-static void render_destroy( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   llvmpipe_setup_destroy_context(ssetup->setup);
-   FREE( stage );
-}
-
-
-/**
- * Create a new primitive setup/render stage.
- */
-struct draw_stage *lp_draw_render_stage( struct llvmpipe_context *llvmpipe )
-{
-   struct setup_stage *sstage = CALLOC_STRUCT(setup_stage);
-
-   sstage->setup = llvmpipe_setup_create_context(llvmpipe);
-   sstage->stage.draw = llvmpipe->draw;
-   sstage->stage.point = setup_first_point;
-   sstage->stage.line = setup_first_line;
-   sstage->stage.tri = setup_first_tri;
-   sstage->stage.flush = setup_flush;
-   sstage->stage.reset_stipple_counter = reset_stipple_counter;
-   sstage->stage.destroy = render_destroy;
-
-   return (struct draw_stage *)sstage;
-}
-
-struct setup_context *
-lp_draw_setup_context( struct draw_stage *stage )
-{
-   struct setup_stage *ssetup = setup_stage(stage);
-   return ssetup->setup;
-}
-
-void
-lp_draw_flush( struct draw_stage *stage )
-{
-   stage->flush( stage, 0 );
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_setup.h b/src/gallium/drivers/llvmpipe/lp_prim_setup.h
deleted file mode 100644
index da6cae63751..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_prim_setup.h
+++ /dev/null
@@ -1,85 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-
-#ifndef LP_PRIM_SETUP_H
-#define LP_PRIM_SETUP_H
-
-
-/**
- * vbuf is a special stage to gather the stream of triangles, lines, points
- * together and reconstruct vertex buffers for hardware upload.
- *
- * First attempt, work in progress.
- * 
- * TODO:
- *    - separate out vertex buffer building and primitive emit, ie >1 draw per vb.
- *    - tell vbuf stage how to build hw vertices directly
- *    - pass vbuf stage a buffer pointer for direct emit to agp/vram.
- *
- *
- *
- * Vertices are just an array of floats, with all the attributes
- * packed.  We currently assume a layout like:
- *
- * attr[0][0..3] - window position
- * attr[1..n][0..3] - remaining attributes.
- *
- * Attributes are assumed to be 4 floats wide but are packed so that
- * all the enabled attributes run contiguously.
- */
-
-
-struct draw_stage;
-struct llvmpipe_context;
-
-
-typedef void (*vbuf_draw_func)( struct pipe_context *pipe,
-                                unsigned prim,
-                                const ushort *elements,
-                                unsigned nr_elements,
-                                const void *vertex_buffer,
-                                unsigned nr_vertices );
-
-
-extern struct draw_stage *
-lp_draw_render_stage( struct llvmpipe_context *llvmpipe );
-
-extern struct setup_context *
-lp_draw_setup_context( struct draw_stage * );
-
-extern void
-lp_draw_flush( struct draw_stage * );
-
-
-extern struct draw_stage *
-lp_draw_vbuf_stage( struct draw_context *draw_context,
-                    struct pipe_context *pipe,
-                    vbuf_draw_func draw );
-
-
-#endif /* LP_PRIM_SETUP_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
index c394dcb61d0..e244ac9087c 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.c
@@ -39,7 +39,6 @@
 #include "lp_context.h"
 #include "lp_state.h"
 #include "lp_prim_vbuf.h"
-#include "lp_prim_setup.h"
 #include "lp_setup.h"
 #include "draw/draw_context.h"
 #include "draw/draw_vbuf.h"
@@ -59,6 +58,8 @@ struct llvmpipe_vbuf_render
 {
    struct vbuf_render base;
    struct llvmpipe_context *llvmpipe;
+   struct setup_context *setup;
+
    uint prim;
    uint vertex_size;
    uint nr_vertices;
@@ -75,6 +76,11 @@ llvmpipe_vbuf_render(struct vbuf_render *vbr)
 }
 
 
+
+
+
+
+
 static const struct vertex_info *
 lp_vbuf_get_vertex_info(struct vbuf_render *vbr)
 {
@@ -105,36 +111,6 @@ lp_vbuf_allocate_vertices(struct vbuf_render *vbr,
 static void
 lp_vbuf_release_vertices(struct vbuf_render *vbr)
 {
-#if 0
-   {
-      struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-      const struct vertex_info *info = 
-         llvmpipe_get_vbuf_vertex_info(cvbr->llvmpipe);
-      const float *vtx = (const float *) cvbr->vertex_buffer;
-      uint i, j;
-      debug_printf("%s (vtx_size = %u,  vtx_used = %u)\n",
-             __FUNCTION__, cvbr->vertex_size, cvbr->nr_vertices);
-      for (i = 0; i < cvbr->nr_vertices; i++) {
-         for (j = 0; j < info->num_attribs; j++) {
-            uint k;
-            switch (info->attrib[j].emit) {
-            case EMIT_4F:  k = 4;   break;
-            case EMIT_3F:  k = 3;   break;
-            case EMIT_2F:  k = 2;   break;
-            case EMIT_1F:  k = 1;   break;
-            default: assert(0);
-            }
-            debug_printf("Vert %u attr %u: ", i, j);
-            while (k-- > 0) {
-               debug_printf("%g ", vtx[0]);
-               vtx++;
-            }
-            debug_printf("\n");
-         }
-      }
-   }
-#endif
-
    /* keep the old allocation for next time */
 }
 
@@ -160,12 +136,8 @@ static boolean
 lp_vbuf_set_primitive(struct vbuf_render *vbr, unsigned prim)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
+   struct setup_context *setup_ctx = cvbr->setup;
 
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct setup_context *setup_ctx = lp_draw_setup_context(cvbr->llvmpipe->setup);
-   
    llvmpipe_setup_prepare( setup_ctx );
 
    cvbr->llvmpipe->reduced_prim = u_reduced_prim(prim);
@@ -193,14 +165,9 @@ lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
    const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer = cvbr->vertex_buffer;
+   struct setup_context *setup_ctx = cvbr->setup;
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = llvmpipe->setup;
-   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -367,11 +334,6 @@ lp_vbuf_draw(struct vbuf_render *vbr, const ushort *indices, uint nr)
    default:
       assert(0);
    }
-
-   /* XXX: why are we calling this???  If we had to call something, it
-    * would be a function in lp_setup.c:
-    */
-   lp_draw_flush( setup );
 }
 
 
@@ -384,17 +346,12 @@ lp_vbuf_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
    struct llvmpipe_context *llvmpipe = cvbr->llvmpipe;
+   struct setup_context *setup_ctx = cvbr->setup;
    const unsigned stride = llvmpipe->vertex_info_vbuf.size * sizeof(float);
    const void *vertex_buffer =
       (void *) get_vert(cvbr->vertex_buffer, start, stride);
    unsigned i;
 
-   /* XXX: break this dependency - make setup_context live under
-    * llvmpipe, rename the old "setup" draw stage to something else.
-    */
-   struct draw_stage *setup = llvmpipe->setup;
-   struct setup_context *setup_ctx = lp_draw_setup_context(setup);
-
    switch (cvbr->prim) {
    case PIPE_PRIM_POINTS:
       for (i = 0; i < nr; i++) {
@@ -568,40 +525,38 @@ static void
 lp_vbuf_destroy(struct vbuf_render *vbr)
 {
    struct llvmpipe_vbuf_render *cvbr = llvmpipe_vbuf_render(vbr);
-   cvbr->llvmpipe->vbuf_render = NULL;
+   llvmpipe_setup_destroy_context(cvbr->setup);
    FREE(cvbr);
 }
 
 
 /**
- * Initialize the post-transform vertex buffer information for the given
- * context.
+ * Create the post-transform vertex handler for the given context.
  */
-void
-lp_init_vbuf(struct llvmpipe_context *lp)
+struct vbuf_render *
+lp_create_vbuf_backend(struct llvmpipe_context *lp)
 {
-   assert(lp->draw);
+   struct llvmpipe_vbuf_render *cvbr = CALLOC_STRUCT(llvmpipe_vbuf_render);
 
-   lp->vbuf_render = CALLOC_STRUCT(llvmpipe_vbuf_render);
+   assert(lp->draw);
 
-   lp->vbuf_render->base.max_indices = LP_MAX_VBUF_INDEXES;
-   lp->vbuf_render->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
 
-   lp->vbuf_render->base.get_vertex_info = lp_vbuf_get_vertex_info;
-   lp->vbuf_render->base.allocate_vertices = lp_vbuf_allocate_vertices;
-   lp->vbuf_render->base.map_vertices = lp_vbuf_map_vertices;
-   lp->vbuf_render->base.unmap_vertices = lp_vbuf_unmap_vertices;
-   lp->vbuf_render->base.set_primitive = lp_vbuf_set_primitive;
-   lp->vbuf_render->base.draw = lp_vbuf_draw;
-   lp->vbuf_render->base.draw_arrays = lp_vbuf_draw_arrays;
-   lp->vbuf_render->base.release_vertices = lp_vbuf_release_vertices;
-   lp->vbuf_render->base.destroy = lp_vbuf_destroy;
+   cvbr->base.max_indices = LP_MAX_VBUF_INDEXES;
+   cvbr->base.max_vertex_buffer_bytes = LP_MAX_VBUF_SIZE;
 
-   lp->vbuf_render->llvmpipe = lp;
+   cvbr->base.get_vertex_info = lp_vbuf_get_vertex_info;
+   cvbr->base.allocate_vertices = lp_vbuf_allocate_vertices;
+   cvbr->base.map_vertices = lp_vbuf_map_vertices;
+   cvbr->base.unmap_vertices = lp_vbuf_unmap_vertices;
+   cvbr->base.set_primitive = lp_vbuf_set_primitive;
+   cvbr->base.draw = lp_vbuf_draw;
+   cvbr->base.draw_arrays = lp_vbuf_draw_arrays;
+   cvbr->base.release_vertices = lp_vbuf_release_vertices;
+   cvbr->base.destroy = lp_vbuf_destroy;
 
-   lp->vbuf = draw_vbuf_stage(lp->draw, &lp->vbuf_render->base);
+   cvbr->llvmpipe = lp;
 
-   draw_set_rasterize_stage(lp->draw, lp->vbuf);
+   cvbr->setup = llvmpipe_setup_create_context(cvbr->llvmpipe);
 
-   draw_set_render(lp->draw, &lp->vbuf_render->base);
+   return &cvbr->base;
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
index 6c4e6063e6d..0676e2f42ac 100644
--- a/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
+++ b/src/gallium/drivers/llvmpipe/lp_prim_vbuf.h
@@ -31,8 +31,8 @@
 
 struct llvmpipe_context;
 
-extern void
-lp_init_vbuf(struct llvmpipe_context *llvmpipe);
+extern struct vbuf_render *
+lp_create_vbuf_backend(struct llvmpipe_context *llvmpipe);
 
 
 #endif /* LP_VBUF_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_rasterizer.c b/src/gallium/drivers/llvmpipe/lp_rasterizer.c
new file mode 100644
index 00000000000..089ea597292
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rasterizer.c
@@ -0,0 +1,157 @@
+
+struct lp_rasterizer {
+
+   /* We can choose whatever layout for the internal tile storage we
+    * prefer:
+    */
+   struct {
+      unsigned color[TILESIZE][TILESIZE];
+      unsigned depth[TILESIZE][TILESIZE];
+      char stencil[TILESIZE][TILESIZE];
+   } tile;
+
+      
+   unsigned x;
+   unsigned y;
+
+   
+   struct {
+      struct pipe_surface *color;
+      struct pipe_surface *zstencil;
+      unsigned clear_color;
+      unsigned clear_depth;
+      char clear_stencil;
+   } state;
+};
+
+struct lp_rasterizer *lp_rast_create( void )
+{
+   return CALLOC_STRUCT(lp_rasterizer);
+}
+
+/* Bind color/zstencil surfaces and store the packed clear values. */
+void lp_rast_bind_surfaces( struct lp_rasterizer *rast,
+			    struct pipe_surface *color,
+			    struct pipe_surface *zstencil,
+			    const float *clear_color,
+			    double clear_depth, unsigned clear_stencil)
+{
+   pipe_surface_reference(&rast->state.color, color);
+   pipe_surface_reference(&rast->state.zstencil, zstencil);
+   rast->state.clear_color = util_pack_8888(clear_color);
+   rast->state.clear_depth = clear_depth * 0xffffffff;
+   rast->state.clear_stencil = clear_stencil;
+}
+
+/* Beginning of each tile:
+ */
+/* Remember the x/y position of the tile about to be processed. */
+void lp_rast_start_tile( struct lp_rasterizer *rast,
+			 unsigned x, unsigned y )
+{
+   rast->x = x;
+   rast->y = y;
+}
+
+/* Fill the tile's color plane with the bound packed clear color. */
+void lp_rast_clear_color( struct lp_rasterizer *rast )
+{
+   const unsigned clear_color = rast->state.clear_color;
+   unsigned i, j;
+   for (i = 0; i < TILESIZE; i++)
+      for (j = 0; j < TILESIZE; j++)
+	 rast->tile.color[i][j] = clear_color;
+}
+
+/* Fill the tile's depth plane with the bound clear depth value. */
+void lp_rast_clear_depth( struct lp_rasterizer *rast )
+{
+   const unsigned clear_depth = rast->state.clear_depth;
+   unsigned i, j;
+   for (i = 0; i < TILESIZE; i++)
+      for (j = 0; j < TILESIZE; j++)
+	 rast->tile.depth[i][j] = clear_depth;
+}
+
+void lp_rast_clear_stencil( struct lp_rasterizer *rast )
+{
+   const unsigned clear_stencil = rast->state.clear_stencil;
+
+   memset(rast->tile.stencil, clear_stencil, sizeof rast->tile.stencil );
+}
+
+void lp_rast_load_color( struct lp_rasterizer *rast )
+{
+   /* call u_tile func to load colors from surface */
+}
+
+void lp_rast_load_zstencil( struct lp_rasterizer *rast )
+{
+   /* call u_tile func to load depth (and stencil?) from surface */
+}
+
+/* Within a tile:
+ */
+void lp_rast_set_state( struct lp_rasterizer *rast,
+		       const struct lp_rast_state *state )
+{
+   rast->shader_state = state;
+}
+
+void lp_rast_triangle( struct lp_rasterizer *rast,
+		       const struct lp_rast_triangle *inputs )
+{
+   /* Set up the silly quad coef pointers
+    */
+   for (i = 0; i < 4; i++) {
+      rast->quads[i].posCoef = inputs->posCoef;
+      rast->quads[i].coef = inputs->coef;
+   }
+
+   /* Scan the tile in 4x4 chunks (?) and figure out which bits to
+    * rasterize:
+    */
+
+}
+
+void lp_rast_shade_tile( struct lp_rasterizer *rast,
+			 const struct lp_rast_shader_inputs *inputs )
+{
+   /* Set up the silly quad coef pointers
+    */
+   for (i = 0; i < 4; i++) {
+      rast->quads[i].posCoef = inputs->posCoef;
+      rast->quads[i].coef = inputs->coef;
+   }
+
+   /* Use the existing preference for 8x2 (four quads) shading:
+    */
+   for (i = 0; i < TILESIZE; i += 8) {
+      for (j = 0; j < TILESIZE; j += 2) {
+	 rast->shader_state.shade( inputs->jc,
+				   rast->x + i,
+				   rast->y + j,
+				   rast->quads, 4 );
+      }
+   }
+}
+
+/* End of tile:
+ */
+void lp_rast_store_color( struct lp_rasterizer *rast )
+{
+   /* call u_tile func to store colors to surface */
+}
+
+void lp_rast_store_zstencil( struct lp_rasterizer *rast )
+{
+   /* call u_tile func to store depth/stencil to surface */
+}
+
+/* Shutdown:
+ */
+void lp_rast_destroy( struct lp_rasterizer *rast )
+{
+   FREE(rast);
+}
+
diff --git a/src/gallium/drivers/llvmpipe/lp_rasterizer.h b/src/gallium/drivers/llvmpipe/lp_rasterizer.h
new file mode 100644
index 00000000000..b3ae06a1169
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_rasterizer.h
@@ -0,0 +1,112 @@
+
+/* Initially create and program a single rasterizer directly.  Later
+ * will want multiple of these, one or two per core.  At that stage
+ * will probably pass command buffers into the rasterizers rather than
+ * individual function calls like this.
+ */
+struct lp_rasterizer;
+
+struct lp_rast_state {
+   /* State:
+    */
+   struct lp_jit_context jc;
+   
+   /* Shader itself:
+    */
+};
+
+/* Coefficients necessary to run the shader at a given location:
+ */
+struct lp_rast_shader_inputs {
+
+   /* Current rasterizer state:
+    */
+   const struct lp_rast_state *state;
+
+   /* Attribute interpolation:
+    */
+   float oneoverarea;
+   float x1;
+   float y1;
+
+   struct tgsi_interp_coef position_coef;
+   struct tgsi_interp_coef *coef;
+};
+
+
+/* Rasterization information for a triangle known to be in this bin,
+ * plus inputs to run the shader:
+ */
+struct lp_rast_triangle {
+   /* one-pixel sized trivial accept offsets for each plane */
+   float ei1;                   
+   float ei2;
+   float ei3;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   float eo1;                   
+   float eo2;
+   float eo3;
+
+   /* y deltas for vertex pairs */
+   float dy12;
+   float dy23;
+   float dy31;
+
+   /* x deltas for vertex pairs */
+   float dx12;
+   float dx23;
+   float dx31;
+
+   /* State to run the shader: */
+   struct lp_rast_shader_inputs inputs;
+};
+
+
+
+struct lp_rasterizer *lp_rast_create( void );
+
+void lp_rast_bind_surfaces( struct lp_rasterizer *,
+			    struct pipe_surface *color,
+			    struct pipe_surface *zstencil,
+			    const float *clear_color,
+			    double clear_depth,
+			    unsigned clear_stencil);
+
+/* Beginning of each tile:
+ */
+void lp_rast_start_tile( struct lp_rasterizer *,
+			 unsigned x,
+			 unsigned y );
+
+void lp_rast_clear_color( struct lp_rasterizer * );
+
+void lp_rast_clear_zstencil( struct lp_rasterizer * );
+
+void lp_rast_load_color( struct lp_rasterizer * );
+
+void lp_rast_load_zstencil( struct lp_rasterizer * );
+
+
+/* Within a tile:
+ */
+void lp_rast_set_state( struct lp_rasterizer *,
+		       const struct lp_rast_state * );
+
+void lp_rast_triangle( struct lp_rasterizer *,
+		       const struct lp_rast_triangle * );
+
+void lp_rast_shade_tile( struct lp_rasterizer *,
+			 const struct lp_rast_shader_inputs * );
+
+/* End of tile:
+ */
+void lp_rast_store_color( struct lp_rasterizer * );
+
+void lp_rast_store_zstencil( struct lp_rasterizer * );
+
+
+/* Shutdown:
+ */
+void lp_rast_destroy( struct lp_rasterizer * );
+
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 60107214df5..8c67524506e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -26,15 +26,15 @@
  **************************************************************************/
 
 /**
- * \brief  Primitive rasterization/rendering (points, lines, triangles)
+ * \brief  Primitive rasterization/rendering (points, lines)
  *
  * \author  Keith Whitwell <keith@tungstengraphics.com>
  * \author  Brian Paul
  */
 
 #include "lp_context.h"
-#include "lp_prim_setup.h"
 #include "lp_quad.h"
+#include "lp_quad_pipe.h"
 #include "lp_setup.h"
 #include "lp_state.h"
 #include "draw/draw_context.h"
@@ -44,1397 +44,49 @@
 #include "pipe/p_thread.h"
 #include "util/u_math.h"
 #include "util/u_memory.h"
-#include "lp_bld_debug.h"
-#include "lp_tile_cache.h"
-#include "lp_tile_soa.h"
 
 
 #define DEBUG_VERTS 0
-#define DEBUG_FRAGS 0
 
-/**
- * Triangle edge info
- */
-struct edge {
-   float dx;		/**< X(v1) - X(v0), used only during setup */
-   float dy;		/**< Y(v1) - Y(v0), used only during setup */
-   float dxdy;		/**< dx/dy */
-   float sx, sy;	/**< first sample point coord */
-   int lines;		/**< number of lines on this edge */
-};
-
-
-#define MAX_QUADS 16
-
-
-/**
- * Triangle setup info (derived from draw_stage).
- * Also used for line drawing (taking some liberties).
- */
-struct setup_context {
-   struct llvmpipe_context *llvmpipe;
-
-   /* Vertices are just an array of floats making up each attribute in
-    * turn.  Currently fixed at 4 floats, but should change in time.
-    * Codegen will help cope with this.
-    */
-   const float (*vmax)[4];
-   const float (*vmid)[4];
-   const float (*vmin)[4];
-   const float (*vprovoke)[4];
-
-   struct edge ebot;
-   struct edge etop;
-   struct edge emaj;
-
-   float oneoverarea;
-   int facing;
-
-   struct quad_header quad[MAX_QUADS];
-   struct quad_header *quad_ptrs[MAX_QUADS];
-   unsigned count;
-
-   struct quad_interp_coef coef;
-
-   struct {
-      int left[2];   /**< [0] = row0, [1] = row1 */
-      int right[2];
-      int y;
-   } span;
-
-#if DEBUG_FRAGS
-   uint numFragsEmitted;  /**< per primitive */
-   uint numFragsWritten;  /**< per primitive */
-#endif
-
-   unsigned winding;		/* which winding to cull */
-};
-
-
-
-/**
- * Execute fragment shader for the four fragments in the quad.
- */
-static void
-shade_quads(struct llvmpipe_context *llvmpipe,
-            struct quad_header *quads[],
-            unsigned nr)
-{
-   struct lp_fragment_shader *fs = llvmpipe->fs;
-   struct quad_header *quad = quads[0];
-   const unsigned x = quad->input.x0;
-   const unsigned y = quad->input.y0;
-   uint8_t *tile;
-   uint8_t *color;
-   void *depth;
-   uint32_t ALIGN16_ATTRIB mask[4][NUM_CHANNELS];
-   unsigned chan_index;
-   unsigned q;
-
-   assert(fs->current);
-   if(!fs->current)
-      return;
-
-   /* Sanity checks */
-   assert(nr * QUAD_SIZE == TILE_VECTOR_HEIGHT * TILE_VECTOR_WIDTH);
-   assert(x % TILE_VECTOR_WIDTH == 0);
-   assert(y % TILE_VECTOR_HEIGHT == 0);
-   for (q = 0; q < nr; ++q) {
-      assert(quads[q]->input.x0 == x + q*2);
-      assert(quads[q]->input.y0 == y);
-   }
-
-   /* mask */
-   for (q = 0; q < 4; ++q)
-      for (chan_index = 0; chan_index < NUM_CHANNELS; ++chan_index)
-         mask[q][chan_index] = quads[q]->inout.mask & (1 << chan_index) ? ~0 : 0;
-
-   /* color buffer */
-   if(llvmpipe->framebuffer.nr_cbufs >= 1 &&
-      llvmpipe->framebuffer.cbufs[0]) {
-      tile = lp_get_cached_tile(llvmpipe->cbuf_cache[0], x, y);
-      color = &TILE_PIXEL(tile, x & (TILE_SIZE-1), y & (TILE_SIZE-1), 0);
-   }
-   else
-      color = NULL;
-
-   /* depth buffer */
-   if(llvmpipe->zsbuf_map) {
-      assert((x % 2) == 0);
-      assert((y % 2) == 0);
-      depth = llvmpipe->zsbuf_map +
-              y*llvmpipe->zsbuf_transfer->stride +
-              2*x*llvmpipe->zsbuf_transfer->block.size;
-   }
-   else
-      depth = NULL;
-
-   /* XXX: This will most likely fail on 32bit x86 without -mstackrealign */
-   assert(lp_check_alignment(mask, 16));
-
-   assert(lp_check_alignment(depth, 16));
-   assert(lp_check_alignment(color, 16));
-   assert(lp_check_alignment(llvmpipe->jit_context.blend_color, 16));
-
-   /* run shader */
-   fs->current->jit_function( &llvmpipe->jit_context,
-                              x, y,
-                              quad->coef->a0,
-                              quad->coef->dadx,
-                              quad->coef->dady,
-                              &mask[0][0],
-                              color,
-                              depth);
-}
-
-
-
-
-/**
- * Do triangle cull test using tri determinant (sign indicates orientation)
- * \return true if triangle is to be culled.
- */
-static INLINE boolean
-cull_tri(const struct setup_context *setup, float det)
-{
-   if (det != 0) {   
-      /* if (det < 0 then Z points toward camera and triangle is 
-       * counter-clockwise winding.
-       */
-      unsigned winding = (det < 0) ? PIPE_WINDING_CCW : PIPE_WINDING_CW;
-
-      if ((winding & setup->winding) == 0)
-	 return FALSE;
-   }
-
-   /* Culled:
-    */
-   return TRUE;
-}
-
-
-
-/**
- * Clip setup->quad against the scissor/surface bounds.
- */
-static INLINE void
-quad_clip( struct setup_context *setup, struct quad_header *quad )
-{
-   const struct pipe_scissor_state *cliprect = &setup->llvmpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-
-   if (quad->input.x0 >= maxx ||
-       quad->input.y0 >= maxy ||
-       quad->input.x0 + 1 < minx ||
-       quad->input.y0 + 1 < miny) {
-      /* totally clipped */
-      quad->inout.mask = 0x0;
-      return;
-   }
-   if (quad->input.x0 < minx)
-      quad->inout.mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-   if (quad->input.y0 < miny)
-      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-   if (quad->input.x0 == maxx - 1)
-      quad->inout.mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-   if (quad->input.y0 == maxy - 1)
-      quad->inout.mask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-}
-
-
-
-/**
- * Given an X or Y coordinate, return the block/quad coordinate that it
- * belongs to.
- */
-static INLINE int block( int x )
-{
-   return x & ~(2-1);
-}
-
-static INLINE int block_x( int x )
-{
-   return x & ~(TILE_VECTOR_WIDTH - 1);
-}
-
-
-/**
- * Emit a quad (pass to next stage) with clipping.
- */
-static INLINE void
-clip_emit_quad( struct setup_context *setup, struct quad_header *quad )
-{
-   quad_clip( setup, quad );
-
-   if (quad->inout.mask) {
-      struct llvmpipe_context *lp = setup->llvmpipe;
-
-#if 1
-      /* XXX: The blender expects 4 quads. This is far from efficient, but
-       * until we codegenerate single-quad variants of the fragment pipeline
-       * we need this hack. */
-      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-      struct quad_header quads[nr_quads];
-      struct quad_header *quad_ptrs[nr_quads];
-      int x0 = block_x(quad->input.x0);
-      unsigned i;
-
-      for(i = 0; i < nr_quads; ++i) {
-         int x = x0 + 2*i;
-         if(x == quad->input.x0)
-            memcpy(&quads[i], quad, sizeof quads[i]);
-         else {
-            memset(&quads[i], 0, sizeof quads[i]);
-            quads[i].input.x0 = x;
-            quads[i].input.y0 = quad->input.y0;
-            quads[i].coef = quad->coef;
-         }
-         quad_ptrs[i] = &quads[i];
-      }
-
-      shade_quads( lp, quad_ptrs, nr_quads );
-#else
-      shade_quads( lp, &quad, 1 );
-#endif
-   }
-}
-
-
-/**
- * Render a horizontal span of quads
- */
-static void flush_spans( struct setup_context *setup )
-{
-   const int step = TILE_VECTOR_WIDTH;
-   const int xleft0 = setup->span.left[0];
-   const int xleft1 = setup->span.left[1];
-   const int xright0 = setup->span.right[0];
-   const int xright1 = setup->span.right[1];
-
-
-   int minleft = block_x(MIN2(xleft0, xleft1));
-   int maxright = MAX2(xright0, xright1);
-   int x;
-
-   for (x = minleft; x < maxright; x += step) {
-      unsigned skip_left0 = CLAMP(xleft0 - x, 0, step);
-      unsigned skip_left1 = CLAMP(xleft1 - x, 0, step);
-      unsigned skip_right0 = CLAMP(x + step - xright0, 0, step);
-      unsigned skip_right1 = CLAMP(x + step - xright1, 0, step);
-      unsigned lx = x;
-      const unsigned nr_quads = TILE_VECTOR_HEIGHT*TILE_VECTOR_WIDTH/QUAD_SIZE;
-      unsigned q = 0;
-
-      unsigned skipmask_left0 = (1U << skip_left0) - 1U;
-      unsigned skipmask_left1 = (1U << skip_left1) - 1U;
-
-      /* These calculations fail when step == 32 and skip_right == 0.
-       */
-      unsigned skipmask_right0 = ~0U << (unsigned)(step - skip_right0);
-      unsigned skipmask_right1 = ~0U << (unsigned)(step - skip_right1);
-
-      unsigned mask0 = ~skipmask_left0 & ~skipmask_right0;
-      unsigned mask1 = ~skipmask_left1 & ~skipmask_right1;
-
-      if (mask0 | mask1) {
-         for(q = 0; q < nr_quads; ++q) {
-            unsigned quadmask = (mask0 & 3) | ((mask1 & 3) << 2);
-            setup->quad[q].input.x0 = lx;
-            setup->quad[q].input.y0 = setup->span.y;
-            setup->quad[q].inout.mask = quadmask;
-            setup->quad_ptrs[q] = &setup->quad[q];
-            mask0 >>= 2;
-            mask1 >>= 2;
-            lx += 2;
-         }
-         assert(!(mask0 | mask1));
-
-         shade_quads(setup->llvmpipe, setup->quad_ptrs, nr_quads );
-      }
-   }
-
-
-   setup->span.y = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   setup->span.left[0] = 1000000;     /* greater than right[0] */
-   setup->span.left[1] = 1000000;     /* greater than right[1] */
-}
-
-
-#if DEBUG_VERTS
-static void print_vertex(const struct setup_context *setup,
-                         const float (*v)[4])
-{
-   int i;
-   debug_printf("   Vertex: (%p)\n", v);
-   for (i = 0; i < setup->quad[0].nr_attrs; i++) {
-      debug_printf("     %d: %f %f %f %f\n",  i,
-              v[i][0], v[i][1], v[i][2], v[i][3]);
-      if (util_is_inf_or_nan(v[i][0])) {
-         debug_printf("   NaN!\n");
-      }
-   }
-}
-#endif
-
-/**
- * Sort the vertices from top to bottom order, setting up the triangle
- * edge fields (ebot, emaj, etop).
- * \return FALSE if coords are inf/nan (cull the tri), TRUE otherwise
- */
-static boolean setup_sort_vertices( struct setup_context *setup,
-                                    float det,
-                                    const float (*v0)[4],
-                                    const float (*v1)[4],
-                                    const float (*v2)[4] )
-{
-   setup->vprovoke = v2;
-
-   /* determine bottom to top order of vertices */
-   {
-      float y0 = v0[0][1];
-      float y1 = v1[0][1];
-      float y2 = v2[0][1];
-      if (y0 <= y1) {
-	 if (y1 <= y2) {
-	    /* y0<=y1<=y2 */
-	    setup->vmin = v0;
-	    setup->vmid = v1;
-	    setup->vmax = v2;
-	 }
-	 else if (y2 <= y0) {
-	    /* y2<=y0<=y1 */
-	    setup->vmin = v2;
-	    setup->vmid = v0;
-	    setup->vmax = v1;
-	 }
-	 else {
-	    /* y0<=y2<=y1 */
-	    setup->vmin = v0;
-	    setup->vmid = v2;
-	    setup->vmax = v1;
-	 }
-      }
-      else {
-	 if (y0 <= y2) {
-	    /* y1<=y0<=y2 */
-	    setup->vmin = v1;
-	    setup->vmid = v0;
-	    setup->vmax = v2;
-	 }
-	 else if (y2 <= y1) {
-	    /* y2<=y1<=y0 */
-	    setup->vmin = v2;
-	    setup->vmid = v1;
-	    setup->vmax = v0;
-	 }
-	 else {
-	    /* y1<=y2<=y0 */
-	    setup->vmin = v1;
-	    setup->vmid = v2;
-	    setup->vmax = v0;
-	 }
-      }
-   }
-
-   setup->ebot.dx = setup->vmid[0][0] - setup->vmin[0][0];
-   setup->ebot.dy = setup->vmid[0][1] - setup->vmin[0][1];
-   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
-   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
-   setup->etop.dx = setup->vmax[0][0] - setup->vmid[0][0];
-   setup->etop.dy = setup->vmax[0][1] - setup->vmid[0][1];
-
-   /*
-    * Compute triangle's area.  Use 1/area to compute partial
-    * derivatives of attributes later.
-    *
-    * The area will be the same as prim->det, but the sign may be
-    * different depending on how the vertices get sorted above.
-    *
-    * To determine whether the primitive is front or back facing we
-    * use the prim->det value because its sign is correct.
-    */
-   {
-      const float area = (setup->emaj.dx * setup->ebot.dy -
-			    setup->ebot.dx * setup->emaj.dy);
-
-      setup->oneoverarea = 1.0f / area;
-
-      /*
-      debug_printf("%s one-over-area %f  area %f  det %f\n",
-                   __FUNCTION__, setup->oneoverarea, area, det );
-      */
-      if (util_is_inf_or_nan(setup->oneoverarea))
-         return FALSE;
-   }
-
-   /* We need to know if this is a front or back-facing triangle for:
-    *  - the GLSL gl_FrontFacing fragment attribute (bool)
-    *  - two-sided stencil test
-    */
-   setup->facing = 
-      ((det > 0.0) ^ 
-       (setup->llvmpipe->rasterizer->front_winding == PIPE_WINDING_CW));
-
-   return TRUE;
-}
-
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static void tri_pos_coeff( struct setup_context *setup,
-                           uint vertSlot, unsigned i)
-{
-   float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
-   float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-   float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-   float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-   float dadx = a * setup->oneoverarea;
-   float dady = b * setup->oneoverarea;
-
-   assert(i <= 3);
-
-   setup->coef.dadx[0][i] = dadx;
-   setup->coef.dady[0][i] = dady;
-
-   /* calculate a0 as the value which would be sampled for the
-    * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
-    *
-    * this is neat but unfortunately not a good way to do things for
-    * triangles with very large values of dadx or dady as it will
-    * result in the subtraction and re-addition from a0 of a very
-    * large number, which means we'll end up loosing a lot of the
-    * fractional bits and precision from a0.  the way to fix this is
-    * to define a0 as the sample at a pixel center somewhere near vmin
-    * instead - i'll switch to this later.
-    */
-   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - 0.5f) +
-                            dady * (setup->vmin[0][1] - 0.5f)));
-
-   /*
-   debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-                slot, "xyzw"[i],
-                setup->coef[slot].a0[i],
-                setup->coef[slot].dadx[i],
-                setup->coef[slot].dady[i]);
-   */
-}
-
-
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
- * \param slot  which attribute slot
- * \param i  which component of the slot (0..3)
- */
-static void const_pos_coeff( struct setup_context *setup,
-                             uint vertSlot, unsigned i)
-{
-   setup->coef.dadx[0][i] = 0;
-   setup->coef.dady[0][i] = 0;
-
-   /* need provoking vertex info!
-    */
-   setup->coef.a0[0][i] = setup->vprovoke[vertSlot][i];
-}
-
-
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- * The value value comes from vertex[slot][i].
- * The result will be put into setup->coef[slot].a0[i].
- * \param slot  which attribute slot
- * \param i  which component of the slot (0..3)
- */
-static void const_coeff( struct setup_context *setup,
-                         unsigned attrib,
-                         uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      setup->coef.dadx[1 + attrib][i] = 0;
-      setup->coef.dady[1 + attrib][i] = 0;
-
-      /* need provoking vertex info!
-       */
-      setup->coef.a0[1 + attrib][i] = setup->vprovoke[vertSlot][i];
-   }
-}
-
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a triangle.
- */
-static void tri_linear_coeff( struct setup_context *setup,
-                              unsigned attrib,
-                              uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      float botda = setup->vmid[vertSlot][i] - setup->vmin[vertSlot][i];
-      float majda = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-      float dadx = a * setup->oneoverarea;
-      float dady = b * setup->oneoverarea;
-
-      assert(i <= 3);
-
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-
-      /* calculate a0 as the value which would be sampled for the
-       * fragment at (0,0), taking into account that we want to sample at
-       * pixel centers, in other words (0.5, 0.5).
-       *
-       * this is neat but unfortunately not a good way to do things for
-       * triangles with very large values of dadx or dady as it will
-       * result in the subtraction and re-addition from a0 of a very
-       * large number, which means we'll end up loosing a lot of the
-       * fractional bits and precision from a0.  the way to fix this is
-       * to define a0 as the sample at a pixel center somewhere near vmin
-       * instead - i'll switch to this later.
-       */
-      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
-
-      /*
-      debug_printf("attr[%d].%c: %f dx:%f dy:%f\n",
-                   slot, "xyzw"[i],
-                   setup->coef[slot].a0[i],
-                   setup->coef[slot].dadx[i],
-                   setup->coef[slot].dady[i]);
-      */
-   }
-}
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void tri_persp_coeff( struct setup_context *setup,
-                             unsigned attrib,
-                             uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      /* premultiply by 1/w  (v[0][3] is always W):
-       */
-      float mina = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-      float mida = setup->vmid[vertSlot][i] * setup->vmid[0][3];
-      float maxa = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-      float botda = mida - mina;
-      float majda = maxa - mina;
-      float a = setup->ebot.dy * majda - botda * setup->emaj.dy;
-      float b = setup->emaj.dx * botda - majda * setup->ebot.dx;
-      float dadx = a * setup->oneoverarea;
-      float dady = b * setup->oneoverarea;
-
-      /*
-      debug_printf("tri persp %d,%d: %f %f %f\n", vertSlot, i,
-                   setup->vmin[vertSlot][i],
-                   setup->vmid[vertSlot][i],
-                   setup->vmax[vertSlot][i]
-             );
-      */
-      assert(i <= 3);
-
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-      setup->coef.a0[1 + attrib][i] = (mina -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
-   }
-}
-
-
-/**
- * Special coefficient setup for gl_FragCoord.
- * X and Y are trivial, though Y has to be inverted for OpenGL.
- * Z and W are copied from posCoef which should have already been computed.
- * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
- */
-static void
-setup_fragcoord_coeff(struct setup_context *setup, uint slot)
-{
-   /*X*/
-   setup->coef.a0[1 + slot][0] = 0;
-   setup->coef.dadx[1 + slot][0] = 1.0;
-   setup->coef.dady[1 + slot][0] = 0.0;
-   /*Y*/
-   setup->coef.a0[1 + slot][1] = 0.0;
-   setup->coef.dadx[1 + slot][1] = 0.0;
-   setup->coef.dady[1 + slot][1] = 1.0;
-   /*Z*/
-   setup->coef.a0[1 + slot][2] = setup->coef.a0[0][2];
-   setup->coef.dadx[1 + slot][2] = setup->coef.dadx[0][2];
-   setup->coef.dady[1 + slot][2] = setup->coef.dady[0][2];
-   /*W*/
-   setup->coef.a0[1 + slot][3] = setup->coef.a0[0][3];
-   setup->coef.dadx[1 + slot][3] = setup->coef.dadx[0][3];
-   setup->coef.dady[1 + slot][3] = setup->coef.dady[0][3];
-}
-
-
-
-/**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmid,vmax,vprovoke are initialized.
- */
-static void setup_tri_coefficients( struct setup_context *setup )
-{
-   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-   uint fragSlot;
-
-   /* z and w are done by linear interpolation:
-    */
-   tri_pos_coeff(setup, 0, 2);
-   tri_pos_coeff(setup, 0, 3);
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         const_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_LINEAR:
-         tri_linear_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_PERSPECTIVE:
-         tri_persp_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_POS:
-         setup_fragcoord_coeff(setup, fragSlot);
-         break;
-      default:
-         assert(0);
-      }
-
-      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-         setup->coef.dadx[1 + fragSlot][0] = 0.0;
-         setup->coef.dady[1 + fragSlot][0] = 0.0;
-      }
-   }
-}
-
-
-
-static void setup_tri_edges( struct setup_context *setup )
-{
-   float vmin_x = setup->vmin[0][0] + 0.5f;
-   float vmid_x = setup->vmid[0][0] + 0.5f;
-
-   float vmin_y = setup->vmin[0][1] - 0.5f;
-   float vmid_y = setup->vmid[0][1] - 0.5f;
-   float vmax_y = setup->vmax[0][1] - 0.5f;
-
-   setup->emaj.sy = ceilf(vmin_y);
-   setup->emaj.lines = (int) ceilf(vmax_y - setup->emaj.sy);
-   setup->emaj.dxdy = setup->emaj.dx / setup->emaj.dy;
-   setup->emaj.sx = vmin_x + (setup->emaj.sy - vmin_y) * setup->emaj.dxdy;
-
-   setup->etop.sy = ceilf(vmid_y);
-   setup->etop.lines = (int) ceilf(vmax_y - setup->etop.sy);
-   setup->etop.dxdy = setup->etop.dx / setup->etop.dy;
-   setup->etop.sx = vmid_x + (setup->etop.sy - vmid_y) * setup->etop.dxdy;
-
-   setup->ebot.sy = ceilf(vmin_y);
-   setup->ebot.lines = (int) ceilf(vmid_y - setup->ebot.sy);
-   setup->ebot.dxdy = setup->ebot.dx / setup->ebot.dy;
-   setup->ebot.sx = vmin_x + (setup->ebot.sy - vmin_y) * setup->ebot.dxdy;
-}
-
-
-/**
- * Render the upper or lower half of a triangle.
- * Scissoring/cliprect is applied here too.
- */
-static void subtriangle( struct setup_context *setup,
-			 struct edge *eleft,
-			 struct edge *eright,
-			 unsigned lines )
-{
-   const struct pipe_scissor_state *cliprect = &setup->llvmpipe->cliprect;
-   const int minx = (int) cliprect->minx;
-   const int maxx = (int) cliprect->maxx;
-   const int miny = (int) cliprect->miny;
-   const int maxy = (int) cliprect->maxy;
-   int y, start_y, finish_y;
-   int sy = (int)eleft->sy;
-
-   assert((int)eleft->sy == (int) eright->sy);
-
-   /* clip top/bottom */
-   start_y = sy;
-   if (start_y < miny)
-      start_y = miny;
-
-   finish_y = sy + lines;
-   if (finish_y > maxy)
-      finish_y = maxy;
-
-   start_y -= sy;
-   finish_y -= sy;
-
-   /*
-   debug_printf("%s %d %d\n", __FUNCTION__, start_y, finish_y);
-   */
-
-   for (y = start_y; y < finish_y; y++) {
-
-      /* avoid accumulating adds as floats don't have the precision to
-       * accurately iterate large triangle edges that way.  luckily we
-       * can just multiply these days.
-       *
-       * this is all drowned out by the attribute interpolation anyway.
-       */
-      int left = (int)(eleft->sx + y * eleft->dxdy);
-      int right = (int)(eright->sx + y * eright->dxdy);
-
-      /* clip left/right */
-      if (left < minx)
-         left = minx;
-      if (right > maxx)
-         right = maxx;
-
-      if (left < right) {
-         int _y = sy + y;
-         if (block(_y) != setup->span.y) {
-            flush_spans(setup);
-            setup->span.y = block(_y);
-         }
-
-         setup->span.left[_y&1] = left;
-         setup->span.right[_y&1] = right;
-      }
-   }
-
-
-   /* save the values so that emaj can be restarted:
-    */
-   eleft->sx += lines * eleft->dxdy;
-   eright->sx += lines * eright->dxdy;
-   eleft->sy += lines;
-   eright->sy += lines;
-}
-
-
-/**
- * Recalculate prim's determinant.  This is needed as we don't have
- * get this information through the vbuf_render interface & we must
- * calculate it here.
- */
-static float
-calc_det( const float (*v0)[4],
-          const float (*v1)[4],
-          const float (*v2)[4] )
-{
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0][0] - v2[0][0];
-   const float ey = v0[0][1] - v2[0][1];
-   const float fx = v1[0][0] - v2[0][0];
-   const float fy = v1[0][1] - v2[0][1];
-
-   /* det = cross(e,f).z */
-   return ex * fy - ey * fx;
-}
-
-
-/**
- * Do setup for triangle rasterization, then render the triangle.
- */
-void llvmpipe_setup_tri( struct setup_context *setup,
-                const float (*v0)[4],
-                const float (*v1)[4],
-                const float (*v2)[4] )
-{
-   float det;
-
-#if DEBUG_VERTS
-   debug_printf("Setup triangle:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-   print_vertex(setup, v2);
-#endif
-
-   if (setup->llvmpipe->no_rast)
-      return;
-   
-   det = calc_det(v0, v1, v2);
-   /*
-   debug_printf("%s\n", __FUNCTION__ );
-   */
-
-#if DEBUG_FRAGS
-   setup->numFragsEmitted = 0;
-   setup->numFragsWritten = 0;
-#endif
-
-   if (cull_tri( setup, det ))
-      return;
-
-   if (!setup_sort_vertices( setup, det, v0, v1, v2 ))
-      return;
-   setup_tri_coefficients( setup );
-   setup_tri_edges( setup );
-
-   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_TRIANGLES);
-
-   setup->span.y = 0;
-   setup->span.right[0] = 0;
-   setup->span.right[1] = 0;
-   /*   setup->span.z_mode = tri_z_mode( setup->ctx ); */
-
-   /*   init_constant_attribs( setup ); */
-
-   if (setup->oneoverarea < 0.0) {
-      /* emaj on left:
-       */
-      subtriangle( setup, &setup->emaj, &setup->ebot, setup->ebot.lines );
-      subtriangle( setup, &setup->emaj, &setup->etop, setup->etop.lines );
-   }
-   else {
-      /* emaj on right:
-       */
-      subtriangle( setup, &setup->ebot, &setup->emaj, setup->ebot.lines );
-      subtriangle( setup, &setup->etop, &setup->emaj, setup->etop.lines );
-   }
-
-   flush_spans( setup );
-
-#if DEBUG_FRAGS
-   printf("Tri: %u frags emitted, %u written\n",
-          setup->numFragsEmitted,
-          setup->numFragsWritten);
-#endif
-}
-
-
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a line.
- */
-static void
-linear_pos_coeff(struct setup_context *setup,
-                 uint vertSlot, uint i)
-{
-   const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-   const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-   const float dady = da * setup->emaj.dy * setup->oneoverarea;
-   setup->coef.dadx[0][i] = dadx;
-   setup->coef.dady[0][i] = dady;
-   setup->coef.a0[0][i] = (setup->vmin[vertSlot][i] -
-                           (dadx * (setup->vmin[0][0] - 0.5f) +
-                            dady * (setup->vmin[0][1] - 0.5f)));
-}
-
-
-/**
- * Compute a0, dadx and dady for a linearly interpolated coefficient,
- * for a line.
- */
-static void
-line_linear_coeff(struct setup_context *setup,
-                  unsigned attrib,
-                  uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      const float da = setup->vmax[vertSlot][i] - setup->vmin[vertSlot][i];
-      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-      const float dady = da * setup->emaj.dy * setup->oneoverarea;
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
-   }
-}
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a line.
- */
-static void
-line_persp_coeff(struct setup_context *setup,
-                 unsigned attrib,
-                 uint vertSlot)
-{
-   unsigned i;
-   for (i = 0; i < NUM_CHANNELS; ++i) {
-      /* XXX double-check/verify this arithmetic */
-      const float a0 = setup->vmin[vertSlot][i] * setup->vmin[0][3];
-      const float a1 = setup->vmax[vertSlot][i] * setup->vmax[0][3];
-      const float da = a1 - a0;
-      const float dadx = da * setup->emaj.dx * setup->oneoverarea;
-      const float dady = da * setup->emaj.dy * setup->oneoverarea;
-      setup->coef.dadx[1 + attrib][i] = dadx;
-      setup->coef.dady[1 + attrib][i] = dady;
-      setup->coef.a0[1 + attrib][i] = (setup->vmin[vertSlot][i] -
-                     (dadx * (setup->vmin[0][0] - 0.5f) +
-                      dady * (setup->vmin[0][1] - 0.5f)));
-   }
-}
-
-
-/**
- * Compute the setup->coef[] array dadx, dady, a0 values.
- * Must be called after setup->vmin,vmax are initialized.
- */
-static INLINE boolean
-setup_line_coefficients(struct setup_context *setup,
-                        const float (*v0)[4],
-                        const float (*v1)[4])
-{
-   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-   uint fragSlot;
-   float area;
-
-   /* use setup->vmin, vmax to point to vertices */
-   if (llvmpipe->rasterizer->flatshade_first)
-      setup->vprovoke = v0;
-   else
-      setup->vprovoke = v1;
-   setup->vmin = v0;
-   setup->vmax = v1;
-
-   setup->emaj.dx = setup->vmax[0][0] - setup->vmin[0][0];
-   setup->emaj.dy = setup->vmax[0][1] - setup->vmin[0][1];
-
-   /* NOTE: this is not really area but something proportional to it */
-   area = setup->emaj.dx * setup->emaj.dx + setup->emaj.dy * setup->emaj.dy;
-   if (area == 0.0f || util_is_inf_or_nan(area))
-      return FALSE;
-   setup->oneoverarea = 1.0f / area;
-
-   /* z and w are done by linear interpolation:
-    */
-   linear_pos_coeff(setup, 0, 2);
-   linear_pos_coeff(setup, 0, 3);
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         const_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_LINEAR:
-         line_linear_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_PERSPECTIVE:
-         line_persp_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_POS:
-         setup_fragcoord_coeff(setup, fragSlot);
-         break;
-      default:
-         assert(0);
-      }
-
-      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-         setup->coef.dadx[1 + fragSlot][0] = 0.0;
-         setup->coef.dady[1 + fragSlot][0] = 0.0;
-      }
-   }
-   return TRUE;
-}
-
-
-/**
- * Plot a pixel in a line segment.
+/* Stubs for lines & points for now:
  */
-static INLINE void
-plot(struct setup_context *setup, int x, int y)
+void
+llvmpipe_setup_point(struct setup_context *setup,
+		     const float (*v0)[4])
 {
-   const int iy = y & 1;
-   const int ix = x & 1;
-   const int quadX = x - ix;
-   const int quadY = y - iy;
-   const int mask = (1 << ix) << (2 * iy);
-
-   if (quadX != setup->quad[0].input.x0 ||
-       quadY != setup->quad[0].input.y0)
-   {
-      /* flush prev quad, start new quad */
-
-      if (setup->quad[0].input.x0 != -1)
-         clip_emit_quad( setup, &setup->quad[0] );
-
-      setup->quad[0].input.x0 = quadX;
-      setup->quad[0].input.y0 = quadY;
-      setup->quad[0].inout.mask = 0x0;
-   }
-
-   setup->quad[0].inout.mask |= mask;
 }
 
-
-/**
- * Do setup for line rasterization, then render the line.
- * Single-pixel width, no stipple, etc.  We rely on the 'draw' module
- * to handle stippling and wide lines.
- */
 void
 llvmpipe_setup_line(struct setup_context *setup,
-           const float (*v0)[4],
-           const float (*v1)[4])
-{
-   int x0 = (int) v0[0][0];
-   int x1 = (int) v1[0][0];
-   int y0 = (int) v0[0][1];
-   int y1 = (int) v1[0][1];
-   int dx = x1 - x0;
-   int dy = y1 - y0;
-   int xstep, ystep;
-
-#if DEBUG_VERTS
-   debug_printf("Setup line:\n");
-   print_vertex(setup, v0);
-   print_vertex(setup, v1);
-#endif
-
-   if (setup->llvmpipe->no_rast)
-      return;
-
-   if (dx == 0 && dy == 0)
-      return;
-
-   if (!setup_line_coefficients(setup, v0, v1))
-      return;
-
-   assert(v0[0][0] < 1.0e9);
-   assert(v0[0][1] < 1.0e9);
-   assert(v1[0][0] < 1.0e9);
-   assert(v1[0][1] < 1.0e9);
-
-   if (dx < 0) {
-      dx = -dx;   /* make positive */
-      xstep = -1;
-   }
-   else {
-      xstep = 1;
-   }
-
-   if (dy < 0) {
-      dy = -dy;   /* make positive */
-      ystep = -1;
-   }
-   else {
-      ystep = 1;
-   }
-
-   assert(dx >= 0);
-   assert(dy >= 0);
-   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_LINES);
-
-   setup->quad[0].input.x0 = setup->quad[0].input.y0 = -1;
-   setup->quad[0].inout.mask = 0x0;
-
-   /* XXX temporary: set coverage to 1.0 so the line appears
-    * if AA mode happens to be enabled.
-    */
-   setup->quad[0].input.coverage[0] =
-   setup->quad[0].input.coverage[1] =
-   setup->quad[0].input.coverage[2] =
-   setup->quad[0].input.coverage[3] = 1.0;
-
-   if (dx > dy) {
-      /*** X-major line ***/
-      int i;
-      const int errorInc = dy + dy;
-      int error = errorInc - dx;
-      const int errorDec = error - dx;
-
-      for (i = 0; i < dx; i++) {
-         plot(setup, x0, y0);
-
-         x0 += xstep;
-         if (error < 0) {
-            error += errorInc;
-         }
-         else {
-            error += errorDec;
-            y0 += ystep;
-         }
-      }
-   }
-   else {
-      /*** Y-major line ***/
-      int i;
-      const int errorInc = dx + dx;
-      int error = errorInc - dy;
-      const int errorDec = error - dy;
-
-      for (i = 0; i < dy; i++) {
-         plot(setup, x0, y0);
-
-         y0 += ystep;
-         if (error < 0) {
-            error += errorInc;
-         }
-         else {
-            error += errorDec;
-            x0 += xstep;
-         }
-      }
-   }
-
-   /* draw final quad */
-   if (setup->quad[0].inout.mask) {
-      clip_emit_quad( setup, &setup->quad[0] );
-   }
-}
-
-
-static void
-point_persp_coeff(struct setup_context *setup,
-                  const float (*vert)[4],
-                  unsigned attrib,
-                  uint vertSlot)
+		    const float (*v0)[4],
+		    const float (*v1)[4])
 {
-   unsigned i;
-   for(i = 0; i < NUM_CHANNELS; ++i) {
-      setup->coef.dadx[1 + attrib][i] = 0.0F;
-      setup->coef.dady[1 + attrib][i] = 0.0F;
-      setup->coef.a0[1 + attrib][i] = vert[vertSlot][i] * vert[0][3];
-   }
 }
 
 
-/**
- * Do setup for point rasterization, then render the point.
- * Round or square points...
- * XXX could optimize a lot for 1-pixel points.
+/* Called after statechange, before emitting primitives.  If binning
+ * is active, this function should store relevant state in the binning
+ * context.
+ *
+ * That includes: 
+ *    - current fragment shader function
+ *    - bound constant buffer contents
+ *    - bound textures
+ *    - blend color
+ *    - etc.
+ *
+ * Basically everything needed at some point in the future to
+ * rasterize triangles for the current state.
+ *
+ * Additionally this will set up the state needed for the rasterizer
+ * to process and bin incoming triangles.  That would include such
+ * things as:
+ *    - cull mode
+ *    - ???
+ *    - etc.
+ * 
  */
-void
-llvmpipe_setup_point( struct setup_context *setup,
-             const float (*v0)[4] )
-{
-   struct llvmpipe_context *llvmpipe = setup->llvmpipe;
-   const struct lp_fragment_shader *lpfs = llvmpipe->fs;
-   const int sizeAttr = setup->llvmpipe->psize_slot;
-   const float size
-      = sizeAttr > 0 ? v0[sizeAttr][0]
-      : setup->llvmpipe->rasterizer->point_size;
-   const float halfSize = 0.5F * size;
-   const boolean round = (boolean) setup->llvmpipe->rasterizer->point_smooth;
-   const float x = v0[0][0];  /* Note: data[0] is always position */
-   const float y = v0[0][1];
-   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
-   uint fragSlot;
-
-#if DEBUG_VERTS
-   debug_printf("Setup point:\n");
-   print_vertex(setup, v0);
-#endif
-
-   if (llvmpipe->no_rast)
-      return;
-
-   assert(setup->llvmpipe->reduced_prim == PIPE_PRIM_POINTS);
-
-   /* For points, all interpolants are constant-valued.
-    * However, for point sprites, we'll need to setup texcoords appropriately.
-    * XXX: which coefficients are the texcoords???
-    * We may do point sprites as textured quads...
-    *
-    * KW: We don't know which coefficients are texcoords - ultimately
-    * the choice of what interpolation mode to use for each attribute
-    * should be determined by the fragment program, using
-    * per-attribute declaration statements that include interpolation
-    * mode as a parameter.  So either the fragment program will have
-    * to be adjusted for pointsprite vs normal point behaviour, or
-    * otherwise a special interpolation mode will have to be defined
-    * which matches the required behaviour for point sprites.  But -
-    * the latter is not a feature of normal hardware, and as such
-    * probably should be ruled out on that basis.
-    */
-   setup->vprovoke = v0;
-
-   /* setup Z, W */
-   const_pos_coeff(setup, 0, 2);
-   const_pos_coeff(setup, 0, 3);
-
-   for (fragSlot = 0; fragSlot < lpfs->info.num_inputs; fragSlot++) {
-      const uint vertSlot = vinfo->attrib[fragSlot].src_index;
-
-      switch (vinfo->attrib[fragSlot].interp_mode) {
-      case INTERP_CONSTANT:
-         /* fall-through */
-      case INTERP_LINEAR:
-         const_coeff(setup, fragSlot, vertSlot);
-         break;
-      case INTERP_PERSPECTIVE:
-         point_persp_coeff(setup, setup->vprovoke, fragSlot, vertSlot);
-         break;
-      case INTERP_POS:
-         setup_fragcoord_coeff(setup, fragSlot);
-         break;
-      default:
-         assert(0);
-      }
-
-      if (lpfs->info.input_semantic_name[fragSlot] == TGSI_SEMANTIC_FACE) {
-         setup->coef.a0[1 + fragSlot][0] = 1.0f - setup->facing;
-         setup->coef.dadx[1 + fragSlot][0] = 0.0;
-         setup->coef.dady[1 + fragSlot][0] = 0.0;
-      }
-   }
-
-
-   if (halfSize <= 0.5 && !round) {
-      /* special case for 1-pixel points */
-      const int ix = ((int) x) & 1;
-      const int iy = ((int) y) & 1;
-      setup->quad[0].input.x0 = (int) x - ix;
-      setup->quad[0].input.y0 = (int) y - iy;
-      setup->quad[0].inout.mask = (1 << ix) << (2 * iy);
-      clip_emit_quad( setup, &setup->quad[0] );
-   }
-   else {
-      if (round) {
-         /* rounded points */
-         const int ixmin = block((int) (x - halfSize));
-         const int ixmax = block((int) (x + halfSize));
-         const int iymin = block((int) (y - halfSize));
-         const int iymax = block((int) (y + halfSize));
-         const float rmin = halfSize - 0.7071F;  /* 0.7071 = sqrt(2)/2 */
-         const float rmax = halfSize + 0.7071F;
-         const float rmin2 = MAX2(0.0F, rmin * rmin);
-         const float rmax2 = rmax * rmax;
-         const float cscale = 1.0F / (rmax2 - rmin2);
-         int ix, iy;
-
-         for (iy = iymin; iy <= iymax; iy += 2) {
-            for (ix = ixmin; ix <= ixmax; ix += 2) {
-               float dx, dy, dist2, cover;
-
-               setup->quad[0].inout.mask = 0x0;
-
-               dx = (ix + 0.5f) - x;
-               dy = (iy + 0.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_TOP_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_TOP_LEFT;
-               }
-
-               dx = (ix + 1.5f) - x;
-               dy = (iy + 0.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_TOP_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_TOP_RIGHT;
-               }
-
-               dx = (ix + 0.5f) - x;
-               dy = (iy + 1.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_BOTTOM_LEFT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_BOTTOM_LEFT;
-               }
-
-               dx = (ix + 1.5f) - x;
-               dy = (iy + 1.5f) - y;
-               dist2 = dx * dx + dy * dy;
-               if (dist2 <= rmax2) {
-                  cover = 1.0F - (dist2 - rmin2) * cscale;
-                  setup->quad[0].input.coverage[QUAD_BOTTOM_RIGHT] = MIN2(cover, 1.0f);
-                  setup->quad[0].inout.mask |= MASK_BOTTOM_RIGHT;
-               }
-
-               if (setup->quad[0].inout.mask) {
-                  setup->quad[0].input.x0 = ix;
-                  setup->quad[0].input.y0 = iy;
-                  clip_emit_quad( setup, &setup->quad[0] );
-               }
-            }
-         }
-      }
-      else {
-         /* square points */
-         const int xmin = (int) (x + 0.75 - halfSize);
-         const int ymin = (int) (y + 0.25 - halfSize);
-         const int xmax = xmin + (int) size;
-         const int ymax = ymin + (int) size;
-         /* XXX could apply scissor to xmin,ymin,xmax,ymax now */
-         const int ixmin = block(xmin);
-         const int ixmax = block(xmax - 1);
-         const int iymin = block(ymin);
-         const int iymax = block(ymax - 1);
-         int ix, iy;
-
-         /*
-         debug_printf("(%f, %f) -> X:%d..%d Y:%d..%d\n", x, y, xmin, xmax,ymin,ymax);
-         */
-         for (iy = iymin; iy <= iymax; iy += 2) {
-            uint rowMask = 0xf;
-            if (iy < ymin) {
-               /* above the top edge */
-               rowMask &= (MASK_BOTTOM_LEFT | MASK_BOTTOM_RIGHT);
-            }
-            if (iy + 1 >= ymax) {
-               /* below the bottom edge */
-               rowMask &= (MASK_TOP_LEFT | MASK_TOP_RIGHT);
-            }
-
-            for (ix = ixmin; ix <= ixmax; ix += 2) {
-               uint mask = rowMask;
-
-               if (ix < xmin) {
-                  /* fragment is past left edge of point, turn off left bits */
-                  mask &= (MASK_BOTTOM_RIGHT | MASK_TOP_RIGHT);
-               }
-               if (ix + 1 >= xmax) {
-                  /* past the right edge */
-                  mask &= (MASK_BOTTOM_LEFT | MASK_TOP_LEFT);
-               }
-
-               setup->quad[0].inout.mask = mask;
-               setup->quad[0].input.x0 = ix;
-               setup->quad[0].input.y0 = iy;
-               clip_emit_quad( setup, &setup->quad[0] );
-            }
-         }
-      }
-   }
-}
-
-void llvmpipe_setup_prepare( struct setup_context *setup )
+void setup_prepare( struct setup_context *setup )
 {
    struct llvmpipe_context *lp = setup->llvmpipe;
 
@@ -1442,6 +94,8 @@ void llvmpipe_setup_prepare( struct setup_context *setup )
       llvmpipe_update_derived(lp);
    }
 
+   lp->quad.first->begin( lp->quad.first );
+
    if (lp->reduced_api_prim == PIPE_PRIM_TRIANGLES &&
        lp->rasterizer->fill_cw == PIPE_POLYGON_MODE_FILL &&
        lp->rasterizer->fill_ccw == PIPE_POLYGON_MODE_FILL) {
@@ -1452,38 +106,28 @@ void llvmpipe_setup_prepare( struct setup_context *setup )
       /* 'draw' will do culling */
       setup->winding = PIPE_WINDING_NONE;
    }
+
+   setup_prepare_tri( setup->llvmpipe );
 }
 
 
 
-void llvmpipe_setup_destroy_context( struct setup_context *setup )
+void setup_destroy_context( struct setup_context *setup )
 {
-   align_free( setup );
+   FREE( setup );
 }
 
 
 /**
  * Create a new primitive setup/render stage.
  */
-struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe )
+struct setup_context *setup_create_context( struct llvmpipe_context *llvmpipe )
 {
-   struct setup_context *setup;
-   unsigned i;
+   struct setup_context *setup = CALLOC_STRUCT(setup_context);
 
-   setup = align_malloc(sizeof(struct setup_context), 16);
-   if (!setup)
-      return NULL;
-
-   memset(setup, 0, sizeof *setup);
-   setup->llvmpipe = llvmpipe;
+   if (setup)   /* allocation can fail; NULL is then returned to the caller */
+      setup->llvmpipe = llvmpipe;
 
-   for (i = 0; i < MAX_QUADS; i++) {
-      setup->quad[i].coef = &setup->coef;
-   }
-
-   setup->span.left[0] = 1000000;     /* greater than right[0] */
-   setup->span.left[1] = 1000000;     /* greater than right[1] */
-
    return setup;
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 89c43da0460..05aaaf83b8e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -30,11 +30,8 @@
 struct setup_context;
 struct llvmpipe_context;
 
-void 
-llvmpipe_setup_tri( struct setup_context *setup,
-	   const float (*v0)[4],
-	   const float (*v1)[4],
-	   const float (*v2)[4] );
+/* Note, not using setup_context currently 
+ */
 
 void
 llvmpipe_setup_line(struct setup_context *setup,
@@ -46,8 +43,12 @@ llvmpipe_setup_point( struct setup_context *setup,
              const float (*v0)[4] );
 
 
-struct setup_context *llvmpipe_setup_create_context( struct llvmpipe_context *llvmpipe );
-void llvmpipe_setup_prepare( struct setup_context *setup );
-void llvmpipe_setup_destroy_context( struct setup_context *setup );
+struct setup_context *setup_create_context( struct llvmpipe_context *llvmpipe );
+
+void setup_prepare( struct setup_context *setup );
+
+void setup_destroy_context( struct setup_context *setup );
+
+void setup_prepare_tri( struct llvmpipe_context *llvmpipe );
 
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
new file mode 100644
index 00000000000..848705e0991
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -0,0 +1,140 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+#ifndef LP_SETUP_CONTEXT_H
+#define LP_SETUP_CONTEXT_H
+
+struct clear_tile {
+   boolean do_color;
+   boolean do_depth_stencil;
+   unsigned rgba;
+   unsigned depth_stencil;
+};
+
+struct load_tile {
+   boolean do_color;
+   boolean do_depth_stencil;
+};
+
+/* Shade tile points directly at this:
+ */
+struct shader_inputs {
+   /* Some way of updating rasterizer state:
+    */
+   /* ??? */
+
+   /* Attribute interpolation:
+    */
+   float oneoverarea;
+   float x1;
+   float y1;
+
+   struct tgsi_interp_coef position_coef;
+   struct tgsi_interp_coef *coef;
+};
+
+/* Shade triangle points at this:
+ */
+struct shade_triangle {
+   /* one-pixel sized trivial accept offsets for each plane */
+   float ei1;                   
+   float ei2;
+   float ei3;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   float eo1;                   
+   float eo2;
+   float eo3;
+
+   /* y deltas for vertex pairs */
+   float dy12;
+   float dy23;
+   float dy31;
+
+   /* x deltas for vertex pairs */
+   float dx12;
+   float dx23;
+   float dx31;
+   
+   struct shader_inputs inputs;
+};
+
+struct bin_cmd {
+   enum {
+      CMD_END = 0,          /* zero-valued opcode */
+      CMD_CLEAR,
+      CMD_LOAD_TILE,
+      CMD_SHADE_TILE,
+      CMD_SHADE_TRIANGLE,
+   } cmd;
+   /* Payload for the command; which member applies is not shown in this header. */
+   union {
+      struct triangle *tri;   /* NOTE(review): this header defines shade_triangle, not triangle - confirm */
+      struct clear *clear;    /* NOTE(review): this header defines clear_tile, not clear - confirm */
+   } ptr;
+};
+
+struct cmd_block {
+   struct bin_cmd cmds[128];
+   unsigned count;
+   struct cmd_block *next;
+};
+
+/* Raw storage for binned data (triangles).  The array is sized so that
+ * data + count + next add up to 4096 bytes, assuming no extra padding. */
+struct data_block {
+   ubyte data[4096 - sizeof(unsigned) - sizeof(struct data_block *)];
+   unsigned count;
+   struct data_block *next;
+};
+
+/* Need to store the state at the time the triangle was drawn, at
+ * least as it is needed during rasterization.  That would include at
+ * minimum the constant values referred to by the fragment shader,
+ * blend state, etc.  Much of this is code-generated into the shader
+ * in llvmpipe -- may be easier to do this work there.
+ */
+struct state_block {
+};
+
+
+/**
+ * Basically all the data from a binner scene:
+ */
+struct binned_scene {
+   struct llvmpipe_context *llvmpipe;
+
+   struct cmd_block *bin[MAX_HEIGHT / BIN_SIZE][MAX_WIDTH / BIN_SIZE];
+   struct data_block *data;
+};
+
+static INLINE struct triangle *get_triangle( struct setup_context *setup )
+{
+   if (setup->triangles->count == TRIANGLE_BLOCK_COUNT)
+      return setup_triangle_from_new_block( setup );
+
+   return &setup->triangles[setup->triangles->count++];
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_rasterize.c b/src/gallium/drivers/llvmpipe/lp_setup_rasterize.c
new file mode 100644
index 00000000000..5b4faf489b8
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_rasterize.c
@@ -0,0 +1,7 @@
+
+void
+rasterize( struct llvmpipe_context *llvmpipe,
+	   struct binned_scene *scene )
+{
+   /* Stub: the body is empty, so neither argument is used yet. */
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
new file mode 100644
index 00000000000..a09e0fa643e
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -0,0 +1,755 @@
+/**************************************************************************
+ *
+ * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+/*
+ * Recursive rasterization for triangles
+ */
+
+#include "lp_context.h"
+#include "lp_quad.h"
+#include "lp_quad_pipe.h"
+#include "lp_setup.h"
+#include "lp_state.h"
+#include "draw/draw_context.h"
+#include "draw/draw_private.h"
+#include "draw/draw_vertex.h"
+#include "pipe/p_shader_tokens.h"
+#include "pipe/p_thread.h"
+#include "util/u_math.h"
+#include "util/u_memory.h"
+
+#define BLOCKSIZE 4
+
+struct triangle {
+   /* one-pixel sized trivial accept offsets for each plane */
+   float ei1;
+   float ei2;
+   float ei3;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   float eo1;
+   float eo2;
+   float eo3;
+
+   /* y deltas for vertex pairs (dy12 = y1 - y2, and so on) */
+   float dy12;
+   float dy23;
+   float dy31;
+
+   /* x deltas for vertex pairs (dx12 = x1 - x2, and so on) */
+   float dx12;
+   float dx23;
+   float dx31;
+
+   /* Attribute interpolation:
+    */
+   float oneoverarea;   /* 1 / area, multiplied into every gradient */
+   float x1;
+   float y1;
+   struct tgsi_interp_coef coef[PIPE_MAX_SHADER_INPUTS];   /* indexed by fragment shader input */
+   struct tgsi_interp_coef position_coef;   /* channels 2 (z) and 3 (w) are filled in */
+
+   /* Context used to reach the quad pipeline, and a run of pre-initialized quads:
+    */
+   struct llvmpipe_context *llvmpipe;
+   struct quad_header quad[4];
+};
+
+
+/**
+ * Constant (GL_FLAT) channel i: a0 is v3[vert_attr][i], both gradients are zero.
+ */
+static void constant_coef( struct tgsi_interp_coef *coef,
+			   const float (*v3)[4],
+			   unsigned vert_attr,
+			   unsigned i )
+{
+   coef->a0[i] = v3[vert_attr][i];   /* value taken from this one vertex only */
+   coef->dadx[i] = 0;
+   coef->dady[i] = 0;
+}
+
+/**
+ * Compute a0, dadx and dady for channel i of a linearly interpolated
+ * attribute, using the edge deltas and 1/area already stored in tri.
+ */
+static void linear_coef( struct triangle *tri,
+			 struct tgsi_interp_coef *coef,
+			 const float (*v1)[4],
+			 const float (*v2)[4],
+			 const float (*v3)[4],
+			 unsigned vert_attr,
+			 unsigned i)
+{
+   float a1 = v1[vert_attr][i];
+   float a2 = v2[vert_attr][i];
+   float a3 = v3[vert_attr][i];
+
+   float da12 = a1 - a2;
+   float da31 = a3 - a1;
+   float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * tri->oneoverarea;
+   float dady = (da31 * tri->dx12 - tri->dx31 * da12) * tri->oneoverarea;
+
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+
+   /* calculate a0 as the value which would be sampled for the
+    * fragment at (0,0), taking into account that we want to sample at
+    * pixel centers, in other words (0.5, 0.5).
+    *
+    * this is neat but unfortunately not a good way to do things for
+    * triangles with very large values of dadx or dady as it will
+    * result in the subtraction and re-addition from a0 of a very
+    * large number, which means we'll end up losing a lot of the
+    * fractional bits and precision from a0.  the way to fix this is
+    * to define a0 as the sample at a pixel center somewhere near vmin
+    * instead - i'll switch to this later.
+    */
+   coef->a0[i] = (v1[vert_attr][i] -
+                  (dadx * (v1[0][0] - 0.5f) +
+                   dady * (v1[0][1] - 0.5f)));
+}
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void perspective_coef( struct triangle *tri,
+			      struct tgsi_interp_coef *coef,
+			      const float (*v1)[4],
+			      const float (*v2)[4],
+			      const float (*v3)[4],
+			      unsigned vert_attr,
+			      unsigned i)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   float a1 = v1[vert_attr][i] * v1[0][3];
+   float a2 = v2[vert_attr][i] * v2[0][3];
+   float a3 = v3[vert_attr][i] * v3[0][3];
+   float da12 = a1 - a2;
+   float da31 = a3 - a1;
+   float dadx = (da12 * tri->dy31 - tri->dy12 * da31) * tri->oneoverarea;
+   float dady = (da31 * tri->dx12 - tri->dx31 * da12) * tri->oneoverarea;
+
+   /* Store the gradients; a0 is the premultiplied plane evaluated at (0.5, 0.5). */
+   coef->dadx[i] = dadx;
+   coef->dady[i] = dady;
+   coef->a0[i] = (a1 -
+                  (dadx * (v1[0][0] - 0.5f) +
+                   dady * (v1[0][1] - 0.5f)));
+}
+
+
+/**
+ * Special coefficient setup for gl_FragCoord.
+ * X and Y are trivial.  NOTE(review): Y is not inverted here, though OpenGL may require it.
+ * Z and W are copied from position_coef which should have already been computed.
+ * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
+ */
+static void
+setup_fragcoord_coef(struct triangle *tri, unsigned slot)
+{
+   /*X: a0 = 0 with unit gradient in x only */
+   tri->coef[slot].a0[0] = 0.0;
+   tri->coef[slot].dadx[0] = 1.0;
+   tri->coef[slot].dady[0] = 0.0;
+   /*Y: a0 = 0 with unit gradient in y only */
+   tri->coef[slot].a0[1] = 0.0;
+   tri->coef[slot].dadx[1] = 0.0;
+   tri->coef[slot].dady[1] = 1.0;
+   /*Z: copied from position_coef channel 2 */
+   tri->coef[slot].a0[2] = tri->position_coef.a0[2];
+   tri->coef[slot].dadx[2] = tri->position_coef.dadx[2];
+   tri->coef[slot].dady[2] = tri->position_coef.dady[2];
+   /*W: copied from position_coef channel 3 */
+   tri->coef[slot].a0[3] = tri->position_coef.a0[3];
+   tri->coef[slot].dadx[3] = tri->position_coef.dadx[3];
+   tri->coef[slot].dady[3] = tri->position_coef.dady[3];
+}
+
+
+
+/**
+ * Fill tri->position_coef (z, w) and tri->coef[] (a0, dadx, dady) for every fragment shader input.
+ */
+static void setup_tri_coefficients( struct llvmpipe_context *llvmpipe,
+				    struct triangle *tri,
+				    const float (*v1)[4],
+				    const float (*v2)[4],
+				    const float (*v3)[4],
+				    boolean frontface )
+{
+   const struct lp_fragment_shader *fs = llvmpipe->fs;
+   const struct vertex_info *vinfo = llvmpipe_get_vertex_info(llvmpipe);
+   unsigned input;
+
+   /* z and w are done by linear interpolation:
+    */
+   linear_coef(tri, &tri->position_coef, v1, v2, v3, 0, 2);
+   linear_coef(tri, &tri->position_coef, v1, v2, v3, 0, 3);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (input = 0; input < fs->info.num_inputs; input++) {
+      unsigned vert_attr = vinfo->attrib[input].src_index;
+      unsigned i;
+
+      switch (vinfo->attrib[input].interp_mode) {
+      case INTERP_CONSTANT:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            constant_coef(&tri->coef[input], v3, vert_attr, i);   /* flat value comes from v3 */
+         break;
+
+      case INTERP_LINEAR:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            linear_coef(tri, &tri->coef[input], v1, v2, v3, vert_attr, i);
+         break;
+
+      case INTERP_PERSPECTIVE:
+         for (i = 0; i < NUM_CHANNELS; i++)
+            perspective_coef(tri, &tri->coef[input], v1, v2, v3, vert_attr, i);
+         break;
+
+      case INTERP_POS:
+         setup_fragcoord_coef(tri, input);   /* relies on position_coef computed above */
+         break;
+
+      default:
+         assert(0);   /* unhandled interpolation mode */
+      }
+
+      if (fs->info.input_semantic_name[input] == TGSI_SEMANTIC_FACE) {
+         tri->coef[input].a0[0] = 1.0f - frontface;   /* overrides channel 0 set by the switch */
+         tri->coef[input].dadx[0] = 0.0;
+         tri->coef[input].dady[0] = 0.0;
+      }
+   }
+}
+
+
+
+/* Snap a to a multiple of 1/16 (the float -> int conversion truncates toward zero).
+ * XXX: do this by add/subtracting a large floating point number. */
+static INLINE float subpixel_snap( float a )
+{
+   int i = a * 16;
+   return (float)i * (1.0/16);
+}
+
+
+/* Convert 8x8 block into four runs of quads and render each in turn.
+ */
+#if (BLOCKSIZE == 8)
+static void block_full( struct triangle *tri, int x, int y )
+{
+   struct quad_header *ptrs[4];
+   int i;
+   /* BLOCKSIZE == 8: four quads side by side span the 8-pixel width. */
+   tri->quad[0].input.x0 = x + 0;
+   tri->quad[1].input.x0 = x + 2;
+   tri->quad[2].input.x0 = x + 4;
+   tri->quad[3].input.x0 = x + 6;
+
+   for (i = 0; i < 4; i++, y += 2) {   /* four rows of quads, two pixels apart */
+      tri->quad[0].inout.mask = 0xf;   /* all four pixels of each quad set */
+      tri->quad[1].inout.mask = 0xf;
+      tri->quad[2].inout.mask = 0xf;
+      tri->quad[3].inout.mask = 0xf;
+
+      tri->quad[0].input.y0 = y;
+      tri->quad[1].input.y0 = y;
+      tri->quad[2].input.y0 = y;
+      tri->quad[3].input.y0 = y;
+
+      /* XXX: don't bother with this ptrs business */
+      ptrs[0] = &tri->quad[0];
+      ptrs[1] = &tri->quad[1];
+      ptrs[2] = &tri->quad[2];
+      ptrs[3] = &tri->quad[3];
+
+      tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, ptrs, 4 );
+   }
+}
+#elif (BLOCKSIZE == 4)
+static void block_full( struct triangle *tri, int x, int y )
+{
+   struct quad_header *ptrs[4];
+   int iy;
+   /* BLOCKSIZE == 4: two quads side by side span the 4-pixel width. */
+   tri->quad[0].input.x0 = x + 0;
+   tri->quad[1].input.x0 = x + 2;
+
+   for (iy = 0; iy < 4; iy += 2) {   /* two rows of quads */
+      tri->quad[0].inout.mask = 0xf;   /* all four pixels of each quad set */
+      tri->quad[1].inout.mask = 0xf;
+
+      tri->quad[0].input.y0 = y + iy;
+      tri->quad[1].input.y0 = y + iy;
+
+      /* XXX: don't bother with this ptrs business */
+      ptrs[0] = &tri->quad[0];
+      ptrs[1] = &tri->quad[1];
+
+      tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, ptrs, 2 );
+   }
+}
+#else
+static void block_full( struct triangle *tri, int x, int y )
+{
+   /* Any other BLOCKSIZE: run a single quad at (x, y) with all four mask bits set. */
+   struct quad_header *ptrs[4];
+
+   tri->quad[0].input.x0 = x;
+   tri->quad[0].input.y0 = y;
+   tri->quad[0].inout.mask = 0xf;
+
+   ptrs[0] = &tri->quad[0];
+   tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, ptrs, 1 );
+}
+#endif
+
+
+static void
+do_quad( struct triangle *tri,
+	 int x, int y,
+	 float c1, float c2, float c3 )
+{
+   struct quad_header *quad = &tri->quad[0];
+   /* Change in each edge value for a one-pixel step in x ... */
+   float xstep1 = -tri->dy12;
+   float xstep2 = -tri->dy23;
+   float xstep3 = -tri->dy31;
+   /* ... and for a one-pixel step in y. */
+   float ystep1 = tri->dx12;
+   float ystep2 = tri->dx23;
+   float ystep3 = tri->dx31;
+
+   quad->input.x0 = x;
+   quad->input.y0 = y;
+   quad->inout.mask = 0;
+   /* c1..c3 are the edge values at (x, y); a pixel is set when all three are > 0. */
+   if (c1 > 0 &&
+       c2 > 0 &&
+       c3 > 0)
+      quad->inout.mask |= 1;   /* pixel (x, y) */
+	 
+   if (c1 + xstep1 > 0 && 
+       c2 + xstep2 > 0 && 
+       c3 + xstep3 > 0)
+      quad->inout.mask |= 2;   /* pixel (x+1, y) */
+
+   if (c1 + ystep1 > 0 && 
+       c2 + ystep2 > 0 && 
+       c3 + ystep3 > 0)
+      quad->inout.mask |= 4;   /* pixel (x, y+1) */
+
+   if (c1 + ystep1 + xstep1 > 0 && 
+       c2 + ystep2 + xstep2 > 0 && 
+       c3 + ystep3 + xstep3 > 0)
+      quad->inout.mask |= 8;   /* pixel (x+1, y+1) */
+   /* Quads with no pixel set are not passed to the quad pipeline. */
+   if (quad->inout.mask)
+      tri->llvmpipe->quad.first->run( tri->llvmpipe->quad.first, &quad, 1 );
+}
+
+/* Walk a BLOCKSIZE x BLOCKSIZE block in 2x2 quads, stepping the three edge
+ * values two pixels at a time; do_quad() builds each mask and may render it:
+ */
+static void
+do_block( struct triangle *tri,
+	 int x, int y,
+	 float c1,
+	 float c2,
+	 float c3 )
+{
+   const int step = 2;   /* matches the += 2 stride of both loops below */
+
+   float xstep1 = -step * tri->dy12;
+   float xstep2 = -step * tri->dy23;
+   float xstep3 = -step * tri->dy31;
+
+   float ystep1 = step * tri->dx12;
+   float ystep2 = step * tri->dx23;
+   float ystep3 = step * tri->dx31;
+
+   int ix, iy;
+
+   for (iy = 0; iy < BLOCKSIZE; iy += 2) {
+      float cx1 = c1;   /* restart each row from the row's left-hand edge values */
+      float cx2 = c2;
+      float cx3 = c3;
+
+      for (ix = 0; ix < BLOCKSIZE; ix += 2) {
+
+	 do_quad(tri, x+ix, y+iy, cx1, cx2, cx3);
+
+	 cx1 += xstep1;
+	 cx2 += xstep2;
+	 cx3 += xstep3;
+      }
+
+      c1 += ystep1;
+      c2 += ystep2;
+      c3 += ystep3;
+   }
+}
+
+
+
+
+/* to avoid having to allocate power-of-four, square render targets,
+ * end up having a specialized version of the above that runs only at
+ * the topmost level.
+ *
+ * at the topmost level there may be an arbitrary number of steps on
+ * either dimension, so this loop needs to be either separately
+ * code-generated and unrolled for each render target size, or kept as
+ * generic looping code:
+ */
+
+#define MIN3(a,b,c) MIN2(MIN2(a,b),c)
+#define MAX3(a,b,c) MAX2(MAX2(a,b),c)
+
+static void 
+do_triangle_ccw(struct llvmpipe_context *llvmpipe,
+		const float (*v1)[4],
+		const float (*v2)[4],
+		const float (*v3)[4],
+		boolean frontfacing )
+{
+   const int rt_width = llvmpipe->framebuffer.cbufs[0]->width;
+   const int rt_height = llvmpipe->framebuffer.cbufs[0]->height;
+
+   const float y1 = subpixel_snap(v1[0][1]);
+   const float y2 = subpixel_snap(v2[0][1]);
+   const float y3 = subpixel_snap(v3[0][1]);
+
+   const float x1 = subpixel_snap(v1[0][0]);
+   const float x2 = subpixel_snap(v2[0][0]);
+   const float x3 = subpixel_snap(v3[0][0]);
+   
+   struct triangle tri;
+   float area;
+   float c1, c2, c3;
+   int i;
+   int minx, maxx, miny, maxy;
+
+   tri.llvmpipe = llvmpipe;
+
+
+   tri.dx12 = x1 - x2;
+   tri.dx23 = x2 - x3;
+   tri.dx31 = x3 - x1;
+
+   tri.dy12 = y1 - y2;
+   tri.dy23 = y2 - y3;
+   tri.dy31 = y3 - y1;
+
+   area = (tri.dx12 * tri.dy31 - 
+	   tri.dx31 * tri.dy12);
+
+   /* Cull non-ccw, zero-sized and inf/nan-area triangles.
+    */
+   if (area <= 0 || util_is_inf_or_nan(area))
+      return;
+
+   // Bounding rectangle
+   minx = util_iround(MIN3(x1, x2, x3) - .5);
+   maxx = util_iround(MAX3(x1, x2, x3) + .5);
+   miny = util_iround(MIN3(y1, y2, y3) - .5);
+   maxy = util_iround(MAX3(y1, y2, y3) + .5);
+   
+   /* Clamp to framebuffer (or tile) dimensions:
+    */
+   miny = MAX2(0, miny);
+   minx = MAX2(0, minx);
+   maxy = MIN2(rt_height, maxy);
+   maxx = MIN2(rt_width, maxx);
+
+   if (miny == maxy || minx == maxx)
+      return;
+
+   /* The only divide in this code.  Is it really needed?
+    */
+   tri.oneoverarea = 1.0f / area;
+
+   /* Setup parameter interpolants:
+    */
+   setup_tri_coefficients( llvmpipe, &tri, v1, v2, v3, frontfacing );
+
+   for (i = 0; i < Elements(tri.quad); i++) {
+      tri.quad[i].coef = tri.coef;
+      tri.quad[i].posCoef = &tri.position_coef;
+   }
+
+   /* half-edge constants, will be iterated over the whole
+    * rendertarget.
+    */
+   c1 = tri.dy12 * x1 - tri.dx12 * y1;
+   c2 = tri.dy23 * x2 - tri.dx23 * y2;
+   c3 = tri.dy31 * x3 - tri.dx31 * y3;
+
+   /* correct for top-left fill convention:
+    */
+   if (tri.dy12 < 0 || (tri.dy12 == 0 && tri.dx12 > 0)) c1++;
+   if (tri.dy23 < 0 || (tri.dy23 == 0 && tri.dx23 > 0)) c2++;
+   if (tri.dy31 < 0 || (tri.dy31 == 0 && tri.dx31 > 0)) c3++;
+
+   /* find trivial reject offsets for each edge for a single-pixel
+    * sized block.  These will be scaled up at each recursive level to
+    * match the active blocksize.  Scaling in this way works best if
+    * the blocks are square.
+    */
+   tri.eo1 = 0;
+   if (tri.dy12 < 0) tri.eo1 -= tri.dy12;
+   if (tri.dx12 > 0) tri.eo1 += tri.dx12;
+
+   tri.eo2 = 0;
+   if (tri.dy23 < 0) tri.eo2 -= tri.dy23;
+   if (tri.dx23 > 0) tri.eo2 += tri.dx23;
+
+   tri.eo3 = 0;
+   if (tri.dy31 < 0) tri.eo3 -= tri.dy31;
+   if (tri.dx31 > 0) tri.eo3 += tri.dx31;
+
+   /* Calculate trivial accept offsets from the above.
+    */
+   tri.ei1 = tri.dx12 - tri.dy12 - tri.eo1;
+   tri.ei2 = tri.dx23 - tri.dy23 - tri.eo2;
+   tri.ei3 = tri.dx31 - tri.dy31 - tri.eo3;
+
+   minx &= ~(BLOCKSIZE-1);		/* aligned blocks */
+   miny &= ~(BLOCKSIZE-1);		/* aligned blocks */
+
+   c1 += tri.dx12 * miny - tri.dy12 * minx;
+   c2 += tri.dx23 * miny - tri.dy23 * minx;
+   c3 += tri.dx31 * miny - tri.dy31 * minx;
+
+   if ((miny & ~15) == (maxy & ~15) &&
+       (minx & ~15) == (maxx & ~15))
+   {
+      const int step = 2;
+
+      float xstep1 = -step * tri.dy12;
+      float xstep2 = -step * tri.dy23;
+      float xstep3 = -step * tri.dy31;
+
+      float ystep1 = step * tri.dx12;
+      float ystep2 = step * tri.dx23;
+      float ystep3 = step * tri.dx31;
+
+      float eo1 = tri.eo1 * step;
+      float eo2 = tri.eo2 * step;
+      float eo3 = tri.eo3 * step;
+
+      int x, y;
+
+      /* Small case: the min and max corners of the bounding box fall
+       * in the same 16-pixel-aligned cell, so walk it in 2x2 quads.
+       *
+       * Trivially reject quads lying outside an edge, else compute the
+       * per-pixel coverage mask in do_quad above.
+       */
+      for (y = miny; y < maxy; y += step)
+      {
+	 float cx1 = c1;
+	 float cx2 = c2;
+	 float cx3 = c3;
+
+	 for (x = minx; x < maxx; x += step)
+	 {
+	    if (cx1 + eo1 < 0 || 
+		cx2 + eo2 < 0 ||
+		cx3 + eo3 < 0) 
+	    {
+	    }
+	    else 
+	    {
+	       do_quad(&tri, x, y, cx1, cx2, cx3);
+	    }
+
+	    /* Iterate cx values across the region:
+	     */
+	    cx1 += xstep1;
+	    cx2 += xstep2;
+	    cx3 += xstep3;
+	 }
+      
+	 /* Iterate c values down the region:
+	  */
+	 c1 += ystep1;
+	 c2 += ystep2;
+	 c3 += ystep3;    
+      }
+   }
+   else 
+   {
+      const int step = BLOCKSIZE;
+
+      float ei1 = tri.ei1 * step;
+      float ei2 = tri.ei2 * step;
+      float ei3 = tri.ei3 * step;
+
+      float eo1 = tri.eo1 * step;
+      float eo2 = tri.eo2 * step;
+      float eo3 = tri.eo3 * step;
+
+      float xstep1 = -step * tri.dy12;
+      float xstep2 = -step * tri.dy23;
+      float xstep3 = -step * tri.dy31;
+
+      float ystep1 = step * tri.dx12;
+      float ystep2 = step * tri.dx23;
+      float ystep3 = step * tri.dx31;
+      int x, y;
+
+
+      /* Subdivide space into NxM blocks, where each block is square and
+       * power-of-four in dimension.
+       *
+       * Trivially accept or reject blocks, else jump to per-pixel
+       * examination above.
+       */
+      for (y = miny; y < maxy; y += step)
+      {
+	 float cx1 = c1;
+	 float cx2 = c2;
+	 float cx3 = c3;
+	 boolean in = false;
+
+	 for (x = minx; x < maxx; x += step)
+	 {
+	    if (cx1 + eo1 < 0 || 
+		cx2 + eo2 < 0 ||
+		cx3 + eo3 < 0) 
+	    {
+	       /* trivial reject; stop the row once past covered blocks */
+	       if (in)
+		  break;
+	    }
+	    else if (cx1 + ei1 > 0 &&
+		     cx2 + ei2 > 0 &&
+		     cx3 + ei3 > 0) 
+	    {
+	       in = TRUE;
+	       block_full(&tri, x, y); /* trivial accept */
+	    }
+	    else 
+	    {
+	       in = TRUE;
+	       // block_full(&tri, x, y); /* trivial accept */
+	       do_block(&tri, x, y, cx1, cx2, cx3);
+	    }
+
+	    /* Iterate cx values across the region:
+	     */
+	    cx1 += xstep1;
+	    cx2 += xstep2;
+	    cx3 += xstep3;
+	 }
+      
+	 /* Iterate c values down the region:
+	  */
+	 c1 += ystep1;
+	 c2 += ystep2;
+	 c3 += ystep3;    
+      }
+   }
+}
+
+static void triangle_cw( struct llvmpipe_context *llvmpipe,
+			 const float (*v0)[4],
+			 const float (*v1)[4],
+			 const float (*v2)[4] )
+{
+   do_triangle_ccw( llvmpipe, v1, v0, v2, !llvmpipe->ccw_is_frontface );
+}
+
+static void triangle_ccw( struct llvmpipe_context *llvmpipe,
+			 const float (*v0)[4],
+			 const float (*v1)[4],
+			 const float (*v2)[4] )
+{
+   do_triangle_ccw( llvmpipe, v0, v1, v2, llvmpipe->ccw_is_frontface );
+}
+
+static void triangle_both( struct llvmpipe_context *llvmpipe,
+			   const float (*v0)[4],
+			   const float (*v1)[4],
+			   const float (*v2)[4] )
+{
+   /* edge vectors e = v0 - v2, f = v1 - v2 */
+   const float ex = v0[0][0] - v2[0][0];
+   const float ey = v0[0][1] - v2[0][1];
+   const float fx = v1[0][0] - v2[0][0];
+   const float fy = v1[0][1] - v2[0][1];
+
+   /* det = cross(e,f).z */
+   if (ex * fy - ey * fx < 0) 
+      triangle_ccw( llvmpipe, v0, v1, v2 );
+   else
+      triangle_cw( llvmpipe, v0, v1, v2 );
+}
+
+static void triangle_nop( struct llvmpipe_context *llvmpipe,
+			  const float (*v0)[4],
+			  const float (*v1)[4],
+			  const float (*v2)[4] )
+{
+}
+
+/**
+ * Select the triangle function for the current cull mode and winding.
+ */
+void setup_prepare_tri( struct llvmpipe_context *llvmpipe )
+{
+   llvmpipe->ccw_is_frontface = (llvmpipe->rasterizer->front_winding == 
+				 PIPE_WINDING_CW);
+
+   switch (llvmpipe->rasterizer->cull_mode) {
+   case PIPE_WINDING_NONE:
+      llvmpipe->triangle = triangle_both;
+      break;
+   case PIPE_WINDING_CCW:
+      llvmpipe->triangle = triangle_cw;
+      break;
+   case PIPE_WINDING_CW:
+      llvmpipe->triangle = triangle_ccw;
+      break;
+   default:
+      llvmpipe->triangle = triangle_nop;
+      break;
+   }
+}
+
+
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index 30fb41ea65d..31eaadda216 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -67,24 +67,19 @@ llvmpipe_get_vertex_info(struct llvmpipe_context *llvmpipe)
       const struct lp_fragment_shader *lpfs = llvmpipe->fs;
       const enum interp_mode colorInterp
          = llvmpipe->rasterizer->flatshade ? INTERP_CONSTANT : INTERP_LINEAR;
+      struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
+      const uint num = draw_num_vs_outputs(llvmpipe->draw);
       uint i;
 
-      if (llvmpipe->vbuf) {
-         /* if using the post-transform vertex buffer, tell draw_vbuf to
-          * simply emit the whole post-xform vertex as-is:
-          */
-         struct vertex_info *vinfo_vbuf = &llvmpipe->vertex_info_vbuf;
-         const uint num = draw_num_vs_outputs(llvmpipe->draw);
-         uint i;
-
-         /* No longer any need to try and emit draw vertex_header info.
-          */
-         vinfo_vbuf->num_attribs = 0;
-         for (i = 0; i < num; i++) {
-            draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
-         }
-         draw_compute_vertex_size(vinfo_vbuf);
+      /* Tell draw_vbuf to simply emit the whole post-xform vertex
+       * as-is.  No longer any need to try and emit draw vertex_header
+       * info.
+       */
+      vinfo_vbuf->num_attribs = 0;
+      for (i = 0; i < num; i++) {
+	 draw_emit_vertex_attr(vinfo_vbuf, EMIT_4F, INTERP_PERSPECTIVE, i);
       }
+      draw_compute_vertex_size(vinfo_vbuf);
 
       /*
        * Loop over fragment shader inputs, searching for the matching output
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.c b/src/gallium/drivers/llvmpipe/lp_tile_cache.c
deleted file mode 100644
index ec3e002d628..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.c
+++ /dev/null
@@ -1,353 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-/**
- * Texture tile caching.
- *
- * Author:
- *    Brian Paul
- */
-
-#include "pipe/p_inlines.h"
-#include "util/u_memory.h"
-#include "util/u_math.h"
-#include "util/u_tile.h"
-#include "util/u_rect.h"
-#include "lp_context.h"
-#include "lp_surface.h"
-#include "lp_texture.h"
-#include "lp_tile_soa.h"
-#include "lp_tile_cache.h"
-
-
-#define MAX_WIDTH 4096
-#define MAX_HEIGHT 4096
-
-
-enum llvmpipe_tile_status
-{
-   LP_TILE_STATUS_UNDEFINED = 0,
-   LP_TILE_STATUS_CLEAR = 1,
-   LP_TILE_STATUS_DEFINED = 2
-};
-
-
-struct llvmpipe_cached_tile
-{
-   enum llvmpipe_tile_status status;
-
-   /** color in SOA format */
-   uint8_t *color;
-};
-
-
-struct llvmpipe_tile_cache
-{
-   struct pipe_screen *screen;
-   struct pipe_surface *surface;  /**< the surface we're caching */
-   struct pipe_transfer *transfer;
-   void *transfer_map;
-
-   struct llvmpipe_cached_tile entries[MAX_WIDTH/TILE_SIZE][MAX_HEIGHT/TILE_SIZE];
-
-   uint8_t clear_color[4];  /**< for color bufs */
-   uint clear_val;        /**< for z+stencil, or packed color clear value */
-
-   struct llvmpipe_cached_tile *last_tile;  /**< most recently retrieved tile */
-};
-
-
-struct llvmpipe_tile_cache *
-lp_create_tile_cache( struct pipe_screen *screen )
-{
-   struct llvmpipe_tile_cache *tc;
-   int maxLevels, maxTexSize;
-
-   /* sanity checking: max sure MAX_WIDTH/HEIGHT >= largest texture image */
-   maxLevels = screen->get_param(screen, PIPE_CAP_MAX_TEXTURE_2D_LEVELS);
-   maxTexSize = 1 << (maxLevels - 1);
-   assert(MAX_WIDTH >= maxTexSize);
-
-   tc = CALLOC_STRUCT( llvmpipe_tile_cache );
-   if(!tc)
-      return NULL;
-
-   tc->screen = screen;
-
-   return tc;
-}
-
-
-void
-lp_destroy_tile_cache(struct llvmpipe_tile_cache *tc)
-{
-   struct pipe_screen *screen;
-   unsigned x, y;
-
-   for (y = 0; y < MAX_HEIGHT; y += TILE_SIZE) {
-      for (x = 0; x < MAX_WIDTH; x += TILE_SIZE) {
-         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-
-         if(tile->color)
-            align_free(tile->color);
-      }
-   }
-
-   if (tc->transfer) {
-      screen = tc->transfer->texture->screen;
-      screen->tex_transfer_destroy(tc->transfer);
-   }
-
-   FREE( tc );
-}
-
-
-/**
- * Specify the surface to cache.
- */
-void
-lp_tile_cache_set_surface(struct llvmpipe_tile_cache *tc,
-                          struct pipe_surface *ps)
-{
-   if (tc->transfer) {
-      struct pipe_screen *screen = tc->transfer->texture->screen;
-
-      if (ps == tc->surface)
-         return;
-
-      if (tc->transfer_map) {
-         screen->transfer_unmap(screen, tc->transfer);
-         tc->transfer_map = NULL;
-      }
-
-      screen->tex_transfer_destroy(tc->transfer);
-      tc->transfer = NULL;
-   }
-
-   tc->surface = ps;
-
-   if (ps) {
-      struct pipe_screen *screen = ps->texture->screen;
-      unsigned x, y;
-
-      tc->transfer = screen->get_tex_transfer(screen, ps->texture, ps->face,
-                                              ps->level, ps->zslice,
-                                              PIPE_TRANSFER_READ_WRITE,
-                                              0, 0, ps->width, ps->height);
-
-      for (y = 0; y < ps->height; y += TILE_SIZE) {
-         for (x = 0; x < ps->width; x += TILE_SIZE) {
-            struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-
-            tile->status = LP_TILE_STATUS_UNDEFINED;
-
-            if(!tile->color)
-               tile->color = align_malloc( TILE_SIZE*TILE_SIZE*NUM_CHANNELS, 16 );
-         }
-      }
-   }
-}
-
-
-/**
- * Return the transfer being cached.
- */
-struct pipe_surface *
-lp_tile_cache_get_surface(struct llvmpipe_tile_cache *tc)
-{
-   return tc->surface;
-}
-
-
-void
-lp_tile_cache_map_transfers(struct llvmpipe_tile_cache *tc)
-{
-   if (tc->transfer && !tc->transfer_map)
-      tc->transfer_map = tc->screen->transfer_map(tc->screen, tc->transfer);
-}
-
-
-void
-lp_tile_cache_unmap_transfers(struct llvmpipe_tile_cache *tc)
-{
-   if (tc->transfer_map) {
-      tc->screen->transfer_unmap(tc->screen, tc->transfer);
-      tc->transfer_map = NULL;
-   }
-}
-
-
-/**
- * Set a tile to a solid color.
- */
-static void
-clear_tile(struct llvmpipe_cached_tile *tile,
-           uint8_t clear_color[4])
-{
-   if (clear_color[0] == clear_color[1] &&
-       clear_color[1] == clear_color[2] &&
-       clear_color[2] == clear_color[3]) {
-      memset(tile->color, clear_color[0], TILE_SIZE * TILE_SIZE * 4);
-   }
-   else {
-      uint x, y, chan;
-      for (y = 0; y < TILE_SIZE; y++)
-         for (x = 0; x < TILE_SIZE; x++)
-            for (chan = 0; chan < 4; ++chan)
-               TILE_PIXEL(tile->color, x, y, chan) = clear_color[chan];
-   }
-}
-
-
-/**
- * Flush the tile cache: write all dirty tiles back to the transfer.
- * any tiles "flagged" as cleared will be "really" cleared.
- */
-void
-lp_flush_tile_cache(struct llvmpipe_tile_cache *tc)
-{
-   struct pipe_transfer *pt = tc->transfer;
-   unsigned x, y;
-
-   if(!pt)
-      return;
-
-   assert(tc->transfer_map);
-
-   /* push the tile to all positions marked as clear */
-   for (y = 0; y < pt->height; y += TILE_SIZE) {
-      for (x = 0; x < pt->width; x += TILE_SIZE) {
-         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-
-         if(tile->status != LP_TILE_STATUS_UNDEFINED) {
-            unsigned w = TILE_SIZE;
-            unsigned h = TILE_SIZE;
-
-            if (!pipe_clip_tile(x, y, &w, &h, pt)) {
-               switch(tile->status) {
-               case LP_TILE_STATUS_CLEAR:
-                  /* Actually clear the tiles which were flagged as being in a
-                   * clear state. */
-                  util_fill_rect(tc->transfer_map, &pt->block, pt->stride,
-                                 x, y, w, h,
-                                 tc->clear_val);
-                  break;
-
-               case LP_TILE_STATUS_DEFINED:
-                  lp_tile_write_4ub(pt->format,
-                                    tile->color,
-                                    tc->transfer_map, pt->stride,
-                                    x, y, w, h);
-                  break;
-
-               default:
-                  assert(0);
-                  break;
-               }
-            }
-
-            tile->status = LP_TILE_STATUS_UNDEFINED;
-         }
-      }
-   }
-}
-
-
-/**
- * Get a tile from the cache.
- * \param x, y  position of tile, in pixels
- */
-void *
-lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
-                   unsigned x, unsigned y )
-{
-   struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-   struct pipe_transfer *pt = tc->transfer;
-   
-   assert(tc->surface);
-   assert(tc->transfer);
-
-   switch(tile->status) {
-   case LP_TILE_STATUS_CLEAR:
-      /* don't get tile from framebuffer, just clear it */
-      clear_tile(tile, tc->clear_color);
-      tile->status = LP_TILE_STATUS_DEFINED;
-      break;
-
-   case LP_TILE_STATUS_UNDEFINED: {
-      unsigned w = TILE_SIZE;
-      unsigned h = TILE_SIZE;
-
-      x &= ~(TILE_SIZE - 1);
-      y &= ~(TILE_SIZE - 1);
-
-      if (!pipe_clip_tile(x, y, &w, &h, tc->transfer))
-         lp_tile_read_4ub(pt->format,
-                          tile->color,
-                          tc->transfer_map, tc->transfer->stride,
-                          x, y, w, h);
-
-      tile->status = LP_TILE_STATUS_DEFINED;
-      break;
-   }
-
-   case LP_TILE_STATUS_DEFINED:
-      /* nothing to do */
-      break;
-   }
-
-   return tile->color;
-}
-
-
-/**
- * When a whole surface is being cleared to a value we can avoid
- * fetching tiles above.
- * Save the color and set a 'clearflag' for each tile of the screen.
- */
-void
-lp_tile_cache_clear(struct llvmpipe_tile_cache *tc, const float *rgba,
-                    uint clearValue)
-{
-   struct pipe_transfer *pt = tc->transfer;
-   const unsigned w = pt->width;
-   const unsigned h = pt->height;
-   unsigned x, y, chan;
-
-   for(chan = 0; chan < 4; ++chan)
-      tc->clear_color[chan] = float_to_ubyte(rgba[chan]);
-
-   tc->clear_val = clearValue;
-
-   /* push the tile to all positions marked as clear */
-   for (y = 0; y < h; y += TILE_SIZE) {
-      for (x = 0; x < w; x += TILE_SIZE) {
-         struct llvmpipe_cached_tile *tile = &tc->entries[y/TILE_SIZE][x/TILE_SIZE];
-         tile->status = LP_TILE_STATUS_CLEAR;
-      }
-   }
-}
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_cache.h b/src/gallium/drivers/llvmpipe/lp_tile_cache.h
deleted file mode 100644
index 161bab37991..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_tile_cache.h
+++ /dev/null
@@ -1,71 +0,0 @@
-/**************************************************************************
- * 
- * Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
- * All Rights Reserved.
- * 
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- * 
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- * 
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- * 
- **************************************************************************/
-
-#ifndef LP_TILE_CACHE_H
-#define LP_TILE_CACHE_H
-
-
-#include "pipe/p_compiler.h"
-#include "lp_tile_soa.h"
-
-
-struct llvmpipe_tile_cache;  /* opaque */
-
-
-extern struct llvmpipe_tile_cache *
-lp_create_tile_cache( struct pipe_screen *screen );
-
-extern void
-lp_destroy_tile_cache(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_set_surface(struct llvmpipe_tile_cache *tc,
-                          struct pipe_surface *lps);
-
-extern struct pipe_surface *
-lp_tile_cache_get_surface(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_map_transfers(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_unmap_transfers(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_flush_tile_cache(struct llvmpipe_tile_cache *tc);
-
-extern void
-lp_tile_cache_clear(struct llvmpipe_tile_cache *tc, const float *rgba,
-                    uint clearValue);
-
-extern void *
-lp_get_cached_tile(struct llvmpipe_tile_cache *tc,
-                   unsigned x, unsigned y );
-
-
-#endif /* LP_TILE_CACHE_H */
-
-- 
cgit v1.2.3