43 files changed, 2789 insertions, 1861 deletions
diff --git a/src/gallium/drivers/llvmpipe/Makefile b/src/gallium/drivers/llvmpipe/Makefile
index 55b877b4ab9..08da2286b05 100644
--- a/src/gallium/drivers/llvmpipe/Makefile
+++ b/src/gallium/drivers/llvmpipe/Makefile
@@ -28,8 +28,6 @@ C_SOURCES = \
 	lp_scene_queue.c \
 	lp_screen.c \
 	lp_setup.c \
-	lp_setup_coef.c \
-	lp_setup_coef_intrin.c \
 	lp_setup_line.c \
 	lp_setup_point.c \
 	lp_setup_tri.c \
@@ -38,6 +36,7 @@ C_SOURCES = \
 	lp_state_clip.c \
 	lp_state_derived.c \
 	lp_state_fs.c \
+	lp_state_setup.c \
 	lp_state_gs.c \
 	lp_state_rasterizer.c \
 	lp_state_sampler.c \
@@ -63,12 +62,12 @@ PROGS := lp_test_format	\
 # Need this for the lp_test_*.o files
 CLEAN_EXTRA = *.o
 
+include ../../Makefile.template
+
 lp_test_sincos.o : sse_mathfun.h
 
 PROGS_DEPS := ../../auxiliary/libgallium.a
 
-include ../../Makefile.template
-
 lp_tile_soa.c: lp_tile_soa.py ../../auxiliary/util/u_format_parse.py ../../auxiliary/util/u_format_pack.py ../../auxiliary/util/u_format.csv
 	python lp_tile_soa.py ../../auxiliary/util/u_format.csv > $@
 
diff --git a/src/gallium/drivers/llvmpipe/SConscript b/src/gallium/drivers/llvmpipe/SConscript
index 650435f0f19..49950153a4f 100644
--- a/src/gallium/drivers/llvmpipe/SConscript
+++ b/src/gallium/drivers/llvmpipe/SConscript
@@ -27,13 +27,7 @@ env.Depends('lp_tile_soa.c', [
 ])
 
 
-# Only enable SSSE3 for lp_tile_soa_sse3.c
-ssse3_env = env.Clone()
-if env['gcc'] \
-   and distutils.version.LooseVersion(env['CCVERSION']) >= distutils.version.LooseVersion('4.3') \
-   and env['machine'] in ('x86', 'x86_64') :
-    ssse3_env.Append(CCFLAGS = ['-mssse3'])
-lp_tile_soa_os = ssse3_env.SharedObject('lp_tile_soa.c')
+lp_tile_soa_os = env.SharedObject('lp_tile_soa.c')
 
 
 llvmpipe = env.ConvenienceLibrary(
@@ -64,13 +58,12 @@ llvmpipe = env.ConvenienceLibrary(
 		'lp_setup_line.c',
 		'lp_setup_point.c',
 		'lp_setup_tri.c',
-		'lp_setup_coef.c',
-		'lp_setup_coef_intrin.c',
 		'lp_setup_vbuf.c',
 		'lp_state_blend.c',
 		'lp_state_clip.c',
 		'lp_state_derived.c',
 		'lp_state_fs.c',
+		'lp_state_setup.c',
 		'lp_state_gs.c',
 		'lp_state_rasterizer.c',
 		'lp_state_sampler.c',
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
index e28efe778f9..e50643790c8 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.c
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
                     struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
-                    LLVMValueRef ref)
+                    LLVMValueRef ref,
+                    boolean do_branch)
 {
    struct lp_build_context bld;
    LLVMValueRef test;
@@ -60,4 +61,7 @@ lp_build_alpha_test(LLVMBuilderRef builder,
    lp_build_name(test, "alpha_mask");
 
    lp_build_mask_update(mask, test);
+
+   if (do_branch)
+      lp_build_mask_check(mask);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
index 44603b418c0..27ca8aad4d4 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_alpha.h
@@ -48,7 +48,8 @@ lp_build_alpha_test(LLVMBuilderRef builder,
                     struct lp_type type,
                     struct lp_build_mask_context *mask,
                     LLVMValueRef alpha,
-                    LLVMValueRef ref);
+                    LLVMValueRef ref,
+                    boolean do_branch);
 
 
 #endif /* !LP_BLD_ALPHA_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index 7561899a74e..7eb76d4fb31 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -1,6 +1,6 @@
 /**************************************************************************
  *
- * Copyright 2009 VMware, Inc.
+ * Copyright 2009-2010 VMware, Inc.
  * All Rights Reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a
@@ -53,15 +53,8 @@
  *  ... ... ... ... ... ... ... ... ...
  *
  *
- * Stencil test:
- * Two-sided stencil test is supported but probably not as efficient as
- * it could be.  Currently, we use if/then/else constructs to do the
- * operations for front vs. back-facing polygons.  We could probably do
- * both the front and back arithmetic then use a Select() instruction to
- * choose the result depending on polyon orientation.  We'd have to
- * measure performance both ways and see which is better.
- *
  * @author Jose Fonseca <[email protected]>
+ * @author Brian Paul <[email protected]>
  */
 
 #include "pipe/p_state.h"
@@ -71,6 +64,7 @@
 #include "gallivm/lp_bld_arit.h"
 #include "gallivm/lp_bld_bitarit.h"
 #include "gallivm/lp_bld_const.h"
+#include "gallivm/lp_bld_conv.h"
 #include "gallivm/lp_bld_logic.h"
 #include "gallivm/lp_bld_flow.h"
 #include "gallivm/lp_bld_intr.h"
@@ -128,57 +122,32 @@ lp_build_stencil_test_single(struct lp_build_context *bld,
 /**
  * Do the one or two-sided stencil test comparison.
  * \sa lp_build_stencil_test_single
- * \param face  an integer indicating front (+) or back (-) facing polygon.
- *              If NULL, assume front-facing.
+ * \param front_facing  an integer vector mask, indicating front (~0) or back
+ *                      (0) facing polygon. If NULL, assume front-facing.
  */
 static LLVMValueRef
 lp_build_stencil_test(struct lp_build_context *bld,
                       const struct pipe_stencil_state stencil[2],
                       LLVMValueRef stencilRefs[2],
                       LLVMValueRef stencilVals,
-                      LLVMValueRef face)
+                      LLVMValueRef front_facing)
 {
    LLVMValueRef res;
 
    assert(stencil[0].enabled);
 
-   if (stencil[1].enabled && face) {
-      /* do two-sided test */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
-
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+   /* do front face test */
+   res = lp_build_stencil_test_single(bld, &stencil[0],
+                                      stencilRefs[0], stencilVals);
 
-      lp_build_flow_scope_declare(flow_ctx, &result);
+   if (stencil[1].enabled && front_facing) {
+      /* do back face test */
+      LLVMValueRef back_res;
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
-
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[0],
-                                               stencilRefs[0], stencilVals);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_test_single(bld, &stencil[1],
-                                               stencilRefs[1], stencilVals);
-      }
-      lp_build_endif(&if_ctx);
+      back_res = lp_build_stencil_test_single(bld, &stencil[1],
+                                              stencilRefs[1], stencilVals);
 
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
-
-      res = result;
-   }
-   else {
-      /* do single-side test */
-      res = lp_build_stencil_test_single(bld, &stencil[0],
-                                         stencilRefs[0], stencilVals);
+      res = lp_build_select(bld, front_facing, res, back_res);
    }
 
    return res;
@@ -195,14 +164,12 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
                            const struct pipe_stencil_state *stencil,
                            enum stencil_op op,
                            LLVMValueRef stencilRef,
-                           LLVMValueRef stencilVals,
-                           LLVMValueRef mask)
+                           LLVMValueRef stencilVals)
 
 {
-   const unsigned stencilMax = 255; /* XXX fix */
    struct lp_type type = bld->type;
    LLVMValueRef res;
-   LLVMValueRef max = lp_build_const_int_vec(type, stencilMax);
+   LLVMValueRef max = lp_build_const_int_vec(type, 0xff);
    unsigned stencil_op;
 
    assert(type.sign);
@@ -255,19 +222,7 @@ lp_build_stencil_op_single(struct lp_build_context *bld,
       break;
    default:
       assert(0 && "bad stencil op mode");
-      res = NULL;
-   }
-
-   if (stencil->writemask != stencilMax) {
-      /* mask &= stencil->writemask */
-      LLVMValueRef writemask = lp_build_const_int_vec(type, stencil->writemask);
-      mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
-      /* res = (res & mask) | (stencilVals & ~mask) */
-      res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
-   }
-   else {
-      /* res = mask ? res : stencilVals */
-      res = lp_build_select(bld, mask, res, stencilVals);
+      res = bld->undef;
    }
 
    return res;
@@ -284,49 +239,40 @@ lp_build_stencil_op(struct lp_build_context *bld,
                     LLVMValueRef stencilRefs[2],
                     LLVMValueRef stencilVals,
                     LLVMValueRef mask,
-                    LLVMValueRef face)
+                    LLVMValueRef front_facing)
 
 {
-   assert(stencil[0].enabled);
-
-   if (stencil[1].enabled && face) {
-      /* do two-sided op */
-      struct lp_build_flow_context *flow_ctx;
-      struct lp_build_if_state if_ctx;
-      LLVMValueRef front_facing;
-      LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
-      LLVMValueRef result = bld->undef;
+   LLVMValueRef res;
 
-      flow_ctx = lp_build_flow_create(bld->builder);
-      lp_build_flow_scope_begin(flow_ctx);
+   assert(stencil[0].enabled);
 
-      lp_build_flow_scope_declare(flow_ctx, &result);
+   /* do front face op */
+   res = lp_build_stencil_op_single(bld, &stencil[0], op,
+                                     stencilRefs[0], stencilVals);
 
-      /* front_facing = face > 0.0 */
-      front_facing = LLVMBuildFCmp(bld->builder, LLVMRealUGT, face, zero, "");
+   if (stencil[1].enabled && front_facing) {
+      /* do back face op */
+      LLVMValueRef back_res;
 
-      lp_build_if(&if_ctx, flow_ctx, bld->builder, front_facing);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[0], op,
-                                             stencilRefs[0], stencilVals, mask);
-      }
-      lp_build_else(&if_ctx);
-      {
-         result = lp_build_stencil_op_single(bld, &stencil[1], op,
-                                             stencilRefs[1], stencilVals, mask);
-      }
-      lp_build_endif(&if_ctx);
+      back_res = lp_build_stencil_op_single(bld, &stencil[1], op,
+                                            stencilRefs[1], stencilVals);
 
-      lp_build_flow_scope_end(flow_ctx);
-      lp_build_flow_destroy(flow_ctx);
+      res = lp_build_select(bld, front_facing, res, back_res);
+   }
 
-      return result;
+   if (stencil->writemask != 0xff) {
+      /* mask &= stencil->writemask */
+      LLVMValueRef writemask = lp_build_const_int_vec(bld->type, stencil->writemask);
+      mask = LLVMBuildAnd(bld->builder, mask, writemask, "");
+      /* res = (res & mask) | (stencilVals & ~mask) */
+      res = lp_build_select_bitwise(bld, writemask, res, stencilVals);
    }
    else {
-      /* do single-sided op */
-      return lp_build_stencil_op_single(bld, &stencil[0], op,
-                                        stencilRefs[0], stencilVals, mask);
+      /* res = mask ? res : stencilVals */
+      res = lp_build_select(bld, mask, res, stencilVals);
    }
+
+   return res;
 }
 
 
@@ -358,8 +304,13 @@ lp_depth_type(const struct util_format_description *format_desc,
    }
    else if(format_desc->channel[swizzle].type == UTIL_FORMAT_TYPE_UNSIGNED) {
       assert(format_desc->block.bits <= 32);
-      if(format_desc->channel[swizzle].normalized)
-         type.norm = TRUE;
+      assert(format_desc->channel[swizzle].normalized);
+      if (format_desc->channel[swizzle].size < format_desc->block.bits) {
+         /* Prefer signed integers when possible, as SSE has less support
+          * for unsigned comparison;
+          */
+         type.sign = TRUE;
+      }
    }
    else
       assert(0);
@@ -381,7 +332,7 @@ lp_depth_type(const struct util_format_description *format_desc,
  */
 static boolean
 get_z_shift_and_mask(const struct util_format_description *format_desc,
-                     unsigned *shift, unsigned *mask)
+                     unsigned *shift, unsigned *width, unsigned *mask)
 {
    const unsigned total_bits = format_desc->block.bits;
    unsigned z_swizzle;
@@ -397,12 +348,14 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
    if (z_swizzle == UTIL_FORMAT_SWIZZLE_NONE)
       return FALSE;
 
+   *width = format_desc->channel[z_swizzle].size;
+
    padding_right = 0;
    for (chan = 0; chan < z_swizzle; ++chan)
       padding_right += format_desc->channel[chan].size;
 
    padding_left =
-      total_bits - (padding_right + format_desc->channel[z_swizzle].size);
+      total_bits - (padding_right + *width);
 
    if (padding_left || padding_right) {
       unsigned long long mask_left = (1ULL << (total_bits - padding_left)) - 1;
@@ -413,7 +366,7 @@ get_z_shift_and_mask(const struct util_format_description *format_desc,
       *mask = 0xffffffff;
    }
 
-   *shift = padding_left;
+   *shift = padding_right;
 
    return TRUE;
 }
@@ -457,7 +410,7 @@ get_s_shift_and_mask(const struct util_format_description *format_desc,
  * \param maskvalue is the depth test mask.
  * \param counter is a pointer of the uint32 counter.
  */
-static void
+void
 lp_build_occlusion_count(LLVMBuilderRef builder,
                          struct lp_type type,
                          LLVMValueRef maskvalue,
@@ -494,31 +447,57 @@ lp_build_occlusion_count(LLVMBuilderRef builder,
  * \param format_desc  description of the depth/stencil surface
  * \param mask  the alive/dead pixel mask for the quad (vector)
  * \param stencil_refs  the front/back stencil ref values (scalar)
- * \param z_src  the incoming depth/stencil values (a 2x2 quad)
+ * \param z_src  the incoming depth/stencil values (a 2x2 quad, float32)
  * \param zs_dst_ptr  pointer to depth/stencil values in framebuffer
- * \param facing  contains float value indicating front/back facing polygon
+ * \param facing  contains boolean value indicating front/back facing polygon
  */
 void
 lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             const struct pipe_depth_state *depth,
                             const struct pipe_stencil_state stencil[2],
-                            struct lp_type type,
+                            struct lp_type z_src_type,
                             const struct util_format_description *format_desc,
                             struct lp_build_mask_context *mask,
                             LLVMValueRef stencil_refs[2],
                             LLVMValueRef z_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef face,
-                            LLVMValueRef counter)
+                            LLVMValueRef *zs_value,
+                            boolean do_branch)
 {
-   struct lp_build_context bld;
-   struct lp_build_context sbld;
+   struct lp_type z_type;
+   struct lp_build_context z_bld;
+   struct lp_build_context s_bld;
    struct lp_type s_type;
+   unsigned z_shift = 0, z_width = 0, z_mask = 0;
    LLVMValueRef zs_dst, z_dst = NULL;
    LLVMValueRef stencil_vals = NULL;
    LLVMValueRef z_bitmask = NULL, stencil_shift = NULL;
    LLVMValueRef z_pass = NULL, s_pass_mask = NULL;
-   LLVMValueRef orig_mask = mask->value;
+   LLVMValueRef orig_mask = lp_build_mask_value(mask);
+   LLVMValueRef front_facing = NULL;
+
+
+   /*
+    * Depths are expected to be between 0 and 1, even if they are stored in
+    * floats. Setting these bits here will ensure that the lp_build_conv() call
+    * below won't try to unnecessarily clamp the incoming values.
+    */
+   if(z_src_type.floating) {
+      z_src_type.sign = FALSE;
+      z_src_type.norm = TRUE;
+   }
+   else {
+      assert(!z_src_type.sign);
+      assert(z_src_type.norm);
+   }
+
+   /* Pick the depth type. */
+   z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+
+   /* FIXME: Cope with a depth test type with a different bit width. */
+   assert(z_type.width == z_src_type.width);
+   assert(z_type.length == z_src_type.length);
 
    /* Sanity checking */
    {
@@ -540,8 +519,8 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
       }
 
       assert(z_swizzle < 4);
-      assert(format_desc->block.bits == type.width);
-      if (type.floating) {
+      assert(format_desc->block.bits == z_type.width);
+      if (z_type.floating) {
          assert(z_swizzle == 0);
          assert(format_desc->channel[z_swizzle].type ==
                 UTIL_FORMAT_TYPE_FLOAT);
@@ -552,54 +531,56 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          assert(format_desc->channel[z_swizzle].type ==
                 UTIL_FORMAT_TYPE_UNSIGNED);
          assert(format_desc->channel[z_swizzle].normalized);
-         assert(!type.fixed);
-         assert(!type.sign);
-         assert(type.norm);
+         assert(!z_type.fixed);
       }
    }
 
 
    /* Setup build context for Z vals */
-   lp_build_context_init(&bld, builder, type);
+   lp_build_context_init(&z_bld, builder, z_type);
 
    /* Setup build context for stencil vals */
-   s_type = lp_type_int_vec(type.width);
-   lp_build_context_init(&sbld, builder, s_type);
+   s_type = lp_type_int_vec(z_type.width);
+   lp_build_context_init(&s_bld, builder, s_type);
 
    /* Load current z/stencil value from z/stencil buffer */
+   zs_dst_ptr = LLVMBuildBitCast(builder,
+                                 zs_dst_ptr,
+                                 LLVMPointerType(z_bld.vec_type, 0), "");
    zs_dst = LLVMBuildLoad(builder, zs_dst_ptr, "");
 
-   lp_build_name(zs_dst, "zsbufval");
+   lp_build_name(zs_dst, "zs_dst");
 
 
    /* Compute and apply the Z/stencil bitmasks and shifts.
     */
    {
-      unsigned z_shift, z_mask;
       unsigned s_shift, s_mask;
 
-      if (get_z_shift_and_mask(format_desc, &z_shift, &z_mask)) {
-         if (z_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, z_shift);
-            z_src = LLVMBuildLShr(builder, z_src, shift, "");
-         }
-
+      if (get_z_shift_and_mask(format_desc, &z_shift, &z_width, &z_mask)) {
          if (z_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, z_mask);
-            z_src = LLVMBuildAnd(builder, z_src, mask, "");
-            z_dst = LLVMBuildAnd(builder, zs_dst, mask, "");
-            z_bitmask = mask;  /* used below */
+            z_bitmask = lp_build_const_int_vec(z_type, z_mask);
          }
-         else {
+
+         /*
+          * Align the framebuffer Z 's LSB to the right.
+          */
+         if (z_shift) {
+            LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+            z_dst = LLVMBuildLShr(builder, zs_dst, shift, "z_dst");
+         } else if (z_bitmask) {
+	    /* TODO: Instead of loading a mask from memory and ANDing, it's
+	     * probably faster to just shake the bits with two shifts. */
+            z_dst = LLVMBuildAnd(builder, zs_dst, z_bitmask, "z_dst");
+         } else {
             z_dst = zs_dst;
+            lp_build_name(z_dst, "z_dst");
          }
-
-         lp_build_name(z_dst, "zsbuf.z");
       }
 
       if (get_s_shift_and_mask(format_desc, &s_shift, &s_mask)) {
          if (s_shift) {
-            LLVMValueRef shift = lp_build_const_int_vec(type, s_shift);
+            LLVMValueRef shift = lp_build_const_int_vec(s_type, s_shift);
             stencil_vals = LLVMBuildLShr(builder, zs_dst, shift, "");
             stencil_shift = shift;  /* used below */
          }
@@ -608,35 +589,85 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          }
 
          if (s_mask != 0xffffffff) {
-            LLVMValueRef mask = lp_build_const_int_vec(type, s_mask);
+            LLVMValueRef mask = lp_build_const_int_vec(s_type, s_mask);
             stencil_vals = LLVMBuildAnd(builder, stencil_vals, mask, "");
          }
 
-         lp_build_name(stencil_vals, "stencil");
+         lp_build_name(stencil_vals, "s_dst");
       }
    }
 
-
    if (stencil[0].enabled) {
+
+      if (face) {
+         LLVMValueRef zero = LLVMConstInt(LLVMInt32Type(), 0, 0);
+
+         /* front_facing = face != 0 ? ~0 : 0 */
+         front_facing = LLVMBuildICmp(builder, LLVMIntNE, face, zero, "");
+         front_facing = LLVMBuildSExt(builder, front_facing,
+                                      LLVMIntType(s_bld.type.length*s_bld.type.width),
+                                      "");
+         front_facing = LLVMBuildBitCast(builder, front_facing,
+                                         s_bld.int_vec_type, "");
+      }
+
       /* convert scalar stencil refs into vectors */
-      stencil_refs[0] = lp_build_broadcast_scalar(&bld, stencil_refs[0]);
-      stencil_refs[1] = lp_build_broadcast_scalar(&bld, stencil_refs[1]);
+      stencil_refs[0] = lp_build_broadcast_scalar(&s_bld, stencil_refs[0]);
+      stencil_refs[1] = lp_build_broadcast_scalar(&s_bld, stencil_refs[1]);
 
-      s_pass_mask = lp_build_stencil_test(&sbld, stencil,
-                                          stencil_refs, stencil_vals, face);
+      s_pass_mask = lp_build_stencil_test(&s_bld, stencil,
+                                          stencil_refs, stencil_vals,
+                                          front_facing);
 
       /* apply stencil-fail operator */
       {
-         LLVMValueRef s_fail_mask = lp_build_andnot(&bld, orig_mask, s_pass_mask);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, S_FAIL_OP,
+         LLVMValueRef s_fail_mask = lp_build_andnot(&s_bld, orig_mask, s_pass_mask);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, S_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            s_fail_mask, face);
+                                            s_fail_mask, front_facing);
       }
    }
 
    if (depth->enabled) {
+      /*
+       * Convert fragment Z to the desired type, aligning the LSB to the right.
+       */
+
+      assert(z_type.width == z_src_type.width);
+      assert(z_type.length == z_src_type.length);
+      assert(lp_check_value(z_src_type, z_src));
+      if (z_src_type.floating) {
+         /*
+          * Convert from floating point values
+          */
+
+         if (!z_type.floating) {
+            z_src = lp_build_clamped_float_to_unsigned_norm(builder,
+                                                            z_src_type,
+                                                            z_width,
+                                                            z_src);
+         }
+      } else {
+         /*
+          * Convert from unsigned normalized values.
+          */
+
+         assert(!z_src_type.sign);
+         assert(!z_src_type.fixed);
+         assert(z_src_type.norm);
+         assert(!z_type.floating);
+         if (z_src_type.width > z_width) {
+            LLVMValueRef shift = lp_build_const_int_vec(z_src_type,
+                                                        z_src_type.width - z_width);
+            z_src = LLVMBuildLShr(builder, z_src, shift, "");
+         }
+      }
+      assert(lp_check_value(z_type, z_src));
+
+      lp_build_name(z_src, "z_src");
+
       /* compare src Z to dst Z, returning 'pass' mask */
-      z_pass = lp_build_cmp(&bld, depth->func, z_src, z_dst);
+      z_pass = lp_build_cmp(&z_bld, depth->func, z_src, z_dst);
 
       if (!stencil[0].enabled) {
          /* We can potentially skip all remaining operations here, but only
@@ -644,28 +675,28 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
           * buffer values.  Don't need to update Z buffer values.
           */
          lp_build_mask_update(mask, z_pass);
+
+         if (do_branch) {
+            lp_build_mask_check(mask);
+            do_branch = FALSE;
+         }
       }
 
       if (depth->writemask) {
-         LLVMValueRef zselectmask = mask->value;
+         LLVMValueRef zselectmask;
 
          /* mask off bits that failed Z test */
-         zselectmask = LLVMBuildAnd(builder, zselectmask, z_pass, "");
+         zselectmask = LLVMBuildAnd(builder, orig_mask, z_pass, "");
 
          /* mask off bits that failed stencil test */
          if (s_pass_mask) {
             zselectmask = LLVMBuildAnd(builder, zselectmask, s_pass_mask, "");
          }
 
-         /* if combined Z/stencil format, mask off the stencil bits */
-         if (z_bitmask) {
-            zselectmask = LLVMBuildAnd(builder, zselectmask, z_bitmask, "");
-         }
-
          /* Mix the old and new Z buffer values.
-          * z_dst[i] = (zselectmask[i] & z_src[i]) | (~zselectmask[i] & z_dst[i])
+          * z_dst[i] = zselectmask[i] ? z_src[i] : z_dst[i]
           */
-         z_dst = lp_build_select_bitwise(&bld, zselectmask, z_src, z_dst);
+         z_dst = lp_build_select(&z_bld, zselectmask, z_src, z_dst);
       }
 
       if (stencil[0].enabled) {
@@ -673,33 +704,35 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
          LLVMValueRef z_fail_mask, z_pass_mask;
 
          /* apply Z-fail operator */
-         z_fail_mask = lp_build_andnot(&bld, orig_mask, z_pass);
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_FAIL_OP,
+         z_fail_mask = lp_build_andnot(&z_bld, orig_mask, z_pass);
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_FAIL_OP,
                                             stencil_refs, stencil_vals,
-                                            z_fail_mask, face);
+                                            z_fail_mask, front_facing);
 
          /* apply Z-pass operator */
-         z_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, z_pass, "");
-         stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+         z_pass_mask = LLVMBuildAnd(z_bld.builder, orig_mask, z_pass, "");
+         stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                             stencil_refs, stencil_vals,
-                                            z_pass_mask, face);
+                                            z_pass_mask, front_facing);
       }
    }
    else {
       /* No depth test: apply Z-pass operator to stencil buffer values which
        * passed the stencil test.
        */
-      s_pass_mask = LLVMBuildAnd(bld.builder, orig_mask, s_pass_mask, "");
-      stencil_vals = lp_build_stencil_op(&sbld, stencil, Z_PASS_OP,
+      s_pass_mask = LLVMBuildAnd(s_bld.builder, orig_mask, s_pass_mask, "");
+      stencil_vals = lp_build_stencil_op(&s_bld, stencil, Z_PASS_OP,
                                          stencil_refs, stencil_vals,
-                                         s_pass_mask, face);
+                                         s_pass_mask, front_facing);
    }
 
-   /* The Z bits are already in the right place but we may need to shift the
-    * stencil bits before ORing Z with Stencil to make the final pixel value.
-    */
+   /* Put Z and ztencil bits in the right place */
+   if (z_dst && z_shift) {
+      LLVMValueRef shift = lp_build_const_int_vec(z_type, z_shift);
+      z_dst = LLVMBuildShl(builder, z_dst, shift, "");
+   }
    if (stencil_vals && stencil_shift)
-      stencil_vals = LLVMBuildShl(bld.builder, stencil_vals,
+      stencil_vals = LLVMBuildShl(s_bld.builder, stencil_vals,
                                   stencil_shift, "");
 
    /* Finally, merge/store the z/stencil values */
@@ -707,13 +740,13 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
        (stencil[0].enabled && stencil[0].writemask)) {
 
       if (z_dst && stencil_vals)
-         zs_dst = LLVMBuildOr(bld.builder, z_dst, stencil_vals, "");
+         zs_dst = LLVMBuildOr(z_bld.builder, z_dst, stencil_vals, "");
       else if (z_dst)
          zs_dst = z_dst;
       else
          zs_dst = stencil_vals;
 
-      LLVMBuildStore(builder, zs_dst, zs_dst_ptr);
+      *zs_value = zs_dst;
    }
 
    if (s_pass_mask)
@@ -722,6 +755,47 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
    if (depth->enabled && stencil[0].enabled)
       lp_build_mask_update(mask, z_pass);
 
-   if (counter)
-      lp_build_occlusion_count(builder, type, mask->value, counter);
+   if (do_branch)
+      lp_build_mask_check(mask);
+
+}
+
+
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef zs_dst_ptr,
+                     LLVMValueRef zs_value)
+{
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+                                 LLVMPointerType(LLVMTypeOf(zs_value), 0), "");
+
+   LLVMBuildStore(builder, zs_value, zs_dst_ptr);
+}
+
+
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+                              struct lp_type z_src_type,
+                              const struct util_format_description *format_desc,
+                              struct lp_build_mask_context *mask,
+                              LLVMValueRef zs_dst_ptr,
+                              LLVMValueRef zs_value)
+{
+   struct lp_type z_type;
+   struct lp_build_context z_bld;
+   LLVMValueRef z_dst;
+
+   /* XXX: pointlessly redo type logic:
+    */
+   z_type = lp_depth_type(format_desc, z_src_type.width*z_src_type.length);
+   lp_build_context_init(&z_bld, builder, z_type);
+
+   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr,
+                                 LLVMPointerType(z_bld.vec_type, 0), "");
+
+   z_dst = LLVMBuildLoad(builder, zs_dst_ptr, "zsbufval");
+   z_dst = lp_build_select(&z_bld, lp_build_mask_value(mask), zs_value, z_dst);
+
+   LLVMBuildStore(builder, z_dst, zs_dst_ptr);
 }
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index e257a5bd7d0..a54ef3a711e 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -61,7 +61,27 @@ lp_build_depth_stencil_test(LLVMBuilderRef builder,
                             LLVMValueRef zs_src,
                             LLVMValueRef zs_dst_ptr,
                             LLVMValueRef facing,
-                            LLVMValueRef counter);
+                            LLVMValueRef *zs_value,
+                            boolean do_branch);
 
+void
+lp_build_depth_write(LLVMBuilderRef builder,
+                     const struct util_format_description *format_desc,
+                     LLVMValueRef zs_dst_ptr,
+                     LLVMValueRef zs_value);
+
+void
+lp_build_deferred_depth_write(LLVMBuilderRef builder,
+                              struct lp_type z_src_type,
+                              const struct util_format_description *format_desc,
+                              struct lp_build_mask_context *mask,
+                              LLVMValueRef zs_dst_ptr,
+                              LLVMValueRef zs_value);
+
+void
+lp_build_occlusion_count(LLVMBuilderRef builder,
+                         struct lp_type type,
+                         LLVMValueRef maskvalue,
+                         LLVMValueRef counter);
 
 #endif /* !LP_BLD_DEPTH_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.c b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
index 2a374f8c390..c9da8900d0c 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.c
@@ -206,7 +206,7 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
             dadq2 = LLVMBuildFAdd(builder, dadq, dadq, "");
 
             /*
-             * a = a0 + x * dadx + y * dady
+             * a = a0 + (x * dadx + y * dady)
              */
 
             if (attrib == 0 && chan == 0) {
@@ -219,11 +219,11 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
                a = a0;
                if (interp != LP_INTERP_CONSTANT &&
                    interp != LP_INTERP_FACING) {
-                  LLVMValueRef tmp;
-                  tmp = LLVMBuildFMul(builder, bld->x, dadx, "");
-                  a = LLVMBuildFAdd(builder, a, tmp, "");
-                  tmp = LLVMBuildFMul(builder, bld->y, dady, "");
-                  a = LLVMBuildFAdd(builder, a, tmp, "");
+                  LLVMValueRef ax, ay, axy;
+                  ax = LLVMBuildFMul(builder, bld->x, dadx, "");
+                  ay = LLVMBuildFMul(builder, bld->y, dady, "");
+                  axy = LLVMBuildFAdd(builder, ax, ay, "");
+                  a = LLVMBuildFAdd(builder, a, axy, "");
                }
             }
 
@@ -272,7 +272,10 @@ coeffs_init(struct lp_build_interp_soa_context *bld,
  * This is called when we move from one quad to the next.
  */
 static void
-attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
+attribs_update(struct lp_build_interp_soa_context *bld,
+               int quad_index,
+               int start,
+               int end)
 {
    struct lp_build_context *coeff_bld = &bld->coeff_bld;
    LLVMValueRef shuffle = lp_build_const_int_vec(coeff_bld->type, quad_index);
@@ -282,7 +285,7 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
 
    assert(quad_index < 4);
 
-   for(attrib = 0; attrib < bld->num_attribs; ++attrib) {
+   for(attrib = start; attrib < end; ++attrib) {
       const unsigned mask = bld->mask[attrib];
       const unsigned interp = bld->interp[attrib];
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
@@ -350,6 +353,14 @@ attribs_update(struct lp_build_interp_soa_context *bld, int quad_index)
                }
 #endif
 
+               if (attrib == 0 && chan == 2) {
+                  /* FIXME: Depth values can exceed 1.0, due to the fact that
+                   * setup interpolation coefficients refer to (0,0) which causes
+                   * precision loss. So we must clamp to 1.0 here to avoid artifacts
+                   */
+                  a = lp_build_min(coeff_bld, a, coeff_bld->one);
+               }
+
                attrib_name(a, attrib, chan, "");
             }
             bld->attribs[attrib][chan] = a;
@@ -434,8 +445,6 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
    pos_init(bld, x0, y0);
 
    coeffs_init(bld, a0_ptr, dadx_ptr, dady_ptr);
-
-   attribs_update(bld, 0);
 }
 
 
@@ -443,10 +452,20 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
  * Advance the position and inputs to the given quad within the block.
  */
 void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
-                           int quad_index)
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+                                  int quad_index)
+{
+   assert(quad_index < 4);
+
+   attribs_update(bld, quad_index, 1, bld->num_attribs);
+}
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
+                                  int quad_index)
 {
    assert(quad_index < 4);
 
-   attribs_update(bld, quad_index);
+   attribs_update(bld, quad_index, 0, 1);
 }
+
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_interp.h b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
index 3054030f739..a7ebdd1bfa2 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_interp.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_interp.h
@@ -46,7 +46,31 @@
 
 #include "tgsi/tgsi_exec.h"
 
-#include "lp_setup.h"
+/**
+ * Describes how to compute the interpolation coefficients (a0, dadx, dady)
+ * from the vertices passed into our triangle/line/point functions by the
+ * draw module.
+ *
+ * Vertices are treated as an array of float[4] values, indexed by
+ * src_index.
+ *
+ * LP_INTERP_COLOR is translated to either LP_INTERP_CONSTANT or
+ * LINEAR depending on flatshade state.
+ */
+enum lp_interp {
+   LP_INTERP_CONSTANT,
+   LP_INTERP_COLOR,
+   LP_INTERP_LINEAR,
+   LP_INTERP_PERSPECTIVE,
+   LP_INTERP_POSITION,
+   LP_INTERP_FACING
+};
+
+struct lp_shader_input {
+   ushort interp:4;       /* enum lp_interp */
+   ushort usage_mask:4;   /* bitmask of TGSI_WRITEMASK_x flags */
+   ushort src_index:8;    /* where to find values in incoming vertices */
+};
 
 
 struct lp_build_interp_soa_context
@@ -89,7 +113,11 @@ lp_build_interp_soa_init(struct lp_build_interp_soa_context *bld,
                          LLVMValueRef y);
 
 void
-lp_build_interp_soa_update(struct lp_build_interp_soa_context *bld,
+lp_build_interp_soa_update_inputs(struct lp_build_interp_soa_context *bld,
+                           int quad_index);
+
+void
+lp_build_interp_soa_update_pos(struct lp_build_interp_soa_context *bld,
                            int quad_index);
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_context.c b/src/gallium/drivers/llvmpipe/lp_context.c
index 39f2c6085ef..763432ed712 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.c
+++ b/src/gallium/drivers/llvmpipe/lp_context.c
@@ -82,6 +82,8 @@ static void llvmpipe_destroy( struct pipe_context *pipe )
       }
    }
 
+   lp_delete_setup_variants(llvmpipe);
+
    align_free( llvmpipe );
 }
 
@@ -108,6 +110,7 @@ llvmpipe_create_context( struct pipe_screen *screen, void *priv )
    memset(llvmpipe, 0, sizeof *llvmpipe);
 
    make_empty_list(&llvmpipe->fs_variants_list);
+   make_empty_list(&llvmpipe->setup_variants_list);
 
    llvmpipe->pipe.winsys = screen->winsys;
    llvmpipe->pipe.screen = screen;
diff --git a/src/gallium/drivers/llvmpipe/lp_context.h b/src/gallium/drivers/llvmpipe/lp_context.h
index 34fa20e204a..db09c95b272 100644
--- a/src/gallium/drivers/llvmpipe/lp_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_context.h
@@ -39,6 +39,7 @@
 #include "lp_jit.h"
 #include "lp_setup.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 
 
 struct llvmpipe_vbuf_render;
@@ -48,6 +49,7 @@ struct lp_fragment_shader;
 struct lp_vertex_shader;
 struct lp_blend_state;
 struct lp_setup_context;
+struct lp_setup_variant;
 struct lp_velems_state;
 
 struct llvmpipe_context {
@@ -105,12 +107,9 @@ struct llvmpipe_context {
    /** Which vertex shader output slot contains point size */
    int psize_slot;
 
-   /** Fragment shader input interpolation info */
-   unsigned num_inputs;
-   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
-
    /** The tiling engine */
    struct lp_setup_context *setup;
+   struct lp_setup_variant setup_variant;
 
    /** The primitive drawing context */
    struct draw_context *draw;
@@ -120,6 +119,9 @@ struct llvmpipe_context {
 
    struct lp_fs_variant_list_item fs_variants_list;
    unsigned nr_fs_variants;
+
+   struct lp_setup_variant_list_item setup_variants_list;
+   unsigned nr_setup_variants;
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_flush.h b/src/gallium/drivers/llvmpipe/lp_flush.h
index bb538b2bd83..3626ce4a86c 100644
--- a/src/gallium/drivers/llvmpipe/lp_flush.h
+++ b/src/gallium/drivers/llvmpipe/lp_flush.h
@@ -32,6 +32,7 @@
 
 struct pipe_context;
 struct pipe_fence_handle;
+struct pipe_resource;
 
 void
 llvmpipe_flush(struct pipe_context *pipe,
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.c b/src/gallium/drivers/llvmpipe/lp_jit.c
index 04b12dedccf..c540f9b3628 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.c
+++ b/src/gallium/drivers/llvmpipe/lp_jit.c
@@ -36,7 +36,6 @@
 #include <llvm-c/Transforms/Scalar.h>
 
 #include "util/u_memory.h"
-#include "util/u_cpu_detect.h"
 #include "gallivm/lp_bld_init.h"
 #include "gallivm/lp_bld_debug.h"
 #include "lp_screen.h"
@@ -162,9 +161,6 @@ lp_jit_init_globals(struct llvmpipe_screen *screen)
 void
 lp_jit_screen_cleanup(struct llvmpipe_screen *screen)
 {
-   if(screen->engine)
-      LLVMDisposeExecutionEngine(screen->engine);
-
    if(screen->pass)
       LLVMDisposePassManager(screen->pass);
 }
@@ -190,13 +186,7 @@ lp_jit_screen_init(struct llvmpipe_screen *screen)
       LLVMAddCFGSimplificationPass(screen->pass);
       LLVMAddPromoteMemoryToRegisterPass(screen->pass);
       LLVMAddConstantPropagationPass(screen->pass);
-      if(util_cpu_caps.has_sse4_1) {
-         /* FIXME: There is a bug in this pass, whereby the combination of fptosi
-          * and sitofp (necessary for trunc/floor/ceil/round implementation)
-          * somehow becomes invalid code.
-          */
-         LLVMAddInstructionCombiningPass(screen->pass);
-      }
+      LLVMAddInstructionCombiningPass(screen->pass);
       LLVMAddGVNPass(screen->pass);
    } else {
       /* We need at least this pass to prevent the backends to fail in
diff --git a/src/gallium/drivers/llvmpipe/lp_jit.h b/src/gallium/drivers/llvmpipe/lp_jit.h
index 16e04fce0cd..114f21f2d16 100644
--- a/src/gallium/drivers/llvmpipe/lp_jit.h
+++ b/src/gallium/drivers/llvmpipe/lp_jit.h
@@ -144,7 +144,7 @@ typedef void
 (*lp_jit_frag_func)(const struct lp_jit_context *context,
                     uint32_t x,
                     uint32_t y,
-                    float facing,
+                    uint32_t facing,
                     const void *a0,
                     const void *dadx,
                     const void *dady,
diff --git a/src/gallium/drivers/llvmpipe/lp_limits.h b/src/gallium/drivers/llvmpipe/lp_limits.h
index d1c431475d8..2538164ffaa 100644
--- a/src/gallium/drivers/llvmpipe/lp_limits.h
+++ b/src/gallium/drivers/llvmpipe/lp_limits.h
@@ -72,4 +72,14 @@
  */
 #define LP_MAX_SHADER_VARIANTS 1024
 
+/**
+ * Max number of setup variants that will be kept around.
+ *
+ * These are determined by the combination of the fragment shader
+ * input signature and a small amount of rasterization state (eg
+ * flatshading).  It is likely that many active fragment shaders will
+ * share the same setup variant.
+ */
+#define LP_MAX_SETUP_VARIANTS 64
+
 #endif /* LP_LIMITS_H */
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index d7e6415e139..d358a983943 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -211,8 +211,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
                        const union lp_rast_cmd_arg arg)
 {
    const struct lp_scene *scene = task->scene;
-   unsigned clear_value = arg.clear_zstencil.value;
-   unsigned clear_mask = arg.clear_zstencil.mask;
+   uint32_t clear_value = arg.clear_zstencil.value;
+   uint32_t clear_mask = arg.clear_zstencil.mask;
    const unsigned height = TILE_SIZE / TILE_VECTOR_HEIGHT;
    const unsigned width = TILE_SIZE * TILE_VECTOR_HEIGHT;
    const unsigned block_size = scene->zsbuf.blocksize;
@@ -220,7 +220,8 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
    uint8_t *dst;
    unsigned i, j;
 
-   LP_DBG(DEBUG_RAST, "%s 0x%x%x\n", __FUNCTION__, clear_value, clear_mask);
+   LP_DBG(DEBUG_RAST, "%s: value=0x%08x, mask=0x%08x\n",
+           __FUNCTION__, clear_value, clear_mask);
 
    /*
     * Clear the aera of the swizzled depth/depth buffer matching this tile, in
@@ -232,16 +233,31 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
 
    dst = task->depth_tile;
 
+   clear_value &= clear_mask;
+
    switch (block_size) {
    case 1:
+      assert(clear_mask == 0xff);
       memset(dst, (uint8_t) clear_value, height * width);
       break;
    case 2:
-      for (i = 0; i < height; i++) {
-         uint16_t *row = (uint16_t *)dst;
-         for (j = 0; j < width; j++)
-            *row++ = (uint16_t) clear_value;
-         dst += dst_stride;
+      if (clear_mask == 0xffff) {
+         for (i = 0; i < height; i++) {
+            uint16_t *row = (uint16_t *)dst;
+            for (j = 0; j < width; j++)
+               *row++ = (uint16_t) clear_value;
+            dst += dst_stride;
+         }
+      }
+      else {
+         for (i = 0; i < height; i++) {
+            uint16_t *row = (uint16_t *)dst;
+            for (j = 0; j < width; j++) {
+               uint16_t tmp = ~clear_mask & *row;
+               *row++ = clear_value | tmp;
+            }
+            dst += dst_stride;
+         }
       }
       break;
    case 4:
@@ -258,7 +274,7 @@ lp_rast_clear_zstencil(struct lp_rasterizer_task *task,
             uint32_t *row = (uint32_t *)dst;
             for (j = 0; j < width; j++) {
                uint32_t tmp = ~clear_mask & *row;
-               *row++ = (clear_value & clear_mask) | tmp;
+               *row++ = clear_value | tmp;
             }
             dst += dst_stride;
          }
@@ -318,7 +334,7 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
 {
    const struct lp_scene *scene = task->scene;
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    const unsigned tile_x = task->x, tile_y = task->y;
    unsigned x, y;
@@ -349,10 +365,10 @@ lp_rast_shade_tile(struct lp_rasterizer_task *task,
          BEGIN_JIT_CALL(state);
          variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                             tile_x + x, tile_y + y,
-                                            inputs->facing,
-                                            inputs->a0,
-                                            inputs->dadx,
-                                            inputs->dady,
+                                            inputs->frontfacing,
+                                            GET_A0(inputs),
+                                            GET_DADX(inputs),
+                                            GET_DADY(inputs),
                                             color,
                                             depth,
                                             0xffff,
@@ -398,7 +414,7 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
                          unsigned x, unsigned y,
                          unsigned mask)
 {
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    const struct lp_scene *scene = task->scene;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
@@ -430,10 +446,10 @@ lp_rast_shade_quads_mask(struct lp_rasterizer_task *task,
    BEGIN_JIT_CALL(state);
    variant->jit_function[RAST_EDGE_TEST](&state->jit_context,
                                          x, y,
-                                         inputs->facing,
-                                         inputs->a0,
-                                         inputs->dadx,
-                                         inputs->dady,
+                                         inputs->frontfacing,
+                                         GET_A0(inputs),
+                                         GET_DADX(inputs),
+                                         GET_DADY(inputs),
                                          color,
                                          depth,
                                          mask,
@@ -474,6 +490,14 @@ lp_rast_end_query(struct lp_rasterizer_task *task,
 }
 
 
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg)
+{
+   task->state = arg.state;
+}
+
+
 
 /**
  * Set top row and left column of the tile's pixels to white.  For debugging.
@@ -581,10 +605,12 @@ static lp_rast_cmd_func dispatch[LP_RAST_OP_MAX] =
    lp_rast_triangle_8,
    lp_rast_triangle_3_4,
    lp_rast_triangle_3_16,
+   lp_rast_triangle_4_16,
    lp_rast_shade_tile,
    lp_rast_shade_tile_opaque,
    lp_rast_begin_query,
    lp_rast_end_query,
+   lp_rast_set_state,
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.h b/src/gallium/drivers/llvmpipe/lp_rast.h
index c55b97a9d13..a64c152cf83 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast.h
@@ -78,30 +78,28 @@ struct lp_rast_state {
  * These pointers point into the bin data buffer.
  */
 struct lp_rast_shader_inputs {
-   float facing;     /** Positive for front-facing, negative for back-facing */
-   unsigned disable:1;  /** Partially binned, disable this command */
-   unsigned opaque:1;   /** Is opaque */
-
-   float (*a0)[4];
-   float (*dadx)[4];
-   float (*dady)[4];
-
-   const struct lp_rast_state *state;
+   unsigned frontfacing:1;      /** True for front-facing */
+   unsigned disable:1;          /** Partially binned, disable this command */
+   unsigned opaque:1;           /** Is opaque */
+   unsigned pad0:29;            /* wasted space */
+   unsigned stride;             /* how much to advance data between a0, dadx, dady */
+   unsigned pad2;               /* wasted space */
+   unsigned pad3;               /* wasted space */
+   /* followed by a0, dadx, dady and planes[] */
 };
 
-
+/* Note: the order of these values is important as they are loaded by
+ * sse code in rasterization:
+ */
 struct lp_rast_plane {
-   /* one-pixel sized trivial accept offsets for each plane */
-   int ei;
-
-   /* one-pixel sized trivial reject offsets for each plane */
-   int eo;
-
    /* edge function values at minx,miny ?? */
    int c;
 
    int dcdx;
    int dcdy;
+
+   /* one-pixel sized trivial reject offsets for each plane */
+   int eo;
 };
 
 /**
@@ -111,17 +109,24 @@ struct lp_rast_plane {
  * Objects of this type are put into the lp_setup_context::data buffer.
  */
 struct lp_rast_triangle {
-   /* inputs for the shader */
-   struct lp_rast_shader_inputs inputs;
-
 #ifdef DEBUG
    float v[3][2];
+   float pad0;
+   float pad1;
 #endif
 
-   struct lp_rast_plane plane[8]; /* NOTE: may allocate fewer planes */
+   /* inputs for the shader */
+   struct lp_rast_shader_inputs inputs;
+   /* planes are also allocated here */
 };
 
 
+#define GET_A0(inputs) ((float (*)[4])((inputs)+1))
+#define GET_DADX(inputs) ((float (*)[4])((char *)((inputs) + 1) + (inputs)->stride))
+#define GET_DADY(inputs) ((float (*)[4])((char *)((inputs) + 1) + 2 * (inputs)->stride))
+#define GET_PLANES(tri) ((struct lp_rast_plane *)((char *)(&(tri)->inputs + 1) + 3 * (tri)->inputs.stride))
+
+
 
 struct lp_rasterizer *
 lp_rast_create( unsigned num_threads );
@@ -149,9 +154,10 @@ union lp_rast_cmd_arg {
    const struct lp_rast_state *set_state;
    uint8_t clear_color[4];
    struct {
-      unsigned value;
-      unsigned mask;
+      uint32_t value;
+      uint32_t mask;
    } clear_zstencil;
+   const struct lp_rast_state *state;
    struct lp_fence *fence;
    struct llvmpipe_query *query_obj;
 };
@@ -238,12 +244,14 @@ lp_rast_arg_null( void )
 #define LP_RAST_OP_TRIANGLE_8        0x9
 #define LP_RAST_OP_TRIANGLE_3_4      0xa
 #define LP_RAST_OP_TRIANGLE_3_16     0xb
-#define LP_RAST_OP_SHADE_TILE        0xc
-#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xd
-#define LP_RAST_OP_BEGIN_QUERY       0xe
-#define LP_RAST_OP_END_QUERY         0xf
-
-#define LP_RAST_OP_MAX               0x10
+#define LP_RAST_OP_TRIANGLE_4_16     0xc
+#define LP_RAST_OP_SHADE_TILE        0xd
+#define LP_RAST_OP_SHADE_TILE_OPAQUE 0xe
+#define LP_RAST_OP_BEGIN_QUERY       0xf
+#define LP_RAST_OP_END_QUERY         0x10
+#define LP_RAST_OP_SET_STATE         0x11
+
+#define LP_RAST_OP_MAX               0x12
 #define LP_RAST_OP_MASK              0xff
 
 void
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_debug.c b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
index 9fc78645a3a..64ac616f629 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_debug.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_debug.c
@@ -12,6 +12,7 @@ static INLINE int u_bit_scan(unsigned *mask)
 struct tile {
    int coverage;
    int overdraw;
+   const struct lp_rast_state *state;
    char data[TILE_SIZE][TILE_SIZE];
 };
 
@@ -42,10 +43,12 @@ static const char *cmd_names[LP_RAST_OP_MAX] =
    "triangle_8",
    "triangle_3_4",
    "triangle_3_16",
+   "triangle_4_16",
    "shade_tile",
    "shade_tile_opaque",
    "begin_query",
    "end_query",
+   "set_state",
 };
 
 static const char *cmd_name(unsigned cmd)
@@ -55,31 +58,31 @@ static const char *cmd_name(unsigned cmd)
 }
 
 static const struct lp_fragment_shader_variant *
-get_variant(  const struct cmd_block *block,
-              int k )
+get_variant( const struct lp_rast_state *state,
+             const struct cmd_block *block,
+             int k )
 {
    if (block->cmd[k] == LP_RAST_OP_SHADE_TILE ||
-       block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE)
-      return  block->arg[k].shade_tile->state->variant;
-
-   if (block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
+       block->cmd[k] == LP_RAST_OP_SHADE_TILE_OPAQUE ||
+       block->cmd[k] == LP_RAST_OP_TRIANGLE_1 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_2 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_3 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_4 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_5 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_6 ||
        block->cmd[k] == LP_RAST_OP_TRIANGLE_7)
-      return block->arg[k].triangle.tri->inputs.state->variant;
+      return state->variant;
 
    return NULL;
 }
 
 
 static boolean
-is_blend( const struct cmd_block *block,
+is_blend( const struct lp_rast_state *state,
+          const struct cmd_block *block,
           int k )
 {
-   const struct lp_fragment_shader_variant *variant = get_variant(block, k);
+   const struct lp_fragment_shader_variant *variant = get_variant(state, block, k);
 
    if (variant)
       return  variant->key.blend.rt[0].blend_enable;
@@ -92,6 +95,7 @@ is_blend( const struct cmd_block *block,
 static void
 debug_bin( const struct cmd_bin *bin )
 {
+   const struct lp_rast_state *state = NULL;
    const struct cmd_block *head = bin->head;
    int i, j = 0;
 
@@ -99,9 +103,12 @@ debug_bin( const struct cmd_bin *bin )
                 
    while (head) {
       for (i = 0; i < head->count; i++, j++) {
+         if (head->cmd[i] == LP_RAST_OP_SET_STATE)
+            state = head->arg[i].state;
+
          debug_printf("%d: %s %s\n", j,
                       cmd_name(head->cmd[i]),
-                      is_blend(head, i) ? "blended" : "");
+                      is_blend(state, head, i) ? "blended" : "");
       }
       head = head->next;
    }
@@ -133,7 +140,7 @@ debug_shade_tile(int x, int y,
                  char val)
 {
    const struct lp_rast_shader_inputs *inputs = arg.shade_tile;
-   boolean blend = inputs->state->variant->key.blend.rt[0].blend_enable;
+   boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
    unsigned i,j;
 
    if (inputs->disable)
@@ -171,11 +178,12 @@ debug_triangle(int tilex, int tiley,
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    unsigned plane_mask = arg.triangle.plane_mask;
+   const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
    struct lp_rast_plane plane[8];
    int x, y;
    int count = 0;
    unsigned i, nr_planes = 0;
-   boolean blend = tri->inputs.state->variant->key.blend.rt[0].blend_enable;
+   boolean blend = tile->state->variant->key.blend.rt[0].blend_enable;
 
    if (tri->inputs.disable) {
       /* This triangle was partially binned and has been disabled */
@@ -183,7 +191,7 @@ debug_triangle(int tilex, int tiley,
    }
 
    while (plane_mask) {
-      plane[nr_planes] = tri->plane[u_bit_scan(&plane_mask)];
+      plane[nr_planes] = tri_plane[u_bit_scan(&plane_mask)];
       plane[nr_planes].c = (plane[nr_planes].c +
                             plane[nr_planes].dcdy * tiley -
                             plane[nr_planes].dcdx * tilex);
@@ -232,15 +240,19 @@ do_debug_bin( struct tile *tile,
    memset(tile->data, ' ', sizeof tile->data);
    tile->coverage = 0;
    tile->overdraw = 0;
+   tile->state = NULL;
 
    for (block = bin->head; block; block = block->next) {
       for (k = 0; k < block->count; k++, j++) {
-         boolean blend = is_blend(block, k);
+         boolean blend = is_blend(tile->state, block, k);
          char val = get_label(j);
          int count = 0;
             
          if (print_cmds)
             debug_printf("%c: %15s", val, cmd_name(block->cmd[k]));
+
+         if (block->cmd[k] == LP_RAST_OP_SET_STATE)
+            tile->state = block->arg[k].state;
          
          if (block->cmd[k] == LP_RAST_OP_CLEAR_COLOR ||
              block->cmd[k] == LP_RAST_OP_CLEAR_ZSTENCIL)
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_priv.h b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
index 7370119e966..b30408f097b 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_priv.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_priv.h
@@ -77,6 +77,7 @@ struct cmd_bin;
 struct lp_rasterizer_task
 {
    const struct cmd_bin *bin;
+   const struct lp_rast_state *state;
 
    struct lp_scene *scene;
    unsigned x, y;          /**< Pos of this tile in framebuffer, in pixels */
@@ -244,7 +245,7 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
                          unsigned x, unsigned y )
 {
    const struct lp_scene *scene = task->scene;
-   const struct lp_rast_state *state = inputs->state;
+   const struct lp_rast_state *state = task->state;
    struct lp_fragment_shader_variant *variant = state->variant;
    uint8_t *color[PIPE_MAX_COLOR_BUFS];
    void *depth;
@@ -260,10 +261,10 @@ lp_rast_shade_quads_all( struct lp_rasterizer_task *task,
    BEGIN_JIT_CALL(state);
    variant->jit_function[RAST_WHOLE]( &state->jit_context,
                                       x, y,
-                                      inputs->facing,
-                                      inputs->a0,
-                                      inputs->dadx,
-                                      inputs->dady,
+                                      inputs->frontfacing,
+                                      GET_A0(inputs),
+                                      GET_DADX(inputs),
+                                      GET_DADY(inputs),
                                       color,
                                       depth,
                                       0xffff,
@@ -293,6 +294,14 @@ void lp_rast_triangle_3_4(struct lp_rasterizer_task *,
 
 void lp_rast_triangle_3_16( struct lp_rasterizer_task *, 
                             const union lp_rast_cmd_arg );
+
+void lp_rast_triangle_4_16( struct lp_rasterizer_task *, 
+                            const union lp_rast_cmd_arg );
+
+void
+lp_rast_set_state(struct lp_rasterizer_task *task,
+                  const union lp_rast_cmd_arg arg);
+ 
 void
 lp_debug_bin( const struct cmd_bin *bin );
 
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri.c b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
index a1f309d4b01..042c315635e 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri.c
@@ -123,6 +123,16 @@ lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
 }
 
 void
+lp_rast_triangle_4_16(struct lp_rasterizer_task *task,
+                      const union lp_rast_cmd_arg arg)
+{
+   union lp_rast_cmd_arg arg2;
+   arg2.triangle.tri = arg.triangle.tri;
+   arg2.triangle.plane_mask = (1<<4)-1;
+   lp_rast_triangle_3(task, arg2);
+}
+
+void
 lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
@@ -230,144 +240,207 @@ sign_bits4(const __m128i *cstep, int cdiff)
 }
 
 
-/* Special case for 3 plane triangle which is contained entirely
- * within a 16x16 block.
- */
+#define NR_PLANES 3
+
+
+
+
+
+
+
 void
 lp_rast_triangle_3_16(struct lp_rasterizer_task *task,
                       const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
-   unsigned mask = arg.triangle.plane_mask;
-   const int x = task->x + (mask & 0xff);
-   const int y = task->y + (mask >> 8);
-   unsigned outmask, inmask, partmask, partial_mask;
-   unsigned j;
-   __m128i cstep4[3][4];
-
-   outmask = 0;                 /* outside one or more trivial reject planes */
-   partmask = 0;                /* outside one or more trivial accept planes */
-
-   for (j = 0; j < 3; j++) {
-      const int dcdx = -plane[j].dcdx * 4;
-      const int dcdy = plane[j].dcdy * 4;
-      __m128i xdcdy = _mm_set1_epi32(dcdy);
-
-      cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
-      cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
-      cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
-      cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
-
-      {
-	 const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
-	 const int cox = plane[j].eo * 4;
-	 const int cio = plane[j].ei * 4 - 1;
-
-	 outmask |= sign_bits4(cstep4[j], c + cox);
-	 partmask |= sign_bits4(cstep4[j], c + cio);
-      }
-   }
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+   unsigned i, j;
+
+   struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16];
+   unsigned nr = 0;
+
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+   __m128i rej4;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+   
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+   
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &rej4);
+
+   /* Adjust dcdx;
+    */
+   dcdx = _mm_sub_epi32(zero, dcdx);
 
-   if (outmask == 0xffff)
-      return;
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+   rej4 = _mm_slli_epi32(rej4, 2);
 
-   /* Mask of sub-blocks which are inside all trivial accept planes:
-    */
-   inmask = ~partmask & 0xffff;
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
 
-   /* Mask of sub-blocks which are inside all trivial reject planes,
-    * but outside at least one trivial accept plane:
-    */
-   partial_mask = partmask & ~outmask;
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
 
-   assert((partial_mask & inmask) == 0);
+   for (i = 0; i < 4; i++) {
+      __m128i cx = c;
 
-   /* Iterate over partials:
-    */
-   while (partial_mask) {
-      int i = ffs(partial_mask) - 1;
-      int ix = (i & 3) * 4;
-      int iy = (i >> 2) * 4;
-      int px = x + ix;
-      int py = y + iy; 
-      unsigned mask = 0xffff;
-
-      partial_mask &= ~(1 << i);
-
-      for (j = 0; j < 3; j++) {
-         const int cx = (plane[j].c 
-			 - plane[j].dcdx * px
-			 + plane[j].dcdy * py) * 4;
-
-	 mask &= ~sign_bits4(cstep4[j], cx);
-      }
+      for (j = 0; j < 4; j++) {
+         __m128i c4rej = _mm_add_epi32(cx, rej4);
+         __m128i rej_masks = _mm_srai_epi32(c4rej, 31);
 
-      if (mask)
-	 lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
-   }
+         /* if (is_zero(rej_masks)) */
+         if (_mm_movemask_epi8(rej_masks) == 0) {
+            __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0);
+            __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1);
+            __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2);
 
-   /* Iterate over fulls: 
-    */
-   while (inmask) {
-      int i = ffs(inmask) - 1;
-      int ix = (i & 3) * 4;
-      int iy = (i >> 2) * 4;
-      int px = x + ix;
-      int py = y + iy; 
+            __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
+
+            __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+            __m128i c_01 = _mm_packs_epi32(c_0, c_1);
+
+            __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
+
+            __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
 
-      inmask &= ~(1 << i);
+            __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+            __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+            __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
 
-      block_full_4(task, tri, px, py);
+            __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+            __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+            __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+            unsigned mask = _mm_movemask_epi8(c_0123);
+
+            out[nr].i = i;
+            out[nr].j = j;
+            out[nr].mask = mask;
+            if (mask != 0xffff)
+               nr++;
+         }
+         cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2));
+      }
+
+      c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2));
    }
+
+   for (i = 0; i < nr; i++)
+      lp_rast_shade_quads_mask(task,
+                               &tri->inputs,
+                               x + 4 * out[i].j,
+                               y + 4 * out[i].i,
+                               0xffff & ~out[i].mask);
 }
 
 
+
+
+
 void
 lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
-		     const union lp_rast_cmd_arg arg)
+                     const union lp_rast_cmd_arg arg)
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
-   const struct lp_rast_plane *plane = tri->plane;
-   unsigned mask = arg.triangle.plane_mask;
-   const int x = task->x + (mask & 0xff);
-   const int y = task->y + (mask >> 8);
-   unsigned j;
-
-   /* Iterate over partials:
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   int x = (arg.triangle.plane_mask & 0xff) + task->x;
+   int y = (arg.triangle.plane_mask >> 8) + task->y;
+
+   __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */
+   __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */
+   __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */
+   __m128i zero = _mm_setzero_si128();
+
+   __m128i c;
+   __m128i dcdx;
+   __m128i dcdy;
+
+   __m128i dcdx2;
+   __m128i dcdx3;
+   
+   __m128i span_0;                /* 0,dcdx,2dcdx,3dcdx for plane 0 */
+   __m128i span_1;                /* 0,dcdx,2dcdx,3dcdx for plane 1 */
+   __m128i span_2;                /* 0,dcdx,2dcdx,3dcdx for plane 2 */
+   __m128i unused;
+   
+   transpose4_epi32(&p0, &p1, &p2, &zero,
+                    &c, &dcdx, &dcdy, &unused);
+
+   /* Adjust dcdx;
     */
+   dcdx = _mm_sub_epi32(zero, dcdx);
+
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x)));
+   c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y)));
+
+   dcdx2 = _mm_add_epi32(dcdx, dcdx);
+   dcdx3 = _mm_add_epi32(dcdx2, dcdx);
+
+   transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3,
+                    &span_0, &span_1, &span_2, &unused);
+
+
    {
-      unsigned mask = 0xffff;
+      __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(c, 0), span_0);
+      __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(c, 1), span_1);
+      __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(c, 2), span_2);
+      
+      __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0);
 
-      for (j = 0; j < 3; j++) {
-	 const int cx = (plane[j].c 
-			 - plane[j].dcdx * x
-			 + plane[j].dcdy * y);
+      __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2));
 
-	 const int dcdx = -plane[j].dcdx;
-	 const int dcdy = plane[j].dcdy;
-	 __m128i xdcdy = _mm_set1_epi32(dcdy);
+      __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1);
+      __m128i c_01 = _mm_packs_epi32(c_0, c_1);
 
-	 __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
-	 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
-	 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
-	 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+      __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2));
 
-	 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
-	 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
-	 __m128i result = _mm_packs_epi16(cstep01, cstep23);
+      __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2);
 
-	 /* Extract the sign bits
-	  */
-	 mask &= ~_mm_movemask_epi8(result);
-      }
+      __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0));
+      __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1));
+      __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2));
 
-      if (mask)
-	 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
+      __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3);
+      __m128i c_23 = _mm_packs_epi32(c_2, c_3);
+      __m128i c_0123 = _mm_packs_epi16(c_01, c_23);
+
+      unsigned mask = _mm_movemask_epi8(c_0123);
+
+      if (mask != 0xffff)
+         lp_rast_shade_quads_mask(task,
+                                  &tri->inputs,
+                                  x,
+                                  y,
+                                  0xffff & ~mask);
    }
 }
 
-
+#undef NR_PLANES
 #endif
 
 
@@ -383,10 +456,13 @@ lp_rast_triangle_3_4(struct lp_rasterizer_task *task,
 
 #define TAG(x) x##_3
 #define NR_PLANES 3
+/*#define TRI_4 lp_rast_triangle_3_4*/
+/*#define TRI_16 lp_rast_triangle_3_16*/
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_4
 #define NR_PLANES 4
+#define TRI_16 lp_rast_triangle_4_16
 #include "lp_rast_tri_tmp.h"
 
 #define TAG(x) x##_5
diff --git a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
index 9830a43ba55..4825d651c04 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
+++ b/src/gallium/drivers/llvmpipe/lp_rast_tri_tmp.h
@@ -82,7 +82,8 @@ TAG(do_block_16)(struct lp_rasterizer_task *task,
       const int dcdx = -plane[j].dcdx * 4;
       const int dcdy = plane[j].dcdy * 4;
       const int cox = plane[j].eo * 4;
-      const int cio = plane[j].ei * 4 - 1;
+      const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+      const int cio = ei * 4 - 1;
 
       build_masks(c[j] + cox,
 		  cio - cox,
@@ -156,6 +157,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 {
    const struct lp_rast_triangle *tri = arg.triangle.tri;
    unsigned plane_mask = arg.triangle.plane_mask;
+   const struct lp_rast_plane *tri_plane = GET_PLANES(tri);
    const int x = task->x, y = task->y;
    struct lp_rast_plane plane[NR_PLANES];
    int c[NR_PLANES];
@@ -172,7 +174,7 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 
    while (plane_mask) {
       int i = ffs(plane_mask) - 1;
-      plane[j] = tri->plane[i];
+      plane[j] = tri_plane[i];
       plane_mask &= ~(1 << i);
       c[j] = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
 
@@ -180,7 +182,8 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
 	 const int dcdx = -plane[j].dcdx * 16;
 	 const int dcdy = plane[j].dcdy * 16;
 	 const int cox = plane[j].eo * 16;
-	 const int cio = plane[j].ei * 16 - 1;
+         const int ei = plane[j].dcdy - plane[j].dcdx - plane[j].eo;
+         const int cio = ei * 16 - 1;
 
 	 build_masks(c[j] + cox,
 		     cio - cox,
@@ -245,6 +248,133 @@ TAG(lp_rast_triangle)(struct lp_rasterizer_task *task,
    }
 }
 
+#if defined(PIPE_ARCH_SSE) && defined(TRI_16)
+/* XXX: special case this when intersection is not required.
+ *      - tile completely within bbox,
+ *      - bbox completely within tile.
+ */
+void
+TRI_16(struct lp_rasterizer_task *task,
+       const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   unsigned mask = arg.triangle.plane_mask;
+   unsigned outmask, partial_mask;
+   unsigned j;
+   __m128i cstep4[NR_PLANES][4];
+
+   int x = (mask & 0xff);
+   int y = (mask >> 8);
+
+   outmask = 0;                 /* outside one or more trivial reject planes */
+   
+   x += task->x;
+   y += task->y;
+
+   for (j = 0; j < NR_PLANES; j++) {
+      const int dcdx = -plane[j].dcdx * 4;
+      const int dcdy = plane[j].dcdy * 4;
+      __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+      cstep4[j][0] = _mm_setr_epi32(0, dcdx, dcdx*2, dcdx*3);
+      cstep4[j][1] = _mm_add_epi32(cstep4[j][0], xdcdy);
+      cstep4[j][2] = _mm_add_epi32(cstep4[j][1], xdcdy);
+      cstep4[j][3] = _mm_add_epi32(cstep4[j][2], xdcdy);
+
+      {
+	 const int c = plane[j].c + plane[j].dcdy * y - plane[j].dcdx * x;
+	 const int cox = plane[j].eo * 4;
+
+	 outmask |= sign_bits4(cstep4[j], c + cox);
+      }
+   }
+
+   if (outmask == 0xffff)
+      return;
+
+
+   /* Mask of sub-blocks which are inside all trivial reject planes,
+    * but outside at least one trivial accept plane:
+    */
+   partial_mask = 0xffff & ~outmask;
+
+   /* Iterate over partials:
+    */
+   while (partial_mask) {
+      int i = ffs(partial_mask) - 1;
+      int ix = (i & 3) * 4;
+      int iy = (i >> 2) * 4;
+      int px = x + ix;
+      int py = y + iy; 
+      unsigned mask = 0xffff;
+
+      partial_mask &= ~(1 << i);
+
+      for (j = 0; j < NR_PLANES; j++) {
+         const int cx = (plane[j].c - 1
+			 - plane[j].dcdx * px
+			 + plane[j].dcdy * py) * 4;
+
+	 mask &= ~sign_bits4(cstep4[j], cx);
+      }
+
+      if (mask)
+	 lp_rast_shade_quads_mask(task, &tri->inputs, px, py, mask);
+   }
+}
+#endif
+
+#if defined(PIPE_ARCH_SSE) && defined(TRI_4)
+void
+TRI_4(struct lp_rasterizer_task *task,
+      const union lp_rast_cmd_arg arg)
+{
+   const struct lp_rast_triangle *tri = arg.triangle.tri;
+   const struct lp_rast_plane *plane = GET_PLANES(tri);
+   unsigned mask = arg.triangle.plane_mask;
+   const int x = task->x + (mask & 0xff);
+   const int y = task->y + (mask >> 8);
+   unsigned j;
+
+   /* Iterate over partials:
+    */
+   {
+      unsigned mask = 0xffff;
+
+      for (j = 0; j < NR_PLANES; j++) {
+	 const int cx = (plane[j].c 
+			 - plane[j].dcdx * x
+			 + plane[j].dcdy * y);
+
+	 const int dcdx = -plane[j].dcdx;
+	 const int dcdy = plane[j].dcdy;
+	 __m128i xdcdy = _mm_set1_epi32(dcdy);
+
+	 __m128i cstep0 = _mm_setr_epi32(cx, cx + dcdx, cx + dcdx*2, cx + dcdx*3);
+	 __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
+	 __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
+	 __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);
+
+	 __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
+	 __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);
+	 __m128i result = _mm_packs_epi16(cstep01, cstep23);
+
+	 /* Extract the sign bits
+	  */
+	 mask &= ~_mm_movemask_epi8(result);
+      }
+
+      if (mask)
+	 lp_rast_shade_quads_mask(task, &tri->inputs, x, y, mask);
+   }
+}
+#endif
+
+
+
 #undef TAG
+#undef TRI_4
+#undef TRI_16
 #undef NR_PLANES
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index 8b504f23a33..a4fdf7cff36 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -203,7 +203,9 @@ lp_scene_end_rasterization(struct lp_scene *scene )
    for (i = 0; i < scene->tiles_x; i++) {
       for (j = 0; j < scene->tiles_y; j++) {
          struct cmd_bin *bin = lp_scene_get_bin(scene, i, j);
-         bin->head = bin->tail = NULL;
+         bin->head = NULL;
+         bin->tail = NULL;
+         bin->last_state = NULL;
       }
    }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index dbef7692e42..622c522f11a 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -41,6 +41,7 @@
 #include "lp_debug.h"
 
 struct lp_scene_queue;
+struct lp_rast_state;
 
 /* We're limited to 2K by 2K for 32bit fixed point rasterization.
  * Will need a 64-bit version for larger framebuffers.
@@ -94,6 +95,7 @@ struct data_block {
 struct cmd_bin {
    ushort x;
    ushort y;
+   const struct lp_rast_state *last_state;       /* most recent state set in bin */
    struct cmd_block *head;
    struct cmd_block *tail;
 };
@@ -297,7 +299,7 @@ lp_scene_bin_command( struct lp_scene *scene,
 
    assert(x < scene->tiles_x);
    assert(y < scene->tiles_y);
-   assert(cmd <= LP_RAST_OP_END_QUERY);
+   assert(cmd < LP_RAST_OP_MAX);
 
    if (tail == NULL || tail->count == CMD_BLOCK_MAX) {
       tail = lp_scene_new_cmd_block( scene, bin );
@@ -318,6 +320,30 @@ lp_scene_bin_command( struct lp_scene *scene,
 }
 
 
+static INLINE boolean
+lp_scene_bin_cmd_with_state( struct lp_scene *scene,
+                             unsigned x, unsigned y,
+                             const struct lp_rast_state *state,
+                             unsigned cmd,
+                             union lp_rast_cmd_arg arg )
+{
+   struct cmd_bin *bin = lp_scene_get_bin(scene, x, y);
+
+   if (state != bin->last_state) {
+      bin->last_state = state;
+      if (!lp_scene_bin_command(scene, x, y,
+                                LP_RAST_OP_SET_STATE,
+                                lp_rast_arg_state(state)))
+         return FALSE;
+   }
+
+   if (!lp_scene_bin_command( scene, x, y, cmd, arg ))
+      return FALSE;
+
+   return TRUE;
+}
+
+
 /* Add a command to all active bins.
  */
 static INLINE boolean
diff --git a/src/gallium/drivers/llvmpipe/lp_screen.c b/src/gallium/drivers/llvmpipe/lp_screen.c
index 96633d93654..ad0ea75b3a3 100644
--- a/src/gallium/drivers/llvmpipe/lp_screen.c
+++ b/src/gallium/drivers/llvmpipe/lp_screen.c
@@ -158,6 +158,8 @@ llvmpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
    case PIPE_CAP_TGSI_FS_COORD_ORIGIN_LOWER_LEFT:
    case PIPE_CAP_TGSI_FS_COORD_PIXEL_CENTER_HALF_INTEGER:
       return 0;
+   case PIPE_CAP_PRIMITIVE_RESTART:
+      return 1;
    case PIPE_CAP_DEPTHSTENCIL_CLEAR_SEPARATE:
       return 1;
    case PIPE_CAP_DEPTH_CLAMP:
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.c b/src/gallium/drivers/llvmpipe/lp_setup.c
index 5ff11a33632..6118434d3d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup.c
@@ -56,7 +56,7 @@
 #include "draw/draw_vbuf.h"
 
 
-static void set_scene_state( struct lp_setup_context *, enum setup_state,
+static boolean set_scene_state( struct lp_setup_context *, enum setup_state,
                              const char *reason);
 static boolean try_update_scene_state( struct lp_setup_context *setup );
 
@@ -167,7 +167,7 @@ lp_setup_rasterize_scene( struct lp_setup_context *setup )
 
 
 
-static void
+static boolean
 begin_binning( struct lp_setup_context *setup )
 {
    struct lp_scene *scene = setup->scene;
@@ -181,6 +181,8 @@ begin_binning( struct lp_setup_context *setup )
    /* Always create a fence:
     */
    scene->fence = lp_fence_create(MAX2(1, setup->num_threads));
+   if (!scene->fence)
+      return FALSE;
 
    /* Initialize the bin flags and x/y coords:
     */
@@ -192,7 +194,8 @@ begin_binning( struct lp_setup_context *setup )
    }
 
    ok = try_update_scene_state(setup);
-   assert(ok);
+   if (!ok)
+      return FALSE;
 
    if (setup->fb.zsbuf &&
        ((setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL) &&
@@ -208,7 +211,8 @@ begin_binning( struct lp_setup_context *setup )
          ok = lp_scene_bin_everywhere( scene, 
                                        LP_RAST_OP_CLEAR_COLOR, 
                                        setup->clear.color );
-         assert(ok);
+         if (!ok)
+            return FALSE;
       }
    }
 
@@ -216,12 +220,14 @@ begin_binning( struct lp_setup_context *setup )
       if (setup->clear.flags & PIPE_CLEAR_DEPTHSTENCIL) {
          if (!need_zsload)
             scene->has_depthstencil_clear = TRUE;
+
          ok = lp_scene_bin_everywhere( scene,
                                        LP_RAST_OP_CLEAR_ZSTENCIL,
                                        lp_rast_arg_clearzs(
                                           setup->clear.zsvalue,
                                           setup->clear.zsmask));
-         assert(ok);
+         if (!ok)
+            return FALSE;
       }
    }
 
@@ -229,15 +235,16 @@ begin_binning( struct lp_setup_context *setup )
       ok = lp_scene_bin_everywhere( scene,
                                     LP_RAST_OP_BEGIN_QUERY,
                                     lp_rast_arg_query(setup->active_query) );
-      assert(ok);
+      if (!ok)
+         return FALSE;
    }
-      
 
    setup->clear.flags = 0;
    setup->clear.zsmask = 0;
    setup->clear.zsvalue = 0;
 
    LP_DBG(DEBUG_SETUP, "%s done\n", __FUNCTION__);
+   return TRUE;
 }
 
 
@@ -246,12 +253,12 @@ begin_binning( struct lp_setup_context *setup )
  *
  * TODO: fast path for fullscreen clears and no triangles.
  */
-static void
+static boolean
 execute_clears( struct lp_setup_context *setup )
 {
    LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
 
-   begin_binning( setup );
+   return begin_binning( setup );
 }
 
 const char *states[] = {
@@ -262,7 +269,7 @@ const char *states[] = {
 };
 
 
-static void
+static boolean
 set_scene_state( struct lp_setup_context *setup,
                  enum setup_state new_state,
                  const char *reason)
@@ -270,7 +277,7 @@ set_scene_state( struct lp_setup_context *setup,
    unsigned old_state = setup->state;
 
    if (old_state == new_state)
-      return;
+      return TRUE;
    
    if (LP_DEBUG & DEBUG_SCENE) {
       debug_printf("%s old %s new %s%s%s\n",
@@ -294,12 +301,14 @@ set_scene_state( struct lp_setup_context *setup,
       break;
 
    case SETUP_ACTIVE:
-      begin_binning( setup );
+      if (!begin_binning( setup ))
+         goto fail;
       break;
 
    case SETUP_FLUSHED:
       if (old_state == SETUP_CLEARED)
-         execute_clears( setup );
+         if (!execute_clears( setup ))
+            goto fail;
 
       lp_setup_rasterize_scene( setup );
       assert(setup->scene == NULL);
@@ -307,9 +316,21 @@ set_scene_state( struct lp_setup_context *setup,
 
    default:
       assert(0 && "invalid setup state mode");
+      goto fail;
    }
 
    setup->state = new_state;
+   return TRUE;
+
+fail:
+   if (setup->scene) {
+      lp_scene_end_rasterization(setup->scene);
+      setup->scene = NULL;
+   }
+
+   setup->state = SETUP_FLUSHED;
+   lp_setup_reset( setup );
+   return FALSE;
 }
 
 
@@ -377,16 +398,19 @@ lp_setup_try_clear( struct lp_setup_context *setup,
    }
 
    if (flags & PIPE_CLEAR_DEPTHSTENCIL) {
-      unsigned zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
-      unsigned smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
+      uint32_t zmask = (flags & PIPE_CLEAR_DEPTH) ? ~0 : 0;
+      uint32_t smask = (flags & PIPE_CLEAR_STENCIL) ? ~0 : 0;
 
       zsvalue = util_pack_z_stencil(setup->fb.zsbuf->format,
                                     depth,
                                     stencil);
 
-      zsmask = util_pack_uint_z_stencil(setup->fb.zsbuf->format,
+
+      zsmask = util_pack_mask_z_stencil(setup->fb.zsbuf->format,
                                         zmask,
                                         smask);
+
+      zsvalue &= zsmask;
    }
 
    if (setup->state == SETUP_ACTIVE) {
@@ -431,7 +455,7 @@ lp_setup_try_clear( struct lp_setup_context *setup,
       if (flags & PIPE_CLEAR_COLOR) {
          memcpy(setup->clear.color.clear_color,
                 &color_arg,
-                sizeof color_arg);
+                sizeof setup->clear.color.clear_color);
       }
    }
    
@@ -502,14 +526,12 @@ lp_setup_set_point_state( struct lp_setup_context *setup,
 }
 
 void
-lp_setup_set_fs_inputs( struct lp_setup_context *setup,
-                        const struct lp_shader_input *input,
-                        unsigned nr )
+lp_setup_set_setup_variant( struct lp_setup_context *setup,
+			    const struct lp_setup_variant *variant)
 {
-   LP_DBG(DEBUG_SETUP, "%s %p %u\n", __FUNCTION__, (void *) input, nr);
-
-   memcpy( setup->fs.input, input, nr * sizeof input[0] );
-   setup->fs.nr_inputs = nr;
+   LP_DBG(DEBUG_SETUP, "%s\n", __FUNCTION__);
+   
+   setup->setup.variant = variant;
 }
 
 void
@@ -875,7 +897,7 @@ try_update_scene_state( struct lp_setup_context *setup )
    return TRUE;
 }
 
-void
+boolean
 lp_setup_update_state( struct lp_setup_context *setup,
                        boolean update_scene )
 {
@@ -897,22 +919,47 @@ lp_setup_update_state( struct lp_setup_context *setup,
       setup->psize = lp->psize_slot;
 
       assert(lp->dirty == 0);
+
+      assert(lp->setup_variant.key.size == 
+	     setup->setup.variant->key.size);
+
+      assert(memcmp(&lp->setup_variant.key,
+		    &setup->setup.variant->key,
+		    setup->setup.variant->key.size) == 0);
    }
 
-   if (update_scene)
-      set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ );
+   if (update_scene) {
+      if (!set_scene_state( setup, SETUP_ACTIVE, __FUNCTION__ ))
+         return FALSE;
+   }
 
    /* Only call into update_scene_state() if we already have a
     * scene:
     */
    if (update_scene && setup->scene) {
       assert(setup->state == SETUP_ACTIVE);
-      if (!try_update_scene_state(setup)) {
-         lp_setup_flush_and_restart(setup);
-         if (!try_update_scene_state(setup))
-            assert(0);
-      }
+
+      if (try_update_scene_state(setup))
+         return TRUE;
+
+      /* Update failed, try to restart the scene.
+       *
+       * Cannot call lp_setup_flush_and_restart() directly here
+       * because of potential recursion.
+       */
+      if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+         return FALSE;
+
+      if (!set_scene_state(setup, SETUP_ACTIVE, __FUNCTION__))
+         return FALSE;
+
+      if (!setup->scene)
+         return FALSE;
+
+      return try_update_scene_state(setup);
    }
+
+   return TRUE;
 }
 
 
@@ -1016,12 +1063,12 @@ lp_setup_begin_query(struct lp_setup_context *setup,
                                    LP_RAST_OP_BEGIN_QUERY,
                                    lp_rast_arg_query(pq))) {
 
-         lp_setup_flush_and_restart(setup);
+         if (!lp_setup_flush_and_restart(setup))
+            return;
 
          if (!lp_scene_bin_everywhere(setup->scene,
                                       LP_RAST_OP_BEGIN_QUERY,
                                       lp_rast_arg_query(pq))) {
-            assert(0);
             return;
          }
       }
@@ -1065,14 +1112,20 @@ lp_setup_end_query(struct lp_setup_context *setup, struct llvmpipe_query *pq)
 }
 
 
-void
+boolean
 lp_setup_flush_and_restart(struct lp_setup_context *setup)
 {
    if (0) debug_printf("%s\n", __FUNCTION__);
 
    assert(setup->state == SETUP_ACTIVE);
-   set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__);
-   lp_setup_update_state(setup, TRUE);
+
+   if (!set_scene_state(setup, SETUP_FLUSHED, __FUNCTION__))
+      return FALSE;
+   
+   if (!lp_setup_update_state(setup, TRUE))
+      return FALSE;
+
+   return TRUE;
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup.h b/src/gallium/drivers/llvmpipe/lp_setup.h
index 25dab78f64f..ebb18f81344 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup.h
@@ -33,28 +33,6 @@
 struct draw_context;
 struct vertex_info;
 
-enum lp_interp {
-   LP_INTERP_CONSTANT,
-   LP_INTERP_LINEAR,
-   LP_INTERP_PERSPECTIVE,
-   LP_INTERP_POSITION,
-   LP_INTERP_FACING
-};
-
-
-/**
- * Describes how to compute the interpolation coefficients (a0, dadx, dady)
- * from the vertices passed into our triangle/line/point functions by the
- * draw module.
- *
- * Vertices are treated as an array of float[4] values, indexed by
- * src_index.
- */
-struct lp_shader_input {
-   enum lp_interp interp;       /* how to interpolate values */
-   unsigned src_index;          /* where to find values in incoming vertices */
-   unsigned usage_mask;         /* bitmask of TGSI_WRITEMASK_x flags */
-};
 
 struct pipe_resource;
 struct pipe_query;
@@ -66,7 +44,7 @@ struct lp_fragment_shader_variant;
 struct lp_jit_context;
 struct llvmpipe_query;
 struct pipe_fence_handle;
-
+struct lp_setup_variant;
 
 struct lp_setup_context *
 lp_setup_create( struct pipe_context *pipe,
@@ -111,9 +89,8 @@ lp_setup_set_point_state( struct lp_setup_context *setup,
                           uint sprite_coord_origin);
 
 void
-lp_setup_set_fs_inputs( struct lp_setup_context *setup,
-                        const struct lp_shader_input *interp,
-                        unsigned nr );
+lp_setup_set_setup_variant( struct lp_setup_context *setup,
+			    const struct lp_setup_variant *variant );
 
 void
 lp_setup_set_fs_variant( struct lp_setup_context *setup,
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.c b/src/gallium/drivers/llvmpipe/lp_setup_coef.c
deleted file mode 100644
index 8dc2688ddb6..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef.c
+++ /dev/null
@@ -1,279 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010, VMware.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/*
- * Binning code for triangles
- */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "lp_perf.h"
-#include "lp_setup_context.h"
-#include "lp_setup_coef.h"
-#include "lp_rast.h"
-#include "lp_state_fs.h"
-
-#if !defined(PIPE_ARCH_SSE)
-
-/**
- * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
- */
-static void constant_coef( struct lp_rast_shader_inputs *inputs,
-                           unsigned slot,
-			   const float value,
-                           unsigned i )
-{
-   inputs->a0[slot][i] = value;
-   inputs->dadx[slot][i] = 0.0f;
-   inputs->dady[slot][i] = 0.0f;
-}
-
-
-
-static void linear_coef( struct lp_rast_shader_inputs *inputs,
-                         const struct lp_tri_info *info,
-                         unsigned slot,
-                         unsigned vert_attr,
-                         unsigned i)
-{
-   float a0 = info->v0[vert_attr][i];
-   float a1 = info->v1[vert_attr][i];
-   float a2 = info->v2[vert_attr][i];
-
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = (da01 * info->dy20_ooa - info->dy01_ooa * da20);
-   float dady = (da20 * info->dx01_ooa - info->dx20_ooa * da01);
-
-   inputs->dadx[slot][i] = dadx;
-   inputs->dady[slot][i] = dady;
-
-   /* calculate a0 as the value which would be sampled for the
-    * fragment at (0,0), taking into account that we want to sample at
-    * pixel centers, in other words (0.5, 0.5).
-    *
-    * this is neat but unfortunately not a good way to do things for
-    * triangles with very large values of dadx or dady as it will
-    * result in the subtraction and re-addition from a0 of a very
-    * large number, which means we'll end up loosing a lot of the
-    * fractional bits and precision from a0.  the way to fix this is
-    * to define a0 as the sample at a pixel center somewhere near vmin
-    * instead - i'll switch to this later.
-    */
-   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
-				   dady * info->y0_center);
-}
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void perspective_coef( struct lp_rast_shader_inputs *inputs,
-                              const struct lp_tri_info *info,
-                              unsigned slot,
-			      unsigned vert_attr,
-                              unsigned i)
-{
-   /* premultiply by 1/w  (v[0][3] is always 1/w):
-    */
-   float a0 = info->v0[vert_attr][i] * info->v0[0][3];
-   float a1 = info->v1[vert_attr][i] * info->v1[0][3];
-   float a2 = info->v2[vert_attr][i] * info->v2[0][3];
-   float da01 = a0 - a1;
-   float da20 = a2 - a0;
-   float dadx = da01 * info->dy20_ooa - info->dy01_ooa * da20;
-   float dady = da20 * info->dx01_ooa - info->dx20_ooa * da01;
-
-   inputs->dadx[slot][i] = dadx;
-   inputs->dady[slot][i] = dady;
-   inputs->a0[slot][i] = a0 - (dadx * info->x0_center +
-				   dady * info->y0_center);
-}
-
-
-/**
- * Special coefficient setup for gl_FragCoord.
- * X and Y are trivial
- * Z and W are copied from position_coef which should have already been computed.
- * We could do a bit less work if we'd examine gl_FragCoord's swizzle mask.
- */
-static void
-setup_fragcoord_coef(struct lp_rast_shader_inputs *inputs,
-                     const struct lp_tri_info *info,
-                     unsigned slot,
-                     unsigned usage_mask)
-{
-   /*X*/
-   if (usage_mask & TGSI_WRITEMASK_X) {
-      inputs->a0[slot][0] = 0.0;
-      inputs->dadx[slot][0] = 1.0;
-      inputs->dady[slot][0] = 0.0;
-   }
-
-   /*Y*/
-   if (usage_mask & TGSI_WRITEMASK_Y) {
-      inputs->a0[slot][1] = 0.0;
-      inputs->dadx[slot][1] = 0.0;
-      inputs->dady[slot][1] = 1.0;
-   }
-
-   /*Z*/
-   if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(inputs, info, slot, 0, 2);
-   }
-
-   /*W*/
-   if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(inputs, info, slot, 0, 3);
-   }
-}
-
-
-/**
- * Setup the fragment input attribute with the front-facing value.
- * \param frontface  is the triangle front facing?
- */
-static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
-                               unsigned slot,
-                               boolean frontface,
-                               unsigned usage_mask)
-{
-   /* convert TRUE to 1.0 and FALSE to -1.0 */
-   if (usage_mask & TGSI_WRITEMASK_X)
-      constant_coef( inputs, slot, 2.0f * frontface - 1.0f, 0 );
-
-   if (usage_mask & TGSI_WRITEMASK_Y)
-      constant_coef( inputs, slot, 0.0f, 1 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_Z)
-      constant_coef( inputs, slot, 0.0f, 2 ); /* wasted */
-
-   if (usage_mask & TGSI_WRITEMASK_W)
-      constant_coef( inputs, slot, 0.0f, 3 ); /* wasted */
-}
-
-
-/**
- * Compute the tri->coef[] array dadx, dady, a0 values.
- */
-void lp_setup_tri_coef( struct lp_setup_context *setup,
-			struct lp_rast_shader_inputs *inputs,
-                        const float (*v0)[4],
-                        const float (*v1)[4],
-                        const float (*v2)[4],
-                        boolean frontfacing)
-{
-   unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
-   unsigned slot;
-   unsigned i;
-   struct lp_tri_info info;
-   float dx01 = v0[0][0] - v1[0][0];
-   float dy01 = v0[0][1] - v1[0][1];
-   float dx20 = v2[0][0] - v0[0][0];
-   float dy20 = v2[0][1] - v0[0][1];
-   float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01);
-
-   info.v0 = v0;
-   info.v1 = v1;
-   info.v2 = v2;
-   info.frontfacing = frontfacing;
-   info.x0_center = v0[0][0] - setup->pixel_offset;
-   info.y0_center = v0[0][1] - setup->pixel_offset;
-   info.dx01_ooa  = dx01 * oneoverarea;
-   info.dx20_ooa  = dx20 * oneoverarea;
-   info.dy01_ooa  = dy01 * oneoverarea;
-   info.dy20_ooa  = dy20 * oneoverarea;
-
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
-
-      switch (setup->fs.input[slot].interp) {
-      case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(inputs, slot+1, info.v0[vert_attr][i], i);
-         }
-         else {
-            for (i = 0; i < NUM_CHANNELS; i++)
-               if (usage_mask & (1 << i))
-                  constant_coef(inputs, slot+1, info.v2[vert_attr][i], i);
-         }
-         break;
-
-      case LP_INTERP_LINEAR:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               linear_coef(inputs, &info, slot+1, vert_attr, i);
-         break;
-
-      case LP_INTERP_PERSPECTIVE:
-         for (i = 0; i < NUM_CHANNELS; i++)
-            if (usage_mask & (1 << i))
-               perspective_coef(inputs, &info, slot+1, vert_attr, i);
-         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
-         break;
-
-      case LP_INTERP_POSITION:
-         /*
-          * The generated pixel interpolators will pick up the coeffs from
-          * slot 0, so all need to ensure that the usage mask is covers all
-          * usages.
-          */
-         fragcoord_usage_mask |= usage_mask;
-         break;
-
-      case LP_INTERP_FACING:
-         setup_facing_coef(inputs, slot+1, info.frontfacing, usage_mask);
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-
-   /* The internal position input is in slot zero:
-    */
-   setup_fragcoord_coef(inputs, &info, 0, fragcoord_usage_mask);
-}
-
-#else
-extern void lp_setup_coef_dummy(void);
-void lp_setup_coef_dummy(void)
-{
-}
-
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef.h b/src/gallium/drivers/llvmpipe/lp_setup_coef.h
deleted file mode 100644
index 87a3255ccc6..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef.h
+++ /dev/null
@@ -1,64 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 VMware, Inc.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-
-/**
- * The setup code is concerned with point/line/triangle setup and
- * putting commands/data into the bins.
- */
-
-
-#ifndef LP_SETUP_COEF_H
-#define LP_SETUP_COEF_H
-
-
-struct lp_tri_info {
-
-   float x0_center;
-   float y0_center;
-
-   /* turn these into an aligned float[4] */
-   float dy01_ooa;
-   float dy20_ooa;
-   float dx01_ooa;
-   float dx20_ooa;
-
-   const float (*v0)[4];
-   const float (*v1)[4];
-   const float (*v2)[4];
-
-   boolean frontfacing;		/* remove eventually */
-};
-
-void lp_setup_tri_coef( struct lp_setup_context *setup,
-			struct lp_rast_shader_inputs *inputs,
-                        const float (*v0)[4],
-                        const float (*v1)[4],
-                        const float (*v2)[4],
-                        boolean frontfacing);
-
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c b/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
deleted file mode 100644
index 3742fd672b2..00000000000
--- a/src/gallium/drivers/llvmpipe/lp_setup_coef_intrin.c
+++ /dev/null
@@ -1,228 +0,0 @@
-/**************************************************************************
- *
- * Copyright 2010 VMware.
- * All Rights Reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the
- * "Software"), to deal in the Software without restriction, including
- * without limitation the rights to use, copy, modify, merge, publish,
- * distribute, sub license, and/or sell copies of the Software, and to
- * permit persons to whom the Software is furnished to do so, subject to
- * the following conditions:
- *
- * The above copyright notice and this permission notice (including the
- * next paragraph) shall be included in all copies or substantial portions
- * of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
- * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
- * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
- * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
- * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
- * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- *
- **************************************************************************/
-
-/*
- * Binning code for triangles
- */
-
-#include "util/u_math.h"
-#include "util/u_memory.h"
-#include "lp_perf.h"
-#include "lp_setup_context.h"
-#include "lp_setup_coef.h"
-#include "lp_rast.h"
-
-#if defined(PIPE_ARCH_SSE)
-#include <emmintrin.h>
-
-
-static void constant_coef4( struct lp_rast_shader_inputs *inputs,
-			    const struct lp_tri_info *info,
-			    unsigned slot,
-			    const float *attr)
-{
-   *(__m128 *)inputs->a0[slot]   = *(__m128 *)attr;
-   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
-   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
-}
-
-
-
-/**
- * Setup the fragment input attribute with the front-facing value.
- * \param frontface  is the triangle front facing?
- */
-static void setup_facing_coef( struct lp_rast_shader_inputs *inputs,
-			       const struct lp_tri_info *info,
-			       unsigned slot )
-{
-   /* XXX: just pass frontface directly to the shader, don't bother
-    * treating it as an input.
-    */
-   __m128 a0 = _mm_setr_ps(info->frontfacing ? 1.0 : -1.0,
-			   0, 0, 0);
-
-   *(__m128 *)inputs->a0[slot]   = a0;
-   *(__m128 *)inputs->dadx[slot] = _mm_set1_ps(0.0);
-   *(__m128 *)inputs->dady[slot] = _mm_set1_ps(0.0);
-}
-
-
-
-static void calc_coef4( struct lp_rast_shader_inputs *inputs,
-			const struct lp_tri_info *info,
-			unsigned slot,
-			__m128 a0,
-			__m128 a1,
-			__m128 a2)
-{
-   __m128 da01          = _mm_sub_ps(a0, a1);
-   __m128 da20          = _mm_sub_ps(a2, a0);
-
-   __m128 da01_dy20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dy20_ooa));
-   __m128 da20_dy01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dy01_ooa));   
-   __m128 dadx          = _mm_sub_ps(da01_dy20_ooa, da20_dy01_ooa);
-
-   __m128 da01_dx20_ooa = _mm_mul_ps(da01, _mm_set1_ps(info->dx20_ooa));
-   __m128 da20_dx01_ooa = _mm_mul_ps(da20, _mm_set1_ps(info->dx01_ooa));
-   __m128 dady          = _mm_sub_ps(da20_dx01_ooa, da01_dx20_ooa);
-
-   __m128 dadx_x0       = _mm_mul_ps(dadx, _mm_set1_ps(info->x0_center));
-   __m128 dady_y0       = _mm_mul_ps(dady, _mm_set1_ps(info->y0_center));
-   __m128 attr_v0       = _mm_add_ps(dadx_x0, dady_y0);
-   __m128 attr_0        = _mm_sub_ps(a0, attr_v0);
-
-   *(__m128 *)inputs->a0[slot]   = attr_0;
-   *(__m128 *)inputs->dadx[slot] = dadx;
-   *(__m128 *)inputs->dady[slot] = dady;
-}
-
-
-static void linear_coef( struct lp_rast_shader_inputs *inputs,
-                         const struct lp_tri_info *info,
-                         unsigned slot,
-                         unsigned vert_attr)
-{
-   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
-   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
-   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
-
-   calc_coef4(inputs, info, slot, a0, a1, a2);
-}
-
-
-
-/**
- * Compute a0, dadx and dady for a perspective-corrected interpolant,
- * for a triangle.
- * We basically multiply the vertex value by 1/w before computing
- * the plane coefficients (a0, dadx, dady).
- * Later, when we compute the value at a particular fragment position we'll
- * divide the interpolated value by the interpolated W at that fragment.
- */
-static void perspective_coef( struct lp_rast_shader_inputs *inputs,
-                              const struct lp_tri_info *info,
-                              unsigned slot,
-			      unsigned vert_attr)
-{
-   /* premultiply by 1/w  (v[0][3] is always 1/w):
-    */
-   __m128 a0 = *(const __m128 *)info->v0[vert_attr];
-   __m128 a1 = *(const __m128 *)info->v1[vert_attr];
-   __m128 a2 = *(const __m128 *)info->v2[vert_attr];
-
-   __m128 a0_oow = _mm_mul_ps(a0, _mm_set1_ps(info->v0[0][3]));
-   __m128 a1_oow = _mm_mul_ps(a1, _mm_set1_ps(info->v1[0][3]));
-   __m128 a2_oow = _mm_mul_ps(a2, _mm_set1_ps(info->v2[0][3]));
-
-   calc_coef4(inputs, info, slot, a0_oow, a1_oow, a2_oow);
-}
-
-
-
-
-
-/**
- * Compute the inputs-> dadx, dady, a0 values.
- */
-void lp_setup_tri_coef( struct lp_setup_context *setup,
-			struct lp_rast_shader_inputs *inputs,
-                        const float (*v0)[4],
-                        const float (*v1)[4],
-                        const float (*v2)[4],
-                        boolean frontfacing)
-{
-   unsigned slot;
-   struct lp_tri_info info;
-   float dx01 = v0[0][0] - v1[0][0];
-   float dy01 = v0[0][1] - v1[0][1];
-   float dx20 = v2[0][0] - v0[0][0];
-   float dy20 = v2[0][1] - v0[0][1];
-   float oneoverarea = 1.0f / (dx01 * dy20 - dx20 * dy01);
-
-   info.v0 = v0;
-   info.v1 = v1;
-   info.v2 = v2;
-   info.frontfacing = frontfacing;
-   info.x0_center = v0[0][0] - setup->pixel_offset;
-   info.y0_center = v0[0][1] - setup->pixel_offset;
-   info.dx01_ooa  = dx01 * oneoverarea;
-   info.dx20_ooa  = dx20 * oneoverarea;
-   info.dy01_ooa  = dy01 * oneoverarea;
-   info.dy20_ooa  = dy20 * oneoverarea;
-
-
-   /* The internal position input is in slot zero:
-    */
-   linear_coef(inputs, &info, 0, 0);
-
-   /* setup interpolation for all the remaining attributes:
-    */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-
-      switch (setup->fs.input[slot].interp) {
-      case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
-	    constant_coef4(inputs, &info, slot+1, info.v0[vert_attr]);
-         }
-         else {
-	    constant_coef4(inputs, &info, slot+1, info.v2[vert_attr]);
-         }
-         break;
-
-      case LP_INTERP_LINEAR:
-	 linear_coef(inputs, &info, slot+1, vert_attr);
-         break;
-
-      case LP_INTERP_PERSPECTIVE:
-	 perspective_coef(inputs, &info, slot+1, vert_attr);
-         break;
-
-      case LP_INTERP_POSITION:
-         /*
-          * The generated pixel interpolators will pick up the coeffs from
-          * slot 0.
-          */
-         break;
-
-      case LP_INTERP_FACING:
-         setup_facing_coef(inputs, &info, slot+1);
-         break;
-
-      default:
-         assert(0);
-      }
-   }
-}
-
-#else
-extern void lp_setup_coef_dummy(void);
-void lp_setup_coef_dummy(void)
-{
-}
-#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_context.h b/src/gallium/drivers/llvmpipe/lp_setup_context.h
index 8506ed2dc9e..dc2533bedc5 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_context.h
+++ b/src/gallium/drivers/llvmpipe/lp_setup_context.h
@@ -39,6 +39,7 @@
 #include "lp_rast.h"
 #include "lp_tile_soa.h"        /* for TILE_SIZE */
 #include "lp_scene.h"
+#include "lp_bld_interp.h"	/* for struct lp_shader_input */
 
 #include "draw/draw_vbuf.h"
 #include "util/u_rect.h"
@@ -49,6 +50,8 @@
 #define LP_SETUP_NEW_SCISSOR     0x08
 
 
+struct lp_setup_variant;
+
 
 /** Max number of scenes */
 #define MAX_SCENES 2
@@ -118,9 +121,6 @@ struct lp_setup_context
    } state;
    
    struct {
-      struct lp_shader_input input[PIPE_MAX_ATTRIBS];
-      unsigned nr_inputs;
-
       const struct lp_rast_state *stored; /**< what's in the scene */
       struct lp_rast_state current;  /**< currently set state */
       struct pipe_resource *current_tex[PIPE_MAX_SAMPLERS];
@@ -139,6 +139,10 @@ struct lp_setup_context
    } blend_color;
 
 
+   struct {
+      const struct lp_setup_variant *variant;
+   } setup;
+
    unsigned dirty;   /**< bitmask of LP_SETUP_NEW_x bits */
 
    void (*point)( struct lp_setup_context *,
@@ -160,12 +164,12 @@ void lp_setup_choose_point( struct lp_setup_context *setup );
 
 void lp_setup_init_vbuf(struct lp_setup_context *setup);
 
-void lp_setup_update_state( struct lp_setup_context *setup,
+boolean lp_setup_update_state( struct lp_setup_context *setup,
                             boolean update_scene);
 
 void lp_setup_destroy( struct lp_setup_context *setup );
 
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
+boolean lp_setup_flush_and_restart(struct lp_setup_context *setup);
 
 void
 lp_setup_print_triangle(struct lp_setup_context *setup,
@@ -181,7 +185,7 @@ lp_setup_print_vertex(struct lp_setup_context *setup,
 
 struct lp_rast_triangle *
 lp_setup_alloc_triangle(struct lp_scene *scene,
-                        unsigned nr_inputs,
+                        unsigned num_inputs,
                         unsigned nr_planes,
                         unsigned *tri_size);
 
@@ -191,6 +195,4 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                        const struct u_rect *bbox,
                        int nr_planes );
 
-void lp_setup_flush_and_restart(struct lp_setup_context *setup);
-
 #endif
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_line.c b/src/gallium/drivers/llvmpipe/lp_setup_line.c
index 156bd633754..827413bb33e 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_line.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_line.c
@@ -35,6 +35,7 @@
 #include "lp_setup_context.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 
 #define NUM_CHANNELS 4
 
@@ -46,6 +47,10 @@ struct lp_line_info {
 
    const float (*v1)[4];
    const float (*v2)[4];
+
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
 };
 
 
@@ -53,14 +58,14 @@ struct lp_line_info {
  * Compute a0 for a constant-valued coefficient (GL_FLAT shading).
  */
 static void constant_coef( struct lp_setup_context *setup,
-                           struct lp_rast_triangle *tri,
+                           struct lp_line_info *info,
                            unsigned slot,
                            const float value,
                            unsigned i )
 {
-   tri->inputs.a0[slot][i] = value;
-   tri->inputs.dadx[slot][i] = 0.0f;
-   tri->inputs.dady[slot][i] = 0.0f;
+   info->a0[slot][i] = value;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
@@ -69,7 +74,6 @@ static void constant_coef( struct lp_setup_context *setup,
  * for a triangle.
  */
 static void linear_coef( struct lp_setup_context *setup,
-                         struct lp_rast_triangle *tri,
                          struct lp_line_info *info,
                          unsigned slot,
                          unsigned vert_attr,
@@ -82,10 +86,10 @@ static void linear_coef( struct lp_setup_context *setup,
    float dadx = da21 * info->dx * info->oneoverarea;
    float dady = da21 * info->dy * info->oneoverarea;
 
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;  
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;  
    
-   tri->inputs.a0[slot][i] = (a1 -
+   info->a0[slot][i] = (a1 -
                               (dadx * (info->v1[0][0] - setup->pixel_offset) +
                                dady * (info->v1[0][1] - setup->pixel_offset)));
 }
@@ -100,7 +104,6 @@ static void linear_coef( struct lp_setup_context *setup,
  * divide the interpolated value by the interpolated W at that fragment.
  */
 static void perspective_coef( struct lp_setup_context *setup,
-                              struct lp_rast_triangle *tri,
                               struct lp_line_info *info,
                               unsigned slot,
                               unsigned vert_attr,
@@ -115,43 +118,42 @@ static void perspective_coef( struct lp_setup_context *setup,
    float dadx = da21 * info->dx * info->oneoverarea;
    float dady = da21 * info->dy * info->oneoverarea;
 
-   tri->inputs.dadx[slot][i] = dadx;
-   tri->inputs.dady[slot][i] = dady;
+   info->dadx[slot][i] = dadx;
+   info->dady[slot][i] = dady;
    
-   tri->inputs.a0[slot][i] = (a1 -
-                              (dadx * (info->v1[0][0] - setup->pixel_offset) +
-                               dady * (info->v1[0][1] - setup->pixel_offset)));
+   info->a0[slot][i] = (a1 -
+                        (dadx * (info->v1[0][0] - setup->pixel_offset) +
+                         dady * (info->v1[0][1] - setup->pixel_offset)));
 }
 
 static void
 setup_fragcoord_coef( struct lp_setup_context *setup,
-                      struct lp_rast_triangle *tri,
                       struct lp_line_info *info,
                       unsigned slot,
                       unsigned usage_mask)
 {
    /*X*/
    if (usage_mask & TGSI_WRITEMASK_X) {
-      tri->inputs.a0[slot][0] = 0.0;
-      tri->inputs.dadx[slot][0] = 1.0;
-      tri->inputs.dady[slot][0] = 0.0;
+      info->a0[slot][0] = 0.0;
+      info->dadx[slot][0] = 1.0;
+      info->dady[slot][0] = 0.0;
    }
 
    /*Y*/
    if (usage_mask & TGSI_WRITEMASK_Y) {
-      tri->inputs.a0[slot][1] = 0.0;
-      tri->inputs.dadx[slot][1] = 0.0;
-      tri->inputs.dady[slot][1] = 1.0;
+      info->a0[slot][1] = 0.0;
+      info->dadx[slot][1] = 0.0;
+      info->dady[slot][1] = 1.0;
    }
 
    /*Z*/
    if (usage_mask & TGSI_WRITEMASK_Z) {
-      linear_coef(setup, tri, info, slot, 0, 2);
+      linear_coef(setup, info, slot, 0, 2);
    }
 
    /*W*/
    if (usage_mask & TGSI_WRITEMASK_W) {
-      linear_coef(setup, tri, info, slot, 0, 3);
+      linear_coef(setup, info, slot, 0, 3);
    }
 }
 
@@ -159,43 +161,43 @@ setup_fragcoord_coef( struct lp_setup_context *setup,
  * Compute the tri->coef[] array dadx, dady, a0 values.
  */
 static void setup_line_coefficients( struct lp_setup_context *setup,
-                                     struct lp_rast_triangle *tri,
                                      struct lp_line_info *info)
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
    unsigned slot;
 
    /* setup interpolation for all the remaining attributes:
     */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned vert_attr = key->inputs[slot].src_index;
+      unsigned usage_mask = key->inputs[slot].usage_mask;
       unsigned i;
            
-      switch (setup->fs.input[slot].interp) {
+      switch (key->inputs[slot].interp) {
       case LP_INTERP_CONSTANT:
-         if (setup->flatshade_first) {
+         if (key->flatshade_first) {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(setup, tri, slot+1, info->v1[vert_attr][i], i);
+                  constant_coef(setup, info, slot+1, info->v1[vert_attr][i], i);
          }
          else {
             for (i = 0; i < NUM_CHANNELS; i++)
                if (usage_mask & (1 << i))
-                  constant_coef(setup, tri, slot+1, info->v2[vert_attr][i], i);
+                  constant_coef(setup, info, slot+1, info->v2[vert_attr][i], i);
          }
          break;
 
       case LP_INTERP_LINEAR:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               linear_coef(setup, tri, info, slot+1, vert_attr, i);
+               linear_coef(setup, info, slot+1, vert_attr, i);
          break;
 
       case LP_INTERP_PERSPECTIVE:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               perspective_coef(setup, tri, info, slot+1, vert_attr, i);
+               perspective_coef(setup, info, slot+1, vert_attr, i);
          fragcoord_usage_mask |= TGSI_WRITEMASK_W;
          break;
 
@@ -211,7 +213,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
       case LP_INTERP_FACING:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               constant_coef(setup, tri, slot+1, 1.0, i);
+               constant_coef(setup, info, slot+1, 1.0, i);
          break;
 
       default:
@@ -221,7 +223,7 @@ static void setup_line_coefficients( struct lp_setup_context *setup,
 
    /* The internal position input is in slot zero:
     */
-   setup_fragcoord_coef(setup, tri, info, 0,
+   setup_fragcoord_coef(setup, info, 0,
                         fragcoord_usage_mask);
 }
 
@@ -241,14 +243,15 @@ print_line(struct lp_setup_context *setup,
            const float (*v1)[4],
            const float (*v2)[4])
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    uint i;
 
    debug_printf("llvmpipe line\n");
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+   for (i = 0; i < 1 + key->num_inputs; i++) {
       debug_printf("  v1[%d]:  %f %f %f %f\n", i,
                    v1[i][0], v1[i][1], v1[i][2], v1[i][3]);
    }
-   for (i = 0; i < 1 + setup->fs.nr_inputs; i++) {
+   for (i = 0; i < 1 + key->num_inputs; i++) {
       debug_printf("  v2[%d]:  %f %f %f %f\n", i,
                    v2[i][0], v2[i][1], v2[i][2], v2[i][3]);
    }
@@ -275,7 +278,9 @@ try_setup_line( struct lp_setup_context *setup,
                const float (*v2)[4])
 {
    struct lp_scene *scene = setup->scene;
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    struct lp_rast_triangle *line;
+   struct lp_rast_plane *plane;
    struct lp_line_info info;
    float width = MAX2(1.0, setup->line_width);
    struct u_rect bbox;
@@ -475,7 +480,7 @@ try_setup_line( struct lp_setup_context *setup,
       else {
          /* do intersection test */
          float xintersect = fracf(v2[0][0]) + y2diff * dxdy;
-         draw_end = (xintersect < 1.0 && xintersect > 0.0);
+         draw_end = (xintersect < 1.0 && xintersect >= 0.0);
       }
 
       /* Are we already drawing start/end?
@@ -513,7 +518,7 @@ try_setup_line( struct lp_setup_context *setup,
             x_offset_end = y_offset_end * dxdy;
          }
       }
- 
+
       /* x/y positions in fixed point */
       x[0] = subpixel_snap(v1[0][0] + x_offset     - setup->pixel_offset) - fixed_width/2;
       x[1] = subpixel_snap(v2[0][0] + x_offset_end - setup->pixel_offset) - fixed_width/2;
@@ -567,7 +572,7 @@ try_setup_line( struct lp_setup_context *setup,
    u_rect_find_intersection(&setup->draw_region, &bbox);
 
    line = lp_setup_alloc_triangle(scene,
-                                  setup->fs.nr_inputs,
+                                  key->num_inputs,
                                   nr_planes,
                                   &tri_bytes);
    if (!line)
@@ -581,33 +586,35 @@ try_setup_line( struct lp_setup_context *setup,
 #endif
 
    /* calculate the deltas */
-   line->plane[0].dcdy = x[0] - x[1];
-   line->plane[1].dcdy = x[1] - x[2];
-   line->plane[2].dcdy = x[2] - x[3];
-   line->plane[3].dcdy = x[3] - x[0];
+   plane = GET_PLANES(line);
+   plane[0].dcdy = x[0] - x[1];
+   plane[1].dcdy = x[1] - x[2];
+   plane[2].dcdy = x[2] - x[3];
+   plane[3].dcdy = x[3] - x[0];
 
-   line->plane[0].dcdx = y[0] - y[1];
-   line->plane[1].dcdx = y[1] - y[2];
-   line->plane[2].dcdx = y[2] - y[3];
-   line->plane[3].dcdx = y[3] - y[0];
+   plane[0].dcdx = y[0] - y[1];
+   plane[1].dcdx = y[1] - y[2];
+   plane[2].dcdx = y[2] - y[3];
+   plane[3].dcdx = y[3] - y[0];
 
 
    /* Setup parameter interpolants:
     */
-   setup_line_coefficients( setup, line, &info); 
+   info.a0 = GET_A0(&line->inputs);
+   info.dadx = GET_DADX(&line->inputs);
+   info.dady = GET_DADY(&line->inputs);
+   setup_line_coefficients(setup, &info); 
 
-   line->inputs.facing = 1.0F;
-   line->inputs.state = setup->fs.stored;
+   line->inputs.frontfacing = TRUE;
    line->inputs.disable = FALSE;
    line->inputs.opaque = FALSE;
 
    for (i = 0; i < 4; i++) {
-      struct lp_rast_plane *plane = &line->plane[i];
 
       /* half-edge constants, will be interated over the whole render
        * target.
        */
-      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
+      plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
 
       
       /* correct for top-left vs. bottom-left fill convention.  
@@ -623,38 +630,34 @@ try_setup_line( struct lp_setup_context *setup,
        * to its usual method, in which case it will probably want
        * to use the opposite, top-left convention.
        */         
-      if (plane->dcdx < 0) {
+      if (plane[i].dcdx < 0) {
          /* both fill conventions want this - adjust for left edges */
-         plane->c++;            
+         plane[i].c++;            
       }
-      else if (plane->dcdx == 0) {
+      else if (plane[i].dcdx == 0) {
          if (setup->pixel_offset == 0) {
             /* correct for top-left fill convention:
              */
-            if (plane->dcdy > 0) plane->c++;
+            if (plane[i].dcdy > 0) plane[i].c++;
          }
          else {
             /* correct for bottom-left fill convention:
              */
-            if (plane->dcdy < 0) plane->c++;
+            if (plane[i].dcdy < 0) plane[i].c++;
          }
       }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+      plane[i].dcdx *= FIXED_ONE;
+      plane[i].dcdy *= FIXED_ONE;
 
       /* find trivial reject offsets for each edge for a single-pixel
        * sized block.  These will be scaled up at each recursive level to
        * match the active blocksize.  Scaling in this way works best if
        * the blocks are square.
        */
-      plane->eo = 0;
-      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
-      if (plane->dcdy > 0) plane->eo += plane->dcdy;
-
-      /* Calculate trivial accept offsets from the above.
-       */
-      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+      plane[i].eo = 0;
+      if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+      if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
    }
 
 
@@ -677,29 +680,25 @@ try_setup_line( struct lp_setup_context *setup,
     * these planes elsewhere.
     */
    if (nr_planes == 8) {
-      line->plane[4].dcdx = -1;
-      line->plane[4].dcdy = 0;
-      line->plane[4].c = 1-bbox.x0;
-      line->plane[4].ei = 0;
-      line->plane[4].eo = 1;
-
-      line->plane[5].dcdx = 1;
-      line->plane[5].dcdy = 0;
-      line->plane[5].c = bbox.x1+1;
-      line->plane[5].ei = -1;
-      line->plane[5].eo = 0;
-
-      line->plane[6].dcdx = 0;
-      line->plane[6].dcdy = 1;
-      line->plane[6].c = 1-bbox.y0;
-      line->plane[6].ei = 0;
-      line->plane[6].eo = 1;
-
-      line->plane[7].dcdx = 0;
-      line->plane[7].dcdy = -1;
-      line->plane[7].c = bbox.y1+1;
-      line->plane[7].ei = -1;
-      line->plane[7].eo = 0;
+      plane[4].dcdx = -1;
+      plane[4].dcdy = 0;
+      plane[4].c = 1-bbox.x0;
+      plane[4].eo = 1;
+
+      plane[5].dcdx = 1;
+      plane[5].dcdy = 0;
+      plane[5].c = bbox.x1+1;
+      plane[5].eo = 0;
+
+      plane[6].dcdx = 0;
+      plane[6].dcdy = 1;
+      plane[6].c = 1-bbox.y0;
+      plane[6].eo = 1;
+
+      plane[7].dcdx = 0;
+      plane[7].dcdy = -1;
+      plane[7].c = bbox.y1+1;
+      plane[7].eo = 0;
    }
 
    return lp_setup_bin_triangle(setup, line, &bbox, nr_planes);
@@ -712,10 +711,11 @@ static void lp_setup_line( struct lp_setup_context *setup,
 {
    if (!try_setup_line( setup, v0, v1 ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
       if (!try_setup_line( setup, v0, v1 ))
-         assert(0);
+         return;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_point.c b/src/gallium/drivers/llvmpipe/lp_setup_point.c
index 3b217f95447..146f1bd07ca 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_point.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_point.c
@@ -35,6 +35,7 @@
 #include "lp_perf.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 #include "tgsi/tgsi_scan.h"
 
 #define NUM_CHANNELS 4
@@ -45,6 +46,10 @@ struct point_info {
    int dx01, dx12;
 
    const float (*v0)[4];
+
+   float (*a0)[4];
+   float (*dadx)[4];
+   float (*dady)[4];
 };   
 
 
@@ -53,14 +58,37 @@ struct point_info {
  */
 static void
 constant_coef(struct lp_setup_context *setup,
-              struct lp_rast_triangle *point,
+              struct point_info *info,
               unsigned slot,
               const float value,
               unsigned i)
 {
-   point->inputs.a0[slot][i] = value;
-   point->inputs.dadx[slot][i] = 0.0f;
-   point->inputs.dady[slot][i] = 0.0f;
+   info->a0[slot][i] = value;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
+}
+
+
+static void
+point_persp_coeff(struct lp_setup_context *setup,
+                  const struct point_info *info,
+                  unsigned slot,
+                  unsigned i)
+{
+   /*
+    * Fragment shader expects pre-multiplied w for LP_INTERP_PERSPECTIVE. A
+    * better stratergy would be to take the primitive in consideration when
+    * generating the fragment shader key, and therefore avoid the per-fragment
+    * perspective divide.
+    */
+
+   float w0 = info->v0[0][3];
+
+   assert(i < 4);
+
+   info->a0[slot][i] = info->v0[slot][i]*w0;
+   info->dadx[slot][i] = 0.0f;
+   info->dady[slot][i] = 0.0f;
 }
 
 
@@ -69,17 +97,19 @@ constant_coef(struct lp_setup_context *setup,
  * \param slot  the vertex attribute slot to setup
  * \param i  the attribute channel in [0,3]
  * \param sprite_coord_origin  one of PIPE_SPRITE_COORD_x
- * \param perspective_proj  will the TEX instruction do a divide by Q?
+ * \param perspective  does the shader expects pre-multiplied w, i.e.,
+ *    LP_INTERP_PERSPECTIVE is specified in the shader key
  */
 static void
 texcoord_coef(struct lp_setup_context *setup,
-              struct lp_rast_triangle *point,
               const struct point_info *info,
               unsigned slot,
               unsigned i,
               unsigned sprite_coord_origin,
-              boolean perspective_proj)
+              boolean perspective)
 {
+   float w0 = info->v0[0][3];
+
    assert(i < 4);
 
    if (i == 0) {
@@ -88,21 +118,14 @@ texcoord_coef(struct lp_setup_context *setup,
       float x0 = info->v0[0][0] - setup->pixel_offset;
       float y0 = info->v0[0][1] - setup->pixel_offset;
 
-      point->inputs.dadx[slot][0] = dadx;
-      point->inputs.dady[slot][0] = dady;
-      point->inputs.a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
+      info->dadx[slot][0] = dadx;
+      info->dady[slot][0] = dady;
+      info->a0[slot][0] = 0.5 - (dadx * x0 + dady * y0);
 
-      if (!perspective_proj) {
-         /* Divide coefficients by vertex.w here.
-          *
-          * It would be clearer to always multiply by w0 above and
-          * then divide it out for perspective projection here, but
-          * doing it this way involves less algebra.
-          */
-         float w0 = info->v0[0][3];
-         point->inputs.dadx[slot][0] *= w0;
-         point->inputs.dady[slot][0] *= w0;
-         point->inputs.a0[slot][0] *= w0;
+      if (perspective) {
+         info->dadx[slot][0] *= w0;
+         info->dady[slot][0] *= w0;
+         info->a0[slot][0] *= w0;
       }
    }
    else if (i == 1) {
@@ -115,26 +138,25 @@ texcoord_coef(struct lp_setup_context *setup,
          dady = -dady;
       }
 
-      point->inputs.dadx[slot][1] = dadx;
-      point->inputs.dady[slot][1] = dady;
-      point->inputs.a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
+      info->dadx[slot][1] = dadx;
+      info->dady[slot][1] = dady;
+      info->a0[slot][1] = 0.5 - (dadx * x0 + dady * y0);
 
-      if (!perspective_proj) {
-         float w0 = info->v0[0][3];
-         point->inputs.dadx[slot][1] *= w0;
-         point->inputs.dady[slot][1] *= w0;
-         point->inputs.a0[slot][1] *= w0;
+      if (perspective) {
+         info->dadx[slot][1] *= w0;
+         info->dady[slot][1] *= w0;
+         info->a0[slot][1] *= w0;
       }
    }
    else if (i == 2) {
-      point->inputs.a0[slot][2] = 0.0f;
-      point->inputs.dadx[slot][2] = 0.0f;
-      point->inputs.dady[slot][2] = 0.0f;
+      info->a0[slot][2] = 0.0f;
+      info->dadx[slot][2] = 0.0f;
+      info->dady[slot][2] = 0.0f;
    }
    else {
-      point->inputs.a0[slot][3] = 1.0f;
-      point->inputs.dadx[slot][3] = 0.0f;
-      point->inputs.dady[slot][3] = 0.0f;
+      info->a0[slot][3] = perspective ? w0 : 1.0f;
+      info->dadx[slot][3] = 0.0f;
+      info->dady[slot][3] = 0.0f;
    }
 }
 
@@ -147,33 +169,32 @@ texcoord_coef(struct lp_setup_context *setup,
  */
 static void
 setup_point_fragcoord_coef(struct lp_setup_context *setup,
-                           struct lp_rast_triangle *point,
-                           const struct point_info *info,
+                           struct point_info *info,
                            unsigned slot,
                            unsigned usage_mask)
 {
    /*X*/
    if (usage_mask & TGSI_WRITEMASK_X) {
-      point->inputs.a0[slot][0] = 0.0;
-      point->inputs.dadx[slot][0] = 1.0;
-      point->inputs.dady[slot][0] = 0.0;
+      info->a0[slot][0] = 0.0;
+      info->dadx[slot][0] = 1.0;
+      info->dady[slot][0] = 0.0;
    }
 
    /*Y*/
    if (usage_mask & TGSI_WRITEMASK_Y) {
-      point->inputs.a0[slot][1] = 0.0;
-      point->inputs.dadx[slot][1] = 0.0;
-      point->inputs.dady[slot][1] = 1.0;
+      info->a0[slot][1] = 0.0;
+      info->dadx[slot][1] = 0.0;
+      info->dady[slot][1] = 1.0;
    }
 
    /*Z*/
    if (usage_mask & TGSI_WRITEMASK_Z) {
-      constant_coef(setup, point, slot, info->v0[0][2], 2);
+      constant_coef(setup, info, slot, info->v0[0][2], 2);
    }
 
    /*W*/
    if (usage_mask & TGSI_WRITEMASK_W) {
-      constant_coef(setup, point, slot, info->v0[0][3], 3);
+      constant_coef(setup, info, slot, info->v0[0][3], 3);
    }
 }
 
@@ -183,21 +204,27 @@ setup_point_fragcoord_coef(struct lp_setup_context *setup,
  */
 static void   
 setup_point_coefficients( struct lp_setup_context *setup,
-                          struct lp_rast_triangle *point,
-                          const struct point_info *info)
+                          struct point_info *info)
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    const struct lp_fragment_shader *shader = setup->fs.current.variant->shader;
    unsigned fragcoord_usage_mask = TGSI_WRITEMASK_XYZ;
    unsigned slot;
 
    /* setup interpolation for all the remaining attributes:
     */
-   for (slot = 0; slot < setup->fs.nr_inputs; slot++) {
-      unsigned vert_attr = setup->fs.input[slot].src_index;
-      unsigned usage_mask = setup->fs.input[slot].usage_mask;
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned vert_attr = key->inputs[slot].src_index;
+      unsigned usage_mask = key->inputs[slot].usage_mask;
+      enum lp_interp interp = key->inputs[slot].interp;
+      boolean perspective = !!(interp == LP_INTERP_PERSPECTIVE);
       unsigned i;
+
+      if (perspective & usage_mask) {
+         fragcoord_usage_mask |= TGSI_WRITEMASK_W;
+      }
       
-      switch (setup->fs.input[slot].interp) {
+      switch (interp) {
       case LP_INTERP_POSITION:
          /*
           * The generated pixel interpolators will pick up the coeffs from
@@ -210,39 +237,45 @@ setup_point_coefficients( struct lp_setup_context *setup,
       case LP_INTERP_LINEAR:
          /* Sprite tex coords may use linear interpolation someday */
          /* fall-through */
-
       case LP_INTERP_PERSPECTIVE:
          /* check if the sprite coord flag is set for this attribute.
           * If so, set it up so it up so x and y vary from 0 to 1.
           */
-         if (shader->info.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
-            const int index = shader->info.input_semantic_index[slot];
+         if (shader->info.base.input_semantic_name[slot] == TGSI_SEMANTIC_GENERIC) {
+            unsigned semantic_index = shader->info.base.input_semantic_index[slot];
             /* Note that sprite_coord enable is a bitfield of
              * PIPE_MAX_SHADER_OUTPUTS bits.
              */
-            if (index < PIPE_MAX_SHADER_OUTPUTS &&
-                (setup->sprite_coord_enable & (1 << index))) {
-               for (i = 0; i < NUM_CHANNELS; i++)
-                  if (usage_mask & (1 << i))
-                     texcoord_coef(setup, point, info, slot + 1, i,
+            if (semantic_index < PIPE_MAX_SHADER_OUTPUTS &&
+                (setup->sprite_coord_enable & (1 << semantic_index))) {
+               for (i = 0; i < NUM_CHANNELS; i++) {
+                  if (usage_mask & (1 << i)) {
+                     texcoord_coef(setup, info, slot + 1, i,
                                    setup->sprite_coord_origin,
-                                   (usage_mask & TGSI_WRITEMASK_W));
-               fragcoord_usage_mask |= TGSI_WRITEMASK_W;
-               break;                     
+                                   perspective);
+                  }
+               }
+               break;
             }
          }
-         /* FALLTHROUGH */
+         /* fall-through */
       case LP_INTERP_CONSTANT:
          for (i = 0; i < NUM_CHANNELS; i++) {
-            if (usage_mask & (1 << i))
-               constant_coef(setup, point, slot+1, info->v0[vert_attr][i], i);
+            if (usage_mask & (1 << i)) {
+               if (perspective) {
+                  point_persp_coeff(setup, info, slot+1, i);
+               }
+               else {
+                  constant_coef(setup, info, slot+1, info->v0[vert_attr][i], i);
+               }
+            }
          }
          break;
 
       case LP_INTERP_FACING:
          for (i = 0; i < NUM_CHANNELS; i++)
             if (usage_mask & (1 << i))
-               constant_coef(setup, point, slot+1, 1.0, i);
+               constant_coef(setup, info, slot+1, 1.0, i);
          break;
 
       default:
@@ -253,7 +286,7 @@ setup_point_coefficients( struct lp_setup_context *setup,
 
    /* The internal position input is in slot zero:
     */
-   setup_point_fragcoord_coef(setup, point, info, 0,
+   setup_point_fragcoord_coef(setup, info, 0,
                               fragcoord_usage_mask);
 }
 
@@ -270,6 +303,7 @@ try_setup_point( struct lp_setup_context *setup,
                  const float (*v0)[4] )
 {
    /* x/y positions in fixed point */
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    const int sizeAttr = setup->psize;
    const float size
       = (setup->point_size_per_vertex && sizeAttr > 0) ? v0[sizeAttr][0]
@@ -321,7 +355,7 @@ try_setup_point( struct lp_setup_context *setup,
    u_rect_find_intersection(&setup->draw_region, &bbox);
 
    point = lp_setup_alloc_triangle(scene,
-                                   setup->fs.nr_inputs,
+                                   key->num_inputs,
                                    nr_planes,
                                    &bytes);
    if (!point)
@@ -337,40 +371,40 @@ try_setup_point( struct lp_setup_context *setup,
    info.dx12 = fixed_width;
    info.dy01 = fixed_width;
    info.dy12 = 0;
+   info.a0 = GET_A0(&point->inputs);
+   info.dadx = GET_DADX(&point->inputs);
+   info.dady = GET_DADY(&point->inputs);
    
    /* Setup parameter interpolants:
     */
-   setup_point_coefficients(setup, point, &info);
+   setup_point_coefficients(setup, &info);
 
-   point->inputs.facing = 1.0F;
-   point->inputs.state = setup->fs.stored;
+   point->inputs.frontfacing = TRUE;
    point->inputs.disable = FALSE;
    point->inputs.opaque = FALSE;
 
    {
-      point->plane[0].dcdx = -1;
-      point->plane[0].dcdy = 0;
-      point->plane[0].c = 1-bbox.x0;
-      point->plane[0].ei = 0;
-      point->plane[0].eo = 1;
-
-      point->plane[1].dcdx = 1;
-      point->plane[1].dcdy = 0;
-      point->plane[1].c = bbox.x1+1;
-      point->plane[1].ei = -1;
-      point->plane[1].eo = 0;
-
-      point->plane[2].dcdx = 0;
-      point->plane[2].dcdy = 1;
-      point->plane[2].c = 1-bbox.y0;
-      point->plane[2].ei = 0;
-      point->plane[2].eo = 1;
-
-      point->plane[3].dcdx = 0;
-      point->plane[3].dcdy = -1;
-      point->plane[3].c = bbox.y1+1;
-      point->plane[3].ei = -1;
-      point->plane[3].eo = 0;
+      struct lp_rast_plane *plane = GET_PLANES(point);
+
+      plane[0].dcdx = -1;
+      plane[0].dcdy = 0;
+      plane[0].c = 1-bbox.x0;
+      plane[0].eo = 1;
+
+      plane[1].dcdx = 1;
+      plane[1].dcdy = 0;
+      plane[1].c = bbox.x1+1;
+      plane[1].eo = 0;
+
+      plane[2].dcdx = 0;
+      plane[2].dcdy = 1;
+      plane[2].c = 1-bbox.y0;
+      plane[2].eo = 1;
+
+      plane[3].dcdx = 0;
+      plane[3].dcdy = -1;
+      plane[3].c = bbox.y1+1;
+      plane[3].eo = 0;
    }
 
    return lp_setup_bin_triangle(setup, point, &bbox, nr_planes);
@@ -383,10 +417,11 @@ lp_setup_point(struct lp_setup_context *setup,
 {
    if (!try_setup_point( setup, v0 ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
       if (!try_setup_point( setup, v0 ))
-         assert(0);
+         return;
    }
 }
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_tri.c b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
index 9016bb8e249..4ab0b72a574 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_tri.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_tri.c
@@ -32,15 +32,18 @@
 #include "util/u_math.h"
 #include "util/u_memory.h"
 #include "util/u_rect.h"
+#include "util/u_sse.h"
 #include "lp_perf.h"
 #include "lp_setup_context.h"
-#include "lp_setup_coef.h"
 #include "lp_rast.h"
 #include "lp_state_fs.h"
+#include "lp_state_setup.h"
 
 #define NUM_CHANNELS 4
 
-
+#if defined(PIPE_ARCH_SSE)
+#include <emmintrin.h>
+#endif
    
 static INLINE int
 subpixel_snap(float a)
@@ -65,7 +68,7 @@ fixed_to_float(int a)
  * immediately after it.
  * The memory is allocated from the per-scene pool, not per-tile.
  * \param tri_size  returns number of bytes allocated
- * \param nr_inputs  number of fragment shader inputs
+ * \param num_inputs  number of fragment shader inputs
  * \return pointer to triangle space
  */
 struct lp_rast_triangle *
@@ -75,22 +78,23 @@ lp_setup_alloc_triangle(struct lp_scene *scene,
                         unsigned *tri_size)
 {
    unsigned input_array_sz = NUM_CHANNELS * (nr_inputs + 1) * sizeof(float);
+   unsigned plane_sz = nr_planes * sizeof(struct lp_rast_plane);
    struct lp_rast_triangle *tri;
-   unsigned tri_bytes, bytes;
-   char *inputs;
 
-   tri_bytes = align(Offset(struct lp_rast_triangle, plane[nr_planes]), 16);
-   bytes = tri_bytes + (3 * input_array_sz);
+   *tri_size = (sizeof(struct lp_rast_triangle) +
+                3 * input_array_sz +
+                plane_sz);
 
-   tri = lp_scene_alloc_aligned( scene, bytes, 16 );
+   tri = lp_scene_alloc_aligned( scene, *tri_size, 16 );
+   if (tri == NULL)
+      return NULL;
 
-   if (tri) {
-      inputs = ((char *)tri) + tri_bytes;
-      tri->inputs.a0   = (float (*)[4]) inputs;
-      tri->inputs.dadx = (float (*)[4]) (inputs + input_array_sz);
-      tri->inputs.dady = (float (*)[4]) (inputs + 2 * input_array_sz);
+   tri->inputs.stride = input_array_sz;
 
-      *tri_size = bytes;
+   {
+      char *a = (char *)tri;
+      char *b = (char *)&GET_PLANES(tri)[nr_planes];
+      assert(b - a == *tri_size);
    }
 
    return tri;
@@ -101,25 +105,26 @@ lp_setup_print_vertex(struct lp_setup_context *setup,
                       const char *name,
                       const float (*v)[4])
 {
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    int i, j;
 
    debug_printf("   wpos (%s[0]) xyzw %f %f %f %f\n",
                 name,
                 v[0][0], v[0][1], v[0][2], v[0][3]);
 
-   for (i = 0; i < setup->fs.nr_inputs; i++) {
-      const float *in = v[setup->fs.input[i].src_index];
+   for (i = 0; i < key->num_inputs; i++) {
+      const float *in = v[key->inputs[i].src_index];
 
       debug_printf("  in[%d] (%s[%d]) %s%s%s%s ",
                    i, 
-                   name, setup->fs.input[i].src_index,
-                   (setup->fs.input[i].usage_mask & 0x1) ? "x" : " ",
-                   (setup->fs.input[i].usage_mask & 0x2) ? "y" : " ",
-                   (setup->fs.input[i].usage_mask & 0x4) ? "z" : " ",
-                   (setup->fs.input[i].usage_mask & 0x8) ? "w" : " ");
+                   name, key->inputs[i].src_index,
+                   (key->inputs[i].usage_mask & 0x1) ? "x" : " ",
+                   (key->inputs[i].usage_mask & 0x2) ? "y" : " ",
+                   (key->inputs[i].usage_mask & 0x4) ? "z" : " ",
+                   (key->inputs[i].usage_mask & 0x8) ? "w" : " ");
 
       for (j = 0; j < 4; j++)
-         if (setup->fs.input[i].usage_mask & (1<<j))
+         if (key->inputs[i].usage_mask & (1<<j))
             debug_printf("%.5f ", in[j]);
 
       debug_printf("\n");
@@ -200,14 +205,16 @@ lp_setup_whole_tile(struct lp_setup_context *setup,
       }
 
       LP_COUNT(nr_shade_opaque_64);
-      return lp_scene_bin_command( scene, tx, ty,
-                                   LP_RAST_OP_SHADE_TILE_OPAQUE,
-                                   lp_rast_arg_inputs(inputs) );
+      return lp_scene_bin_cmd_with_state( scene, tx, ty,
+                                          setup->fs.stored,
+                                          LP_RAST_OP_SHADE_TILE_OPAQUE,
+                                          lp_rast_arg_inputs(inputs) );
    } else {
       LP_COUNT(nr_shade_64);
-      return lp_scene_bin_command( scene, tx, ty,
-                                   LP_RAST_OP_SHADE_TILE,
-                                   lp_rast_arg_inputs(inputs) );
+      return lp_scene_bin_cmd_with_state( scene, tx, ty,
+                                          setup->fs.stored, 
+                                          LP_RAST_OP_SHADE_TILE,
+                                          lp_rast_arg_inputs(inputs) );
    }
 }
 
@@ -225,13 +232,13 @@ do_triangle_ccw(struct lp_setup_context *setup,
 		boolean frontfacing )
 {
    struct lp_scene *scene = setup->scene;
+   const struct lp_setup_variant_key *key = &setup->setup.variant->key;
    struct lp_rast_triangle *tri;
-   int x[3];
-   int y[3];
-   int area;
+   struct lp_rast_plane *plane;
+   int x[4];
+   int y[4];
    struct u_rect bbox;
    unsigned tri_bytes;
-   int i;
    int nr_planes = 3;
 
    if (0)
@@ -248,10 +255,12 @@ do_triangle_ccw(struct lp_setup_context *setup,
    x[0] = subpixel_snap(v0[0][0] - setup->pixel_offset);
    x[1] = subpixel_snap(v1[0][0] - setup->pixel_offset);
    x[2] = subpixel_snap(v2[0][0] - setup->pixel_offset);
+   x[3] = 0;
    y[0] = subpixel_snap(v0[0][1] - setup->pixel_offset);
    y[1] = subpixel_snap(v1[0][1] - setup->pixel_offset);
    y[2] = subpixel_snap(v2[0][1] - setup->pixel_offset);
-
+   y[3] = 0;
+   
 
    /* Bounding rectangle (in pixels) */
    {
@@ -289,13 +298,13 @@ do_triangle_ccw(struct lp_setup_context *setup,
    u_rect_find_intersection(&setup->draw_region, &bbox);
 
    tri = lp_setup_alloc_triangle(scene,
-                                 setup->fs.nr_inputs,
+                                 key->num_inputs,
                                  nr_planes,
                                  &tri_bytes);
    if (!tri)
       return FALSE;
 
-#ifdef DEBUG
+#if 0
    tri->v[0][0] = v0[0][0];
    tri->v[1][0] = v1[0][0];
    tri->v[2][0] = v2[0][0];
@@ -304,92 +313,172 @@ do_triangle_ccw(struct lp_setup_context *setup,
    tri->v[2][1] = v2[0][1];
 #endif
 
-   tri->plane[0].dcdy = x[0] - x[1];
-   tri->plane[1].dcdy = x[1] - x[2];
-   tri->plane[2].dcdy = x[2] - x[0];
-
-   tri->plane[0].dcdx = y[0] - y[1];
-   tri->plane[1].dcdx = y[1] - y[2];
-   tri->plane[2].dcdx = y[2] - y[0];
-
-   area = (tri->plane[0].dcdy * tri->plane[2].dcdx -
-           tri->plane[2].dcdy * tri->plane[0].dcdx);
-
    LP_COUNT(nr_tris);
 
-   /* Cull non-ccw and zero-sized triangles. 
-    *
-    * XXX: subject to overflow??
-    */
-   if (area <= 0) {
-      lp_scene_putback_data( scene, tri_bytes );
-      LP_COUNT(nr_culled_tris);
-      return TRUE;
-   }
-
    /* Setup parameter interpolants:
     */
-   lp_setup_tri_coef( setup, &tri->inputs, v0, v1, v2, frontfacing );
-
-   tri->inputs.facing = frontfacing ? 1.0F : -1.0F;
+   setup->setup.variant->jit_function( v0,
+				       v1,
+				       v2,
+				       frontfacing,
+				       GET_A0(&tri->inputs),
+				       GET_DADX(&tri->inputs),
+				       GET_DADY(&tri->inputs) );
+
+   tri->inputs.frontfacing = frontfacing;
    tri->inputs.disable = FALSE;
    tri->inputs.opaque = setup->fs.current.variant->opaque;
-   tri->inputs.state = setup->fs.stored;
 
-  
-   for (i = 0; i < 3; i++) {
-      struct lp_rast_plane *plane = &tri->plane[i];
+   if (0)
+      lp_dump_setup_coef(&setup->setup.variant->key,
+			 (const float (*)[4])GET_A0(&tri->inputs),
+			 (const float (*)[4])GET_DADX(&tri->inputs),
+			 (const float (*)[4])GET_DADY(&tri->inputs));
+
+   plane = GET_PLANES(tri);
+
+#if defined(PIPE_ARCH_SSE)
+   {
+      __m128i vertx, verty;
+      __m128i shufx, shufy;
+      __m128i dcdx, dcdy, c;
+      __m128i unused;
+      __m128i dcdx_neg_mask;
+      __m128i dcdy_neg_mask;
+      __m128i dcdx_zero_mask;
+      __m128i top_left_flag;
+      __m128i c_inc_mask, c_inc;
+      __m128i eo, p0, p1, p2;
+      __m128i zero = _mm_setzero_si128();
+
+      vertx = _mm_loadu_si128((__m128i *)x); /* vertex x coords */
+      verty = _mm_loadu_si128((__m128i *)y); /* vertex y coords */
+
+      shufx = _mm_shuffle_epi32(vertx, _MM_SHUFFLE(3,0,2,1));
+      shufy = _mm_shuffle_epi32(verty, _MM_SHUFFLE(3,0,2,1));
+
+      dcdx = _mm_sub_epi32(verty, shufy);
+      dcdy = _mm_sub_epi32(vertx, shufx);
+
+      dcdx_neg_mask = _mm_srai_epi32(dcdx, 31);
+      dcdx_zero_mask = _mm_cmpeq_epi32(dcdx, zero);
+      dcdy_neg_mask = _mm_srai_epi32(dcdy, 31);
+
+      top_left_flag = _mm_set1_epi32((setup->pixel_offset == 0) ? ~0 : 0);
 
-      /* half-edge constants, will be interated over the whole render
-       * target.
+      c_inc_mask = _mm_or_si128(dcdx_neg_mask,
+                                _mm_and_si128(dcdx_zero_mask,
+                                              _mm_xor_si128(dcdy_neg_mask,
+                                                            top_left_flag)));
+
+      c_inc = _mm_srli_epi32(c_inc_mask, 31);
+
+      c = _mm_sub_epi32(mm_mullo_epi32(dcdx, vertx),
+                        mm_mullo_epi32(dcdy, verty));
+
+      c = _mm_add_epi32(c, c_inc);
+
+      /* Scale up to match c:
        */
-      plane->c = plane->dcdx * x[i] - plane->dcdy * y[i];
-
-      /* correct for top-left vs. bottom-left fill convention.  
-       *
-       * note that we're overloading gl_rasterization_rules to mean
-       * both (0.5,0.5) pixel centers *and* bottom-left filling
-       * convention.
-       *
-       * GL actually has a top-left filling convention, but GL's
-       * notion of "top" differs from gallium's...
-       *
-       * Also, sometimes (in FBO cases) GL will render upside down
-       * to its usual method, in which case it will probably want
-       * to use the opposite, top-left convention.
-       */         
-      if (plane->dcdx < 0) {
-         /* both fill conventions want this - adjust for left edges */
-         plane->c++;            
-      }
-      else if (plane->dcdx == 0) {
-         if (setup->pixel_offset == 0) {
-            /* correct for top-left fill convention:
-             */
-            if (plane->dcdy > 0) plane->c++;
+      dcdx = _mm_slli_epi32(dcdx, FIXED_ORDER);
+      dcdy = _mm_slli_epi32(dcdy, FIXED_ORDER);
+
+      /* Calculate trivial reject values:
+       */
+      eo = _mm_sub_epi32(_mm_andnot_si128(dcdy_neg_mask, dcdy),
+                         _mm_and_si128(dcdx_neg_mask, dcdx));
+
+      /* ei = _mm_sub_epi32(_mm_sub_epi32(dcdy, dcdx), eo); */
+
+      /* Pointless transpose which gets undone immediately in
+       * rasterization:
+       */
+      transpose4_epi32(&c, &dcdx, &dcdy, &eo,
+                       &p0, &p1, &p2, &unused);
+
+      _mm_store_si128((__m128i *)&plane[0], p0);
+      _mm_store_si128((__m128i *)&plane[1], p1);
+      _mm_store_si128((__m128i *)&plane[2], p2);
+   }
+#else
+   {
+      int i;
+      plane[0].dcdy = x[0] - x[1];
+      plane[1].dcdy = x[1] - x[2];
+      plane[2].dcdy = x[2] - x[0];
+      plane[0].dcdx = y[0] - y[1];
+      plane[1].dcdx = y[1] - y[2];
+      plane[2].dcdx = y[2] - y[0];
+  
+      for (i = 0; i < 3; i++) {
+         /* half-edge constants, will be interated over the whole render
+          * target.
+          */
+         plane[i].c = plane[i].dcdx * x[i] - plane[i].dcdy * y[i];
+
+         /* correct for top-left vs. bottom-left fill convention.  
+          *
+          * note that we're overloading gl_rasterization_rules to mean
+          * both (0.5,0.5) pixel centers *and* bottom-left filling
+          * convention.
+          *
+          * GL actually has a top-left filling convention, but GL's
+          * notion of "top" differs from gallium's...
+          *
+          * Also, sometimes (in FBO cases) GL will render upside down
+          * to its usual method, in which case it will probably want
+          * to use the opposite, top-left convention.
+          */         
+         if (plane[i].dcdx < 0) {
+            /* both fill conventions want this - adjust for left edges */
+            plane[i].c++;            
          }
-         else {
-            /* correct for bottom-left fill convention:
-             */
-            if (plane->dcdy < 0) plane->c++;
+         else if (plane[i].dcdx == 0) {
+            if (setup->pixel_offset == 0) {
+               /* correct for top-left fill convention:
+                */
+               if (plane[i].dcdy > 0) plane[i].c++;
+            }
+            else {
+               /* correct for bottom-left fill convention:
+                */
+               if (plane[i].dcdy < 0) plane[i].c++;
+            }
          }
-      }
 
-      plane->dcdx *= FIXED_ONE;
-      plane->dcdy *= FIXED_ONE;
+         plane[i].dcdx *= FIXED_ONE;
+         plane[i].dcdy *= FIXED_ONE;
 
-      /* find trivial reject offsets for each edge for a single-pixel
-       * sized block.  These will be scaled up at each recursive level to
-       * match the active blocksize.  Scaling in this way works best if
-       * the blocks are square.
-       */
-      plane->eo = 0;
-      if (plane->dcdx < 0) plane->eo -= plane->dcdx;
-      if (plane->dcdy > 0) plane->eo += plane->dcdy;
+         /* find trivial reject offsets for each edge for a single-pixel
+          * sized block.  These will be scaled up at each recursive level to
+          * match the active blocksize.  Scaling in this way works best if
+          * the blocks are square.
+          */
+         plane[i].eo = 0;
+         if (plane[i].dcdx < 0) plane[i].eo -= plane[i].dcdx;
+         if (plane[i].dcdy > 0) plane[i].eo += plane[i].dcdy;
+      }
+   }
+#endif
 
-      /* Calculate trivial accept offsets from the above.
-       */
-      plane->ei = plane->dcdy - plane->dcdx - plane->eo;
+   if (0) {
+      debug_printf("p0: %08x/%08x/%08x/%08x\n",
+                   plane[0].c,
+                   plane[0].dcdx,
+                   plane[0].dcdy,
+                   plane[0].eo);
+      
+      debug_printf("p1: %08x/%08x/%08x/%08x\n",
+                   plane[1].c,
+                   plane[1].dcdx,
+                   plane[1].dcdy,
+                   plane[1].eo);
+      
+      debug_printf("p0: %08x/%08x/%08x/%08x\n",
+                   plane[2].c,
+                   plane[2].dcdx,
+                   plane[2].dcdy,
+                   plane[2].eo);
    }
 
 
@@ -412,29 +501,25 @@ do_triangle_ccw(struct lp_setup_context *setup,
     * these planes elsewhere.
     */
    if (nr_planes == 7) {
-      tri->plane[3].dcdx = -1;
-      tri->plane[3].dcdy = 0;
-      tri->plane[3].c = 1-bbox.x0;
-      tri->plane[3].ei = 0;
-      tri->plane[3].eo = 1;
-
-      tri->plane[4].dcdx = 1;
-      tri->plane[4].dcdy = 0;
-      tri->plane[4].c = bbox.x1+1;
-      tri->plane[4].ei = -1;
-      tri->plane[4].eo = 0;
-
-      tri->plane[5].dcdx = 0;
-      tri->plane[5].dcdy = 1;
-      tri->plane[5].c = 1-bbox.y0;
-      tri->plane[5].ei = 0;
-      tri->plane[5].eo = 1;
-
-      tri->plane[6].dcdx = 0;
-      tri->plane[6].dcdy = -1;
-      tri->plane[6].c = bbox.y1+1;
-      tri->plane[6].ei = -1;
-      tri->plane[6].eo = 0;
+      plane[3].dcdx = -1;
+      plane[3].dcdy = 0;
+      plane[3].c = 1-bbox.x0;
+      plane[3].eo = 1;
+
+      plane[4].dcdx = 1;
+      plane[4].dcdy = 0;
+      plane[4].c = bbox.x1+1;
+      plane[4].eo = 0;
+
+      plane[5].dcdx = 0;
+      plane[5].dcdy = 1;
+      plane[5].c = 1-bbox.y0;
+      plane[5].eo = 1;
+
+      plane[6].dcdx = 0;
+      plane[6].dcdy = -1;
+      plane[6].c = bbox.y1+1;
+      plane[6].eo = 0;
    }
 
    return lp_setup_bin_triangle( setup, tri, &bbox, nr_planes );
@@ -487,51 +572,58 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
    int sz = floor_pot((bbox->x1 - (bbox->x0 & ~3)) |
 		      (bbox->y1 - (bbox->y0 & ~3)));
 
-   if (nr_planes == 3) {
-      if (sz < 4 && dx < 64)
-      {
-	 /* Triangle is contained in a single 4x4 stamp:
-	  */
-	 int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
-	 return lp_scene_bin_command( scene,
-				      bbox->x0/64, bbox->y0/64,
-				      LP_RAST_OP_TRIANGLE_3_4,
-				      lp_rast_arg_triangle(tri, mask) );
-      }
-
-      if (sz < 16 && dx < 64)
-      {
-	 int mask = (bbox->x0 & 63 & ~3) | ((bbox->y0 & 63 & ~3) << 8);
-
-	 /* Triangle is contained in a single 16x16 block:
-	  */
-	 return lp_scene_bin_command( scene,
-				      bbox->x0/64, bbox->y0/64,
-                                      LP_RAST_OP_TRIANGLE_3_16,
-                                      lp_rast_arg_triangle(tri, mask) );
-      }
-   }
-
-
    /* Determine which tile(s) intersect the triangle's bounding box
     */
    if (dx < TILE_SIZE)
    {
       int ix0 = bbox->x0 / TILE_SIZE;
       int iy0 = bbox->y0 / TILE_SIZE;
+      int px = bbox->x0 & 63 & ~3;
+      int py = bbox->y0 & 63 & ~3;
+      int mask = px | (py << 8);
 
       assert(iy0 == bbox->y1 / TILE_SIZE &&
 	     ix0 == bbox->x1 / TILE_SIZE);
 
+      if (nr_planes == 3) {
+         if (sz < 4)
+         {
+            /* Triangle is contained in a single 4x4 stamp:
+             */
+            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+                                                setup->fs.stored,
+                                                LP_RAST_OP_TRIANGLE_3_4,
+                                                lp_rast_arg_triangle(tri, mask) );
+         }
+
+         if (sz < 16)
+         {
+            /* Triangle is contained in a single 16x16 block:
+             */
+            return lp_scene_bin_cmd_with_state( scene, ix0, iy0,
+                                                setup->fs.stored,
+                                                LP_RAST_OP_TRIANGLE_3_16,
+                                                lp_rast_arg_triangle(tri, mask) );
+         }
+      }
+      else if (nr_planes == 4 && sz < 16) 
+      {
+         return lp_scene_bin_cmd_with_state(scene, ix0, iy0,
+                                            setup->fs.stored,
+                                            LP_RAST_OP_TRIANGLE_4_16,
+                                            lp_rast_arg_triangle(tri, mask) );
+      }
+
+
       /* Triangle is contained in a single tile:
        */
-      return lp_scene_bin_command( scene, ix0, iy0,
-                                   lp_rast_tri_tab[nr_planes], 
-                                   lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
+      return lp_scene_bin_cmd_with_state( scene, ix0, iy0, setup->fs.stored,
+                                          lp_rast_tri_tab[nr_planes], 
+                                          lp_rast_arg_triangle(tri, (1<<nr_planes)-1) );
    }
    else
    {
+      struct lp_rast_plane *plane = GET_PLANES(tri);
       int c[MAX_PLANES];
       int ei[MAX_PLANES];
       int eo[MAX_PLANES];
@@ -545,14 +637,17 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
       int iy1 = bbox->y1 / TILE_SIZE;
       
       for (i = 0; i < nr_planes; i++) {
-         c[i] = (tri->plane[i].c + 
-                 tri->plane[i].dcdy * iy0 * TILE_SIZE - 
-                 tri->plane[i].dcdx * ix0 * TILE_SIZE);
-
-         ei[i] = tri->plane[i].ei << TILE_ORDER;
-         eo[i] = tri->plane[i].eo << TILE_ORDER;
-         xstep[i] = -(tri->plane[i].dcdx << TILE_ORDER);
-         ystep[i] = tri->plane[i].dcdy << TILE_ORDER;
+         c[i] = (plane[i].c + 
+                 plane[i].dcdy * iy0 * TILE_SIZE - 
+                 plane[i].dcdx * ix0 * TILE_SIZE);
+
+         ei[i] = (plane[i].dcdy - 
+                  plane[i].dcdx - 
+                  plane[i].eo) << TILE_ORDER;
+
+         eo[i] = plane[i].eo << TILE_ORDER;
+         xstep[i] = -(plane[i].dcdx << TILE_ORDER);
+         ystep[i] = plane[i].dcdy << TILE_ORDER;
       }
 
 
@@ -594,9 +689,11 @@ lp_setup_bin_triangle( struct lp_setup_context *setup,
                 */
                int count = util_bitcount(partial);
                in = TRUE;
-               if (!lp_scene_bin_command( scene, x, y,
-                                          lp_rast_tri_tab[count], 
-                                          lp_rast_arg_triangle(tri, partial) ))
+               
+               if (!lp_scene_bin_cmd_with_state( scene, x, y,
+                                                 setup->fs.stored,
+                                                 lp_rast_tri_tab[count], 
+                                                 lp_rast_arg_triangle(tri, partial) ))
                   goto fail;
 
                LP_COUNT(nr_partially_covered_64);
@@ -635,40 +732,62 @@ fail:
 
 
 /**
- * Draw triangle if it's CW, cull otherwise.
+ * Try to draw the triangle, restart the scene on failure.
  */
-static void triangle_cw( struct lp_setup_context *setup,
-			 const float (*v0)[4],
-			 const float (*v1)[4],
-			 const float (*v2)[4] )
+static void retry_triangle_ccw( struct lp_setup_context *setup,
+                                const float (*v0)[4],
+                                const float (*v1)[4],
+                                const float (*v2)[4],
+                                boolean front)
 {
-   if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
+   if (!do_triangle_ccw( setup, v0, v1, v2, front ))
    {
-      lp_setup_flush_and_restart(setup);
+      if (!lp_setup_flush_and_restart(setup))
+         return;
 
-      if (!do_triangle_ccw( setup, v1, v0, v2, !setup->ccw_is_frontface ))
-         assert(0);
+      if (!do_triangle_ccw( setup, v0, v1, v2, front ))
+         return;
    }
 }
 
+static INLINE float
+calc_area(const float (*v0)[4],
+          const float (*v1)[4],
+          const float (*v2)[4])
+{
+   float dx01 = v0[0][0] - v1[0][0];
+   float dy01 = v0[0][1] - v1[0][1];
+   float dx20 = v2[0][0] - v0[0][0];
+   float dy20 = v2[0][1] - v0[0][1];
+   return dx01 * dy20 - dx20 * dy01;
+}
+
 
 /**
- * Draw triangle if it's CCW, cull otherwise.
+ * Draw triangle if it's CW, cull otherwise.
  */
-static void triangle_ccw( struct lp_setup_context *setup,
+static void triangle_cw( struct lp_setup_context *setup,
 			 const float (*v0)[4],
 			 const float (*v1)[4],
 			 const float (*v2)[4] )
 {
-   if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
-   {
-      lp_setup_flush_and_restart(setup);
-      if (!do_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface ))
-         assert(0);
-   }
+   float area = calc_area(v0, v1, v2);
+
+   if (area < 0.0f) 
+      retry_triangle_ccw(setup, v0, v2, v1, !setup->ccw_is_frontface);
 }
 
 
+static void triangle_ccw( struct lp_setup_context *setup,
+                          const float (*v0)[4],
+                          const float (*v1)[4],
+                          const float (*v2)[4])
+{
+   float area = calc_area(v0, v1, v2);
+
+   if (area > 0.0f) 
+      retry_triangle_ccw(setup, v0, v1, v2, setup->ccw_is_frontface);
+}
 
 /**
  * Draw triangle whether it's CW or CCW.
@@ -678,18 +797,12 @@ static void triangle_both( struct lp_setup_context *setup,
 			   const float (*v1)[4],
 			   const float (*v2)[4] )
 {
-   /* edge vectors e = v0 - v2, f = v1 - v2 */
-   const float ex = v0[0][0] - v2[0][0];
-   const float ey = v0[0][1] - v2[0][1];
-   const float fx = v1[0][0] - v2[0][0];
-   const float fy = v1[0][1] - v2[0][1];
-
-   /* det = cross(e,f).z */
-   const float det = ex * fy - ey * fx;
-   if (det < 0.0f) 
-      triangle_ccw( setup, v0, v1, v2 );
-   else if (det > 0.0f)
-      triangle_cw( setup, v0, v1, v2 );
+   float area = calc_area(v0, v1, v2);
+
+   if (area > 0.0f) 
+      retry_triangle_ccw( setup, v0, v1, v2, setup->ccw_is_frontface );
+   else if (area < 0.0f)
+      retry_triangle_ccw( setup, v0, v2, v1, !setup->ccw_is_frontface );
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
index 6308561f242..9c1f0fe7939 100644
--- a/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
+++ b/src/gallium/drivers/llvmpipe/lp_setup_vbuf.c
@@ -141,7 +141,8 @@ lp_setup_draw_elements(struct vbuf_render *vbr, const ushort *indices, uint nr)
    const boolean flatshade_first = setup->flatshade_first;
    unsigned i;
 
-   lp_setup_update_state(setup, TRUE);
+   if (!lp_setup_update_state(setup, TRUE))
+      return;
 
    switch (setup->prim) {
    case PIPE_PRIM_POINTS:
@@ -338,7 +339,8 @@ lp_setup_draw_arrays(struct vbuf_render *vbr, uint start, uint nr)
    const boolean flatshade_first = setup->flatshade_first;
    unsigned i;
 
-   lp_setup_update_state(setup, TRUE);
+   if (!lp_setup_update_state(setup, TRUE))
+      return;
 
    switch (setup->prim) {
    case PIPE_PRIM_POINTS:
diff --git a/src/gallium/drivers/llvmpipe/lp_state.h b/src/gallium/drivers/llvmpipe/lp_state.h
index 86313e1c484..7893e9cdc0c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state.h
+++ b/src/gallium/drivers/llvmpipe/lp_state.h
@@ -97,6 +97,9 @@ llvmpipe_set_framebuffer_state(struct pipe_context *,
 void
 llvmpipe_update_fs(struct llvmpipe_context *lp);
 
+void 
+llvmpipe_update_setup(struct llvmpipe_context *lp);
+
 void
 llvmpipe_update_derived(struct llvmpipe_context *llvmpipe);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_derived.c b/src/gallium/drivers/llvmpipe/lp_state_derived.c
index bb059d04599..0f5f7369e04 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_derived.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_derived.c
@@ -50,12 +50,13 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 {
    const struct lp_fragment_shader *lpfs = llvmpipe->fs;
    struct vertex_info *vinfo = &llvmpipe->vertex_info;
-   struct lp_shader_input *inputs = llvmpipe->inputs;
    unsigned vs_index;
    uint i;
 
    /*
-    * Match FS inputs against VS outputs, emitting the necessary attributes.
+    * Match FS inputs against VS outputs, emitting the necessary
+    * attributes.  Could cache these structs and look them up with a
+    * combination of fragment shader, vertex shader ids.
     */
 
    vinfo->num_attribs = 0;
@@ -66,72 +67,18 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
 
    draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
 
-   for (i = 0; i < lpfs->info.num_inputs; i++) {
+   for (i = 0; i < lpfs->info.base.num_inputs; i++) {
       /*
        * Search for each input in current vs output:
        */
 
       vs_index = draw_find_shader_output(llvmpipe->draw,
-                                         lpfs->info.input_semantic_name[i],
-                                         lpfs->info.input_semantic_index[i]);
-      if (vs_index < 0) {
-         /*
-          * This can happen with sprite coordinates - the vertex
-          * shader doesn't need to provide an output as we generate
-          * them internally.  However, lets keep pretending that there
-          * is something there to not confuse other code.
-          */
-         vs_index = 0;
-      }
-
-      /* This can be pre-computed, except for flatshade:
-       */
-      inputs[i].usage_mask = lpfs->info.input_usage_mask[i];
-
-      switch (lpfs->info.input_interpolate[i]) {
-      case TGSI_INTERPOLATE_CONSTANT:
-         inputs[i].interp = LP_INTERP_CONSTANT;
-         break;
-      case TGSI_INTERPOLATE_LINEAR:
-         inputs[i].interp = LP_INTERP_LINEAR;
-         break;
-      case TGSI_INTERPOLATE_PERSPECTIVE:
-         inputs[i].interp = LP_INTERP_PERSPECTIVE;
-         break;
-      default:
-         assert(0);
-         break;
-      }
-
-      switch (lpfs->info.input_semantic_name[i]) {
-      case TGSI_SEMANTIC_FACE:
-         inputs[i].interp = LP_INTERP_FACING;
-         break;
-      case TGSI_SEMANTIC_POSITION:
-         /* Position was already emitted above
-          */
-         inputs[i].interp = LP_INTERP_POSITION;
-         inputs[i].src_index = 0;
-         continue;
-      case TGSI_SEMANTIC_COLOR:
-         /* Colors are linearly inputs[i].interpolated in the fragment shader
-          * even when flatshading is active.  This just tells the
-          * setup module to use coefficients with ddx==0 and
-          * ddy==0.
-          */
-         if (llvmpipe->rasterizer->flatshade)
-            inputs[i].interp = LP_INTERP_CONSTANT;
-         break;
-
-      default:
-         break;
-      }
+                                         lpfs->info.base.input_semantic_name[i],
+                                         lpfs->info.base.input_semantic_index[i]);
 
       /*
        * Emit the requested fs attribute for all but position.
        */
-
-      inputs[i].src_index = vinfo->num_attribs;
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_PERSPECTIVE, vs_index);
    }
 
@@ -145,15 +92,8 @@ compute_vertex_info(struct llvmpipe_context *llvmpipe)
       draw_emit_vertex_attr(vinfo, EMIT_4F, INTERP_CONSTANT, vs_index);
    }
 
-   llvmpipe->num_inputs = lpfs->info.num_inputs;
-
    draw_compute_vertex_size(vinfo);
-
    lp_setup_set_vertex_info(llvmpipe->setup, vinfo);
-
-   lp_setup_set_fs_inputs(llvmpipe->setup,
-                          inputs,
-                          lpfs->info.num_inputs);
 }
 
 
@@ -190,6 +130,10 @@ void llvmpipe_update_derived( struct llvmpipe_context *llvmpipe )
                           LP_NEW_QUERY))
       llvmpipe_update_fs( llvmpipe );
 
+   if (llvmpipe->dirty & (LP_NEW_FS |
+			  LP_NEW_RASTERIZER))
+      llvmpipe_update_setup( llvmpipe );
+
    if (llvmpipe->dirty & LP_NEW_BLEND_COLOR)
       lp_setup_set_blend_color(llvmpipe->setup,
                                &llvmpipe->blend_color);
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index f0a15e11b9b..9fbedac165f 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -99,74 +99,12 @@
 
 
 #include <llvm-c/Analysis.h>
+#include <llvm-c/BitWriter.h>
 
 
 static unsigned fs_no = 0;
 
 
-/**
- * Generate the depth /stencil test code.
- */
-static void
-generate_depth_stencil(LLVMBuilderRef builder,
-                       const struct lp_fragment_shader_variant_key *key,
-                       struct lp_type src_type,
-                       struct lp_build_mask_context *mask,
-                       LLVMValueRef stencil_refs[2],
-                       LLVMValueRef src,
-                       LLVMValueRef dst_ptr,
-                       LLVMValueRef facing,
-                       LLVMValueRef counter)
-{
-   const struct util_format_description *format_desc;
-   struct lp_type dst_type;
-
-   if (!key->depth.enabled && !key->stencil[0].enabled && !key->stencil[1].enabled)
-      return;
-
-   format_desc = util_format_description(key->zsbuf_format);
-   assert(format_desc);
-
-   /*
-    * Depths are expected to be between 0 and 1, even if they are stored in
-    * floats. Setting these bits here will ensure that the lp_build_conv() call
-    * below won't try to unnecessarily clamp the incoming values.
-    */
-   if(src_type.floating) {
-      src_type.sign = FALSE;
-      src_type.norm = TRUE;
-   }
-   else {
-      assert(!src_type.sign);
-      assert(src_type.norm);
-   }
-
-   /* Pick the depth type. */
-   dst_type = lp_depth_type(format_desc, src_type.width*src_type.length);
-
-   /* FIXME: Cope with a depth test type with a different bit width. */
-   assert(dst_type.width == src_type.width);
-   assert(dst_type.length == src_type.length);
-
-   /* Convert fragment Z from float to integer */
-   lp_build_conv(builder, src_type, dst_type, &src, 1, &src, 1);
-
-   dst_ptr = LLVMBuildBitCast(builder,
-                              dst_ptr,
-                              LLVMPointerType(lp_build_vec_type(dst_type), 0), "");
-   lp_build_depth_stencil_test(builder,
-                               &key->depth,
-                               key->stencil,
-                               dst_type,
-                               format_desc,
-                               mask,
-                               stencil_refs,
-                               src,
-                               dst_ptr,
-                               facing,
-                               counter);
-}
-
 
 /**
  * Expand the relevent bits of mask_input to a 4-dword mask for the 
@@ -248,6 +186,26 @@ generate_quad_mask(LLVMBuilderRef builder,
 }
 
 
+#define EARLY_DEPTH_TEST  0x1
+#define LATE_DEPTH_TEST   0x2
+#define EARLY_DEPTH_WRITE 0x4
+#define LATE_DEPTH_WRITE  0x8
+
+static int
+find_output_by_semantic( const struct tgsi_shader_info *info,
+			 unsigned semantic,
+			 unsigned index )
+{
+   int i;
+
+   for (i = 0; i < info->num_outputs; i++)
+      if (info->output_semantic_name[i] == semantic &&
+	  info->output_semantic_index[i] == index)
+	 return i;
+
+   return -1;
+}
+
 
 /**
  * Generate the fragment shader, depth/stencil test, and alpha tests.
@@ -255,14 +213,13 @@ generate_quad_mask(LLVMBuilderRef builder,
  * \param partial_mask  if 1, do mask_input testing
  */
 static void
-generate_fs(struct llvmpipe_context *lp,
-            struct lp_fragment_shader *shader,
+generate_fs(struct lp_fragment_shader *shader,
             const struct lp_fragment_shader_variant_key *key,
             LLVMBuilderRef builder,
             struct lp_type type,
             LLVMValueRef context_ptr,
             unsigned i,
-            const struct lp_build_interp_soa_context *interp,
+            struct lp_build_interp_soa_context *interp,
             struct lp_build_sampler_soa *sampler,
             LLVMValueRef *pmask,
             LLVMValueRef (*color)[4],
@@ -272,18 +229,52 @@ generate_fs(struct llvmpipe_context *lp,
             LLVMValueRef mask_input,
             LLVMValueRef counter)
 {
+   const struct util_format_description *zs_format_desc = NULL;
    const struct tgsi_token *tokens = shader->base.tokens;
    LLVMTypeRef vec_type;
    LLVMValueRef consts_ptr;
    LLVMValueRef outputs[PIPE_MAX_SHADER_OUTPUTS][NUM_CHANNELS];
-   LLVMValueRef z = interp->pos[2];
+   LLVMValueRef z;
+   LLVMValueRef zs_value = NULL;
    LLVMValueRef stencil_refs[2];
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask;
-   boolean early_depth_stencil_test;
+   boolean simple_shader = (shader->info.base.file_count[TGSI_FILE_SAMPLER] == 0 &&
+                            shader->info.base.num_inputs < 3 &&
+                            shader->info.base.num_instructions < 8);
    unsigned attrib;
    unsigned chan;
    unsigned cbuf;
+   unsigned depth_mode;
+
+   if (key->depth.enabled ||
+       key->stencil[0].enabled ||
+       key->stencil[1].enabled) {
+
+      zs_format_desc = util_format_description(key->zsbuf_format);
+      assert(zs_format_desc);
+
+      if (!shader->info.base.writes_z) {
+         if (key->alpha.enabled || shader->info.base.uses_kill)
+            /* With alpha test and kill, can do the depth test early
+             * and hopefully eliminate some quads.  But need to do a
+             * special deferred depth write once the final mask value
+             * is known.
+             */
+            depth_mode = EARLY_DEPTH_TEST | LATE_DEPTH_WRITE;
+         else
+            depth_mode = EARLY_DEPTH_TEST | EARLY_DEPTH_WRITE;
+      }
+      else {
+         depth_mode = LATE_DEPTH_TEST | LATE_DEPTH_WRITE;
+      }
+
+      if (!(key->depth.enabled && key->depth.writemask) &&
+          !(key->stencil[0].enabled && key->stencil[0].writemask))
+         depth_mode &= ~(LATE_DEPTH_WRITE | EARLY_DEPTH_WRITE);
+   }
+   else {
+      depth_mode = 0;
+   }
 
    assert(i < 4);
 
@@ -294,20 +285,14 @@ generate_fs(struct llvmpipe_context *lp,
 
    consts_ptr = lp_jit_context_constants(builder, context_ptr);
 
-   flow = lp_build_flow_create(builder);
-
    memset(outputs, 0, sizeof outputs);
 
-   lp_build_flow_scope_begin(flow);
-
    /* Declare the color and z variables */
    for(cbuf = 0; cbuf < key->nr_cbufs; cbuf++) {
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-	 color[cbuf][chan] = LLVMGetUndef(vec_type);
-	 lp_build_flow_scope_declare(flow, &color[cbuf][chan]);
+	 color[cbuf][chan] = lp_build_alloca(builder, vec_type, "color");
       }
    }
-   lp_build_flow_scope_declare(flow, &z);
 
    /* do triangle edge testing */
    if (partial_mask) {
@@ -319,74 +304,126 @@ generate_fs(struct llvmpipe_context *lp,
    }
 
    /* 'mask' will control execution based on quad's pixel alive/killed state */
-   lp_build_mask_begin(&mask, flow, type, *pmask);
-
-   early_depth_stencil_test =
-      (key->depth.enabled || key->stencil[0].enabled) &&
-      !key->alpha.enabled &&
-      !shader->info.uses_kill &&
-      !shader->info.writes_z;
-
-   if (early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
+   lp_build_mask_begin(&mask, builder, type, *pmask);
+
+   if (!(depth_mode & EARLY_DEPTH_TEST) && !simple_shader)
+      lp_build_mask_check(&mask);
+
+   lp_build_interp_soa_update_pos(interp, i);
+   z = interp->pos[2];
+
+   if (depth_mode & EARLY_DEPTH_TEST) {
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+
+      if (depth_mode & EARLY_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
+   }
 
+   lp_build_interp_soa_update_inputs(interp, i);
+   
+   /* Build the actual shader */
    lp_build_tgsi_soa(builder, tokens, type, &mask,
                      consts_ptr, interp->pos, interp->inputs,
-                     outputs, sampler, &shader->info);
+                     outputs, sampler, &shader->info.base);
 
-   /* loop over fragment shader outputs/results */
-   for (attrib = 0; attrib < shader->info.num_outputs; ++attrib) {
-      for(chan = 0; chan < NUM_CHANNELS; ++chan) {
-         if(outputs[attrib][chan]) {
-            LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
-            lp_build_name(out, "output%u.%u.%c", i, attrib, "xyzw"[chan]);
-
-            switch (shader->info.output_semantic_name[attrib]) {
-            case TGSI_SEMANTIC_COLOR:
-               {
-                  unsigned cbuf = shader->info.output_semantic_index[attrib];
-
-                  lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
-
-                  /* Alpha test */
-		  /* XXX: should only test the final assignment to alpha */
-                  if (cbuf == 0 && chan == 3 && key->alpha.enabled) {
-                     LLVMValueRef alpha = out;
-                     LLVMValueRef alpha_ref_value;
-                     alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
-                     alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
-                     lp_build_alpha_test(builder, key->alpha.func, type,
-                                         &mask, alpha, alpha_ref_value);
-                  }
-
-		  color[cbuf][chan] = out;
-                  break;
-               }
-
-            case TGSI_SEMANTIC_POSITION:
-               if(chan == 2)
-                  z = out;
-               break;
-            }
-         }
+
+   /* Alpha test */
+   if (key->alpha.enabled) {
+      int color0 = find_output_by_semantic(&shader->info.base,
+                                           TGSI_SEMANTIC_COLOR,
+                                           0);
+
+      if (color0 != -1 && outputs[color0][3]) {
+         LLVMValueRef alpha = LLVMBuildLoad(builder, outputs[color0][3], "alpha");
+         LLVMValueRef alpha_ref_value;
+
+         alpha_ref_value = lp_jit_context_alpha_ref_value(builder, context_ptr);
+         alpha_ref_value = lp_build_broadcast(builder, vec_type, alpha_ref_value);
+
+         lp_build_alpha_test(builder, key->alpha.func, type,
+                             &mask, alpha, alpha_ref_value,
+                             (depth_mode & LATE_DEPTH_TEST) != 0);
       }
    }
 
-   if (!early_depth_stencil_test)
-      generate_depth_stencil(builder, key,
-                             type, &mask,
-                             stencil_refs, z, depth_ptr, facing, counter);
+   /* Late Z test */
+   if (depth_mode & LATE_DEPTH_TEST) { 
+      int pos0 = find_output_by_semantic(&shader->info.base,
+                                         TGSI_SEMANTIC_POSITION,
+                                         0);
+         
+      if (pos0 != -1 && outputs[pos0][2]) {
+         z = LLVMBuildLoad(builder, outputs[pos0][2], "output.z");
+      }
 
-   lp_build_mask_end(&mask);
+      lp_build_depth_stencil_test(builder,
+                                  &key->depth,
+                                  key->stencil,
+                                  type,
+                                  zs_format_desc,
+                                  &mask,
+                                  stencil_refs,
+                                  z,
+                                  depth_ptr, facing,
+                                  &zs_value,
+                                  !simple_shader);
+      /* Late Z write */
+      if (depth_mode & LATE_DEPTH_WRITE) {
+         lp_build_depth_write(builder, zs_format_desc, depth_ptr, zs_value);
+      }
+   }
+   else if ((depth_mode & EARLY_DEPTH_TEST) &&
+            (depth_mode & LATE_DEPTH_WRITE))
+   {
+      /* Need to apply a reduced mask to the depth write.  Reload the
+       * depth value, update from zs_value with the new mask value and
+       * write that out.
+       */
+      lp_build_deferred_depth_write(builder,
+                                    type,
+                                    zs_format_desc,
+                                    &mask,
+                                    depth_ptr,
+                                    zs_value);
+   }
 
-   lp_build_flow_scope_end(flow);
 
-   lp_build_flow_destroy(flow);
+   /* Color write  */
+   for (attrib = 0; attrib < shader->info.base.num_outputs; ++attrib)
+   {
+      if (shader->info.base.output_semantic_name[attrib] == TGSI_SEMANTIC_COLOR &&
+          shader->info.base.output_semantic_index[attrib] < key->nr_cbufs)
+      {
+         unsigned cbuf = shader->info.base.output_semantic_index[attrib];
+         for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+            if(outputs[attrib][chan]) {
+               /* XXX: just initialize outputs to point at colors[] and
+                * skip this.
+                */
+               LLVMValueRef out = LLVMBuildLoad(builder, outputs[attrib][chan], "");
+               lp_build_name(out, "color%u.%u.%c", i, attrib, "rgba"[chan]);
+               LLVMBuildStore(builder, out, color[cbuf][chan]);
+            }
+         }
+      }
+   }
 
-   *pmask = mask.value;
+   if (counter)
+      lp_build_occlusion_count(builder, type,
+                               lp_build_mask_value(&mask), counter);
 
+   *pmask = lp_build_mask_end(&mask);
 }
 
 
@@ -407,10 +444,10 @@ generate_blend(const struct pipe_blend_state *blend,
                LLVMValueRef context_ptr,
                LLVMValueRef mask,
                LLVMValueRef *src,
-               LLVMValueRef dst_ptr)
+               LLVMValueRef dst_ptr,
+               boolean do_branch)
 {
    struct lp_build_context bld;
-   struct lp_build_flow_context *flow;
    struct lp_build_mask_context mask_ctx;
    LLVMTypeRef vec_type;
    LLVMValueRef const_ptr;
@@ -421,10 +458,9 @@ generate_blend(const struct pipe_blend_state *blend,
 
    lp_build_context_init(&bld, builder, type);
 
-   flow = lp_build_flow_create(builder);
-
-   /* we'll use this mask context to skip blending if all pixels are dead */
-   lp_build_mask_begin(&mask_ctx, flow, type, mask);
+   lp_build_mask_begin(&mask_ctx, builder, type, mask);
+   if (do_branch)
+      lp_build_mask_check(&mask_ctx);
 
    vec_type = lp_build_vec_type(type);
 
@@ -457,7 +493,6 @@ generate_blend(const struct pipe_blend_state *blend,
    }
 
    lp_build_mask_end(&mask_ctx);
-   lp_build_flow_destroy(flow);
 }
 
 
@@ -468,13 +503,13 @@ generate_blend(const struct pipe_blend_state *blend,
  * 2x2 pixels.
  */
 static void
-generate_fragment(struct llvmpipe_context *lp,
+generate_fragment(struct llvmpipe_screen *screen,
                   struct lp_fragment_shader *shader,
                   struct lp_fragment_shader_variant *variant,
                   unsigned partial_mask)
 {
-   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    const struct lp_fragment_shader_variant_key *key = &variant->key;
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
    char func_name[256];
    struct lp_type fs_type;
    struct lp_type blend_type;
@@ -502,11 +537,24 @@ generate_fragment(struct llvmpipe_context *lp,
    LLVMValueRef blend_mask;
    LLVMValueRef function;
    LLVMValueRef facing;
+   const struct util_format_description *zs_format_desc;
    unsigned num_fs;
    unsigned i;
    unsigned chan;
    unsigned cbuf;
 
+   /* Adjust color input interpolation according to flatshade state:
+    */
+   memcpy(inputs, shader->inputs, shader->info.base.num_inputs * sizeof inputs[0]);
+   for (i = 0; i < shader->info.base.num_inputs; i++) {
+      if (inputs[i].interp == LP_INTERP_COLOR) {
+	 if (key->flatshade)
+	    inputs[i].interp = LP_INTERP_CONSTANT;
+	 else
+	    inputs[i].interp = LP_INTERP_LINEAR;
+      }
+   }
+
 
    /* TODO: actually pick these based on the fs and color buffer
     * characteristics. */
@@ -542,12 +590,12 @@ generate_fragment(struct llvmpipe_context *lp,
    arg_types[0] = screen->context_ptr_type;            /* context */
    arg_types[1] = LLVMInt32Type();                     /* x */
    arg_types[2] = LLVMInt32Type();                     /* y */
-   arg_types[3] = LLVMFloatType();                     /* facing */
+   arg_types[3] = LLVMInt32Type();                     /* facing */
    arg_types[4] = LLVMPointerType(fs_elem_type, 0);    /* a0 */
    arg_types[5] = LLVMPointerType(fs_elem_type, 0);    /* dadx */
    arg_types[6] = LLVMPointerType(fs_elem_type, 0);    /* dady */
    arg_types[7] = LLVMPointerType(LLVMPointerType(blend_vec_type, 0), 0);  /* color */
-   arg_types[8] = LLVMPointerType(fs_int_vec_type, 0); /* depth */
+   arg_types[8] = LLVMPointerType(LLVMInt8Type(), 0);  /* depth */
    arg_types[9] = LLVMInt32Type();                     /* mask_input */
    arg_types[10] = LLVMPointerType(LLVMInt32Type(), 0);/* counter */
 
@@ -558,7 +606,6 @@ generate_fragment(struct llvmpipe_context *lp,
 
    variant->function[partial_mask] = function;
 
-
    /* XXX: need to propagate noalias down into color param now we are
     * passing a pointer-to-pointer?
     */
@@ -606,8 +653,8 @@ generate_fragment(struct llvmpipe_context *lp,
     * already included in the shader key.
     */
    lp_build_interp_soa_init(&interp, 
-                            lp->num_inputs,
-                            lp->inputs,
+                            shader->info.base.num_inputs,
+                            inputs,
                             builder, fs_type,
                             a0_ptr, dadx_ptr, dady_ptr,
                             x, y);
@@ -616,17 +663,18 @@ generate_fragment(struct llvmpipe_context *lp,
    sampler = lp_llvm_sampler_soa_create(key->sampler, context_ptr);
 
    /* loop over quads in the block */
+   zs_format_desc = util_format_description(key->zsbuf_format);
+
    for(i = 0; i < num_fs; ++i) {
-      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      LLVMValueRef depth_offset = LLVMConstInt(LLVMInt32Type(),
+                                               i*fs_type.length*zs_format_desc->block.bits/8,
+                                               0);
       LLVMValueRef out_color[PIPE_MAX_COLOR_BUFS][NUM_CHANNELS];
       LLVMValueRef depth_ptr_i;
 
-      if(i != 0)
-         lp_build_interp_soa_update(&interp, i);
+      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &depth_offset, 1, "");
 
-      depth_ptr_i = LLVMBuildGEP(builder, depth_ptr, &index, 1, "");
-
-      generate_fs(lp, shader, key,
+      generate_fs(shader, key,
                   builder,
                   fs_type,
                   context_ptr,
@@ -660,9 +708,18 @@ generate_fragment(struct llvmpipe_context *lp,
        * Convert the fs's output color and mask to fit to the blending type. 
        */
       for(chan = 0; chan < NUM_CHANNELS; ++chan) {
+         LLVMValueRef fs_color_vals[LP_MAX_VECTOR_LENGTH];
+         
+         for (i = 0; i < num_fs; i++) {
+            fs_color_vals[i] =
+               LLVMBuildLoad(builder, fs_out_color[cbuf][chan][i], "fs_color_vals");
+         }
+
 	 lp_build_conv(builder, fs_type, blend_type,
-		       fs_out_color[cbuf][chan], num_fs,
+                       fs_color_vals,
+                       num_fs,
 		       &blend_in_color[chan], 1);
+
 	 lp_build_name(blend_in_color[chan], "color%d.%c", cbuf, "rgba"[chan]);
       }
 
@@ -685,14 +742,23 @@ generate_fragment(struct llvmpipe_context *lp,
       /*
        * Blending.
        */
-      generate_blend(&key->blend,
-                     rt,
-		     builder,
-		     blend_type,
-		     context_ptr,
-		     blend_mask,
-		     blend_in_color,
-		     color_ptr);
+      {
+         /* Could the 4x4 have been killed?
+          */
+         boolean do_branch = ((key->depth.enabled || key->stencil[0].enabled) &&
+                              !key->alpha.enabled &&
+                              !shader->info.base.uses_kill);
+
+         generate_blend(&key->blend,
+                        rt,
+                        builder,
+                        blend_type,
+                        context_ptr,
+                        blend_mask,
+                        blend_in_color,
+                        color_ptr,
+                        do_branch);
+      }
    }
 
 #ifdef PIPE_ARCH_X86
@@ -717,12 +783,17 @@ generate_fragment(struct llvmpipe_context *lp,
    /* Apply optimizations to LLVM IR */
    LLVMRunFunctionPassManager(screen->pass, function);
 
-   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+   if ((gallivm_debug & GALLIVM_DEBUG_IR) || (LP_DEBUG & DEBUG_FS)) {
       /* Print the LLVM IR to stderr */
       lp_debug_dump_value(function);
       debug_printf("\n");
    }
 
+   /* Dump byte code to a file */
+   if (0) {
+      LLVMWriteBitcodeToFile(lp_build_module, "llvmpipe.bc");
+   }
+
    /*
     * Translate the LLVM IR into machine code.
     */
@@ -731,7 +802,7 @@ generate_fragment(struct llvmpipe_context *lp,
 
       variant->jit_function[partial_mask] = (lp_jit_frag_func)pointer_to_func(f);
 
-      if (gallivm_debug & GALLIVM_DEBUG_ASM) {
+      if ((gallivm_debug & GALLIVM_DEBUG_ASM) || (LP_DEBUG & DEBUG_FS)) {
          lp_disassemble(f);
       }
       lp_func_delete_body(function);
@@ -746,6 +817,9 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
 
    debug_printf("fs variant %p:\n", (void *) key);
 
+   if (key->flatshade) {
+      debug_printf("flatshade = 1\n");
+   }
    for (i = 0; i < key->nr_cbufs; ++i) {
       debug_printf("cbuf_format[%u] = %s\n", i, util_format_name(key->cbuf_format[i]));
    }
@@ -770,6 +844,10 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
       debug_printf("alpha.func = %s\n", util_dump_func(key->alpha.func, TRUE));
    }
 
+   if (key->occlusion_count) {
+      debug_printf("occlusion_count = 1\n");
+   }
+
    if (key->blend.logicop_enable) {
       debug_printf("blend.logicop_func = %s\n", util_dump_logicop(key->blend.logicop_func, TRUE));
    }
@@ -782,31 +860,33 @@ dump_fs_variant_key(const struct lp_fragment_shader_variant_key *key)
       debug_printf("blend.alpha_dst_factor = %s\n", util_dump_blend_factor(key->blend.rt[0].alpha_dst_factor, TRUE));
    }
    debug_printf("blend.colormask = 0x%x\n", key->blend.rt[0].colormask);
-   for (i = 0; i < PIPE_MAX_SAMPLERS; ++i) {
-      if (key->sampler[i].format) {
-         debug_printf("sampler[%u] = \n", i);
-         debug_printf("  .format = %s\n",
-                      util_format_name(key->sampler[i].format));
-         debug_printf("  .target = %s\n",
-                      util_dump_tex_target(key->sampler[i].target, TRUE));
-         debug_printf("  .pot = %u %u %u\n",
-                      key->sampler[i].pot_width,
-                      key->sampler[i].pot_height,
-                      key->sampler[i].pot_depth);
-         debug_printf("  .wrap = %s %s %s\n",
-                      util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
-                      util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
-                      util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
-         debug_printf("  .min_img_filter = %s\n",
-                      util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
-         debug_printf("  .min_mip_filter = %s\n",
-                      util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
-         debug_printf("  .mag_img_filter = %s\n",
-                      util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
-         if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
-            debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
-         debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
-      }
+   for (i = 0; i < key->nr_samplers; ++i) {
+      debug_printf("sampler[%u] = \n", i);
+      debug_printf("  .format = %s\n",
+                   util_format_name(key->sampler[i].format));
+      debug_printf("  .target = %s\n",
+                   util_dump_tex_target(key->sampler[i].target, TRUE));
+      debug_printf("  .pot = %u %u %u\n",
+                   key->sampler[i].pot_width,
+                   key->sampler[i].pot_height,
+                   key->sampler[i].pot_depth);
+      debug_printf("  .wrap = %s %s %s\n",
+                   util_dump_tex_wrap(key->sampler[i].wrap_s, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_t, TRUE),
+                   util_dump_tex_wrap(key->sampler[i].wrap_r, TRUE));
+      debug_printf("  .min_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].min_img_filter, TRUE));
+      debug_printf("  .min_mip_filter = %s\n",
+                   util_dump_tex_mipfilter(key->sampler[i].min_mip_filter, TRUE));
+      debug_printf("  .mag_img_filter = %s\n",
+                   util_dump_tex_filter(key->sampler[i].mag_img_filter, TRUE));
+      if (key->sampler[i].compare_mode != PIPE_TEX_COMPARE_NONE)
+         debug_printf("  .compare_func = %s\n", util_dump_func(key->sampler[i].compare_func, TRUE));
+      debug_printf("  .normalized_coords = %u\n", key->sampler[i].normalized_coords);
+      debug_printf("  .min_max_lod_equal = %u\n", key->sampler[i].min_max_lod_equal);
+      debug_printf("  .lod_bias_non_zero = %u\n", key->sampler[i].lod_bias_non_zero);
+      debug_printf("  .apply_min_lod = %u\n", key->sampler[i].apply_min_lod);
+      debug_printf("  .apply_max_lod = %u\n", key->sampler[i].apply_max_lod);
    }
 }
 
@@ -823,7 +903,7 @@ lp_debug_fs_variant(const struct lp_fragment_shader_variant *variant)
 }
 
 static struct lp_fragment_shader_variant *
-generate_variant(struct llvmpipe_context *lp,
+generate_variant(struct llvmpipe_screen *screen,
                  struct lp_fragment_shader *shader,
                  const struct lp_fragment_shader_variant_key *key)
 {
@@ -861,7 +941,7 @@ generate_variant(struct llvmpipe_context *lp,
          !key->stencil[0].enabled &&
          !key->alpha.enabled &&
          !key->depth.enabled &&
-         !shader->info.uses_kill
+         !shader->info.base.uses_kill
          ? TRUE : FALSE;
 
 
@@ -869,11 +949,11 @@ generate_variant(struct llvmpipe_context *lp,
       lp_debug_fs_variant(variant);
    }
 
-   generate_fragment(lp, shader, variant, RAST_EDGE_TEST);
+   generate_fragment(screen, shader, variant, RAST_EDGE_TEST);
 
    if (variant->opaque) {
       /* Specialized shader, which doesn't need to read the color buffer. */
-      generate_fragment(lp, shader, variant, RAST_WHOLE);
+      generate_fragment(screen, shader, variant, RAST_WHOLE);
    } else {
       variant->jit_function[RAST_WHOLE] = variant->jit_function[RAST_EDGE_TEST];
    }
@@ -889,6 +969,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    struct llvmpipe_context *llvmpipe = llvmpipe_context(pipe);
    struct lp_fragment_shader *shader;
    int nr_samplers;
+   int i;
 
    shader = CALLOC_STRUCT(lp_fragment_shader);
    if (!shader)
@@ -898,7 +979,7 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
    make_empty_list(&shader->variants);
 
    /* get/save the summary info for this shader */
-   tgsi_scan_shader(templ->tokens, &shader->info);
+   lp_build_tgsi_info(templ->tokens, &shader->info);
 
    /* we need to keep a local copy of the tokens */
    shader->base.tokens = tgsi_dup_tokens(templ->tokens);
@@ -910,18 +991,58 @@ llvmpipe_create_fs_state(struct pipe_context *pipe,
       return NULL;
    }
 
-   nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+   nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
    shader->variant_key_size = Offset(struct lp_fragment_shader_variant_key,
 				     sampler[nr_samplers]);
 
+   for (i = 0; i < shader->info.base.num_inputs; i++) {
+      shader->inputs[i].usage_mask = shader->info.base.input_usage_mask[i];
+
+      switch (shader->info.base.input_interpolate[i]) {
+      case TGSI_INTERPOLATE_CONSTANT:
+	 shader->inputs[i].interp = LP_INTERP_CONSTANT;
+	 break;
+      case TGSI_INTERPOLATE_LINEAR:
+	 shader->inputs[i].interp = LP_INTERP_LINEAR;
+	 break;
+      case TGSI_INTERPOLATE_PERSPECTIVE:
+	 shader->inputs[i].interp = LP_INTERP_PERSPECTIVE;
+	 break;
+      default:
+	 assert(0);
+	 break;
+      }
+
+      switch (shader->info.base.input_semantic_name[i]) {
+      case TGSI_SEMANTIC_COLOR:
+         /* Colors may be either linearly or constant interpolated in
+	  * the fragment shader, but that information isn't available
+	  * here.  Mark color inputs and fix them up later.
+          */
+	 shader->inputs[i].interp = LP_INTERP_COLOR;
+         break;
+      case TGSI_SEMANTIC_FACE:
+	 shader->inputs[i].interp = LP_INTERP_FACING;
+	 break;
+      case TGSI_SEMANTIC_POSITION:
+	 /* Position was already emitted above
+	  */
+	 shader->inputs[i].interp = LP_INTERP_POSITION;
+	 shader->inputs[i].src_index = 0;
+	 continue;
+      }
+
+      shader->inputs[i].src_index = i+1;
+   }
+
    if (LP_DEBUG & DEBUG_TGSI) {
       unsigned attrib;
       debug_printf("llvmpipe: Create fragment shader #%u %p:\n", shader->no, (void *) shader);
       tgsi_dump(templ->tokens, 0);
       debug_printf("usage masks:\n");
-      for (attrib = 0; attrib < shader->info.num_inputs; ++attrib) {
-         unsigned usage_mask = shader->info.input_usage_mask[attrib];
+      for (attrib = 0; attrib < shader->info.base.num_inputs; ++attrib) {
+         unsigned usage_mask = shader->info.base.input_usage_mask[attrib];
          debug_printf("  IN[%u].%s%s%s%s\n",
                       attrib,
                       usage_mask & TGSI_WRITEMASK_X ? "x" : "",
@@ -1150,10 +1271,10 @@ make_variant_key(struct llvmpipe_context *lp,
 
    /* This value will be the same for all the variants of a given shader:
     */
-   key->nr_samplers = shader->info.file_max[TGSI_FILE_SAMPLER] + 1;
+   key->nr_samplers = shader->info.base.file_max[TGSI_FILE_SAMPLER] + 1;
 
    for(i = 0; i < key->nr_samplers; ++i) {
-      if(shader->info.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
+      if(shader->info.base.file_mask[TGSI_FILE_SAMPLER] & (1 << i)) {
          lp_sampler_static_state(&key->sampler[i],
 				 lp->fragment_sampler_views[i],
 				 lp->sampler[i]);
@@ -1168,6 +1289,7 @@ make_variant_key(struct llvmpipe_context *lp,
 void 
 llvmpipe_update_fs(struct llvmpipe_context *lp)
 {
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
    struct lp_fragment_shader *shader = lp->fs;
    struct lp_fragment_shader_variant_key key;
    struct lp_fragment_shader_variant *variant = NULL;
@@ -1208,7 +1330,7 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
       }
       t0 = os_time_get();
 
-      variant = generate_variant(lp, shader, &key);
+      variant = generate_variant(screen, shader, &key);
 
       t1 = os_time_get();
       dt = t1 - t0;
@@ -1228,6 +1350,10 @@ llvmpipe_update_fs(struct llvmpipe_context *lp)
 
 
 
+
+
+
+
 void
 llvmpipe_init_fs_funcs(struct llvmpipe_context *llvmpipe)
 {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index 4999b8dca1a..7d58c4936c7 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -34,6 +34,8 @@
 #include "pipe/p_state.h"
 #include "tgsi/tgsi_scan.h" /* for tgsi_shader_info */
 #include "gallivm/lp_bld_sample.h" /* for struct lp_sampler_static_state */
+#include "gallivm/lp_bld_tgsi.h" /* for lp_tgsi_info */
+#include "lp_bld_interp.h" /* for struct lp_shader_input */
 
 
 struct tgsi_token;
@@ -96,7 +98,7 @@ struct lp_fragment_shader
 {
    struct pipe_shader_state base;
 
-   struct tgsi_shader_info info;
+   struct lp_tgsi_info info;
 
    struct lp_fs_variant_list_item variants;
 
@@ -107,6 +109,9 @@ struct lp_fragment_shader
    unsigned no;
    unsigned variants_created;
    unsigned variants_cached;
+
+   /** Fragment shader input interpolation info */
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
 };
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_state_sampler.c b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
index 17a4a0ed02d..1dd866195d3 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_sampler.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_sampler.c
@@ -246,9 +246,9 @@ llvmpipe_prepare_vertex_sampling(struct llvmpipe_context *lp,
                                  struct pipe_sampler_view **views)
 {
    unsigned i;
-   uint32_t row_stride[DRAW_MAX_TEXTURE_LEVELS];
-   uint32_t img_stride[DRAW_MAX_TEXTURE_LEVELS];
-   const void *data[DRAW_MAX_TEXTURE_LEVELS];
+   uint32_t row_stride[PIPE_MAX_TEXTURE_LEVELS];
+   uint32_t img_stride[PIPE_MAX_TEXTURE_LEVELS];
+   const void *data[PIPE_MAX_TEXTURE_LEVELS];
 
    assert(num <= PIPE_MAX_VERTEX_SAMPLERS);
    if (!num)
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.c b/src/gallium/drivers/llvmpipe/lp_state_setup.c
new file mode 100644
index 00000000000..2c8b8b9a928
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.c
@@ -0,0 +1,759 @@
+/**************************************************************************
+ *
+ * Copyright 2010 VMware.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sub license, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial portions
+ * of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
+ * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
+ * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
+ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ **************************************************************************/
+
+
+#include "util/u_math.h"
+#include "util/u_memory.h"
+#include "util/u_simple_list.h"
+#include "os/os_time.h"
+#include "gallivm/lp_bld_debug.h"
+#include "gallivm/lp_bld_init.h"
+#include "gallivm/lp_bld_intr.h"
+#include <llvm-c/Analysis.h>	/* for LLVMVerifyFunction */
+
+#include "lp_perf.h"
+#include "lp_debug.h"
+#include "lp_flush.h"
+#include "lp_screen.h"
+#include "lp_context.h"
+#include "lp_setup_context.h"
+#include "lp_rast.h"
+#include "lp_state.h"
+#include "lp_state_fs.h"
+#include "lp_state_setup.h"
+
+
+
+/* currently organized to interpolate full float[4] attributes even
+ * when some elements are unused.  Later, can pack vertex data more
+ * closely.
+ */
+
+
+struct lp_setup_args
+{
+   /* Function arguments:
+    */
+   LLVMValueRef v0;
+   LLVMValueRef v1;
+   LLVMValueRef v2;
+   LLVMValueRef facing;		/* boolean */
+   LLVMValueRef a0;
+   LLVMValueRef dadx;
+   LLVMValueRef dady;
+
+   /* Derived:
+    */
+   LLVMValueRef x0_center;
+   LLVMValueRef y0_center;
+   LLVMValueRef dy20_ooa;
+   LLVMValueRef dy01_ooa;
+   LLVMValueRef dx20_ooa;
+   LLVMValueRef dx01_ooa;
+};
+
+static LLVMTypeRef type4f(void)
+{
+   return LLVMVectorType(LLVMFloatType(), 4);
+}
+
+
+/* Equivalent of _mm_setr_ps(a,b,c,d)
+ */
+static LLVMValueRef vec4f(LLVMBuilderRef bld,
+			  LLVMValueRef a, LLVMValueRef b, LLVMValueRef c, LLVMValueRef d,
+			  const char *name)
+{
+   LLVMValueRef i0 = LLVMConstInt(LLVMInt32Type(), 0, 0);
+   LLVMValueRef i1 = LLVMConstInt(LLVMInt32Type(), 1, 0);
+   LLVMValueRef i2 = LLVMConstInt(LLVMInt32Type(), 2, 0);
+   LLVMValueRef i3 = LLVMConstInt(LLVMInt32Type(), 3, 0);
+
+   LLVMValueRef res = LLVMGetUndef(type4f());
+
+   res = LLVMBuildInsertElement(bld, res, a, i0, "");
+   res = LLVMBuildInsertElement(bld, res, b, i1, "");
+   res = LLVMBuildInsertElement(bld, res, c, i2, "");
+   res = LLVMBuildInsertElement(bld, res, d, i3, name);
+
+   return res;
+}
+
+/* Equivalent of _mm_set1_ps(a)
+ */
+static LLVMValueRef vec4f_from_scalar(LLVMBuilderRef bld,
+				      LLVMValueRef a,
+				      const char *name)
+{
+   LLVMValueRef res = LLVMGetUndef(type4f());
+   int i;
+
+   for(i = 0; i < 4; ++i) {
+      LLVMValueRef index = LLVMConstInt(LLVMInt32Type(), i, 0);
+      res = LLVMBuildInsertElement(bld, res, a, index, i == 3 ? name : "");
+   }
+
+   return res;
+}
+
+static void
+store_coef(LLVMBuilderRef builder,
+	   struct lp_setup_args *args,
+	   unsigned slot,
+	   LLVMValueRef a0,
+	   LLVMValueRef dadx,
+	   LLVMValueRef dady)
+{
+   LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), slot, 0);
+   
+   LLVMBuildStore(builder,
+		  a0, 
+		  LLVMBuildGEP(builder, args->a0, &idx, 1, ""));
+
+   LLVMBuildStore(builder,
+		  dadx, 
+		  LLVMBuildGEP(builder, args->dadx, &idx, 1, ""));
+
+   LLVMBuildStore(builder,
+		  dady, 
+		  LLVMBuildGEP(builder, args->dady, &idx, 1, ""));
+}
+
+
+
+static void 
+emit_constant_coef4( LLVMBuilderRef builder,
+		     struct lp_setup_args *args,
+		     unsigned slot,
+		     LLVMValueRef vert,
+		     unsigned attr)
+{
+   LLVMValueRef zero      = LLVMConstReal(LLVMFloatType(), 0.0);
+   LLVMValueRef zerovec   = vec4f_from_scalar(builder, zero, "zero");
+   LLVMValueRef idx       = LLVMConstInt(LLVMInt32Type(), attr, 0);
+   LLVMValueRef attr_ptr  = LLVMBuildGEP(builder, vert, &idx, 1, "attr_ptr");
+   LLVMValueRef vert_attr = LLVMBuildLoad(builder, attr_ptr, "vert_attr");
+
+   store_coef(builder, args, slot, vert_attr, zerovec, zerovec);
+}
+
+
+
+/**
+ * Setup the fragment input attribute with the front-facing value.
+ * \param frontface  is the triangle front facing?
+ */
+static void 
+emit_facing_coef( LLVMBuilderRef builder,
+		  struct lp_setup_args *args,
+		  unsigned slot )
+{
+   LLVMValueRef a0_0 = args->facing;
+   LLVMValueRef a0_0f = LLVMBuildSIToFP(builder, a0_0, LLVMFloatType(), "");
+   LLVMValueRef zero = LLVMConstReal(LLVMFloatType(), 0.0);
+   LLVMValueRef a0      = vec4f(builder, a0_0f, zero, zero, zero, "facing");
+   LLVMValueRef zerovec = vec4f_from_scalar(builder, zero, "zero");
+
+   store_coef(builder, args, slot, a0, zerovec, zerovec);
+}
+
+
+static LLVMValueRef
+vert_attrib(LLVMBuilderRef b,
+	    LLVMValueRef vert,
+	    int attr,
+	    int elem,
+	    const char *name)
+{
+   LLVMValueRef idx[2];
+   idx[0] = LLVMConstInt(LLVMInt32Type(), attr, 0);
+   idx[1] = LLVMConstInt(LLVMInt32Type(), elem, 0);
+   return LLVMBuildLoad(b, LLVMBuildGEP(b, vert, idx, 2, ""), name);
+}
+
+
+
+static void 
+emit_coef4( LLVMBuilderRef b,
+	    struct lp_setup_args *args,
+	    unsigned slot,
+	    LLVMValueRef a0,
+	    LLVMValueRef a1,
+	    LLVMValueRef a2)
+{
+   LLVMValueRef dy20_ooa = args->dy20_ooa;
+   LLVMValueRef dy01_ooa = args->dy01_ooa;
+   LLVMValueRef dx20_ooa = args->dx20_ooa;
+   LLVMValueRef dx01_ooa = args->dx01_ooa;
+   LLVMValueRef x0_center = args->x0_center;
+   LLVMValueRef y0_center = args->y0_center;
+
+   /* XXX: using fsub, fmul on vector types -- does this work??
+    */
+   LLVMValueRef da01 = LLVMBuildFSub(b, a0, a1, "da01");
+   LLVMValueRef da20 = LLVMBuildFSub(b, a2, a0, "da20");
+
+   /* Calculate dadx (vec4f)
+    */
+   LLVMValueRef da01_dy20_ooa = LLVMBuildFMul(b, da01, dy20_ooa, "da01_dy20_ooa");
+   LLVMValueRef da20_dy01_ooa = LLVMBuildFMul(b, da20, dy01_ooa, "da20_dy01_ooa");
+   LLVMValueRef dadx          = LLVMBuildFSub(b, da01_dy20_ooa, da20_dy01_ooa, "dadx");
+
+   /* Calculate dady (vec4f)
+    */
+   LLVMValueRef da01_dx20_ooa = LLVMBuildFMul(b, da01, dx20_ooa, "da01_dx20_ooa");
+   LLVMValueRef da20_dx01_ooa = LLVMBuildFMul(b, da20, dx01_ooa, "da20_dx01_ooa");
+   LLVMValueRef dady          = LLVMBuildFSub(b, da20_dx01_ooa, da01_dx20_ooa, "dady");
+
+   /* Calculate a0 - the attribute value at the origin
+    */
+   LLVMValueRef dadx_x0       = LLVMBuildFMul(b, dadx, x0_center, "dadx_x0"); 
+   LLVMValueRef dady_y0       = LLVMBuildFMul(b, dady, y0_center, "dady_y0"); 
+   LLVMValueRef attr_v0       = LLVMBuildFAdd(b, dadx_x0, dady_y0, "attr_v0"); 
+   LLVMValueRef attr_0        = LLVMBuildFSub(b, a0, attr_v0, "attr_0"); 
+
+   store_coef(b, args, slot, attr_0, dadx, dady);
+}
+
+
+static void 
+emit_linear_coef( LLVMBuilderRef b,
+		  struct lp_setup_args *args,
+		  unsigned slot,
+		  unsigned vert_attr)
+{
+   LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0);
+
+   LLVMValueRef a0 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
+   LLVMValueRef a1 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
+   LLVMValueRef a2 = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
+
+   emit_coef4(b, args, slot, a0, a1, a2);
+}
+
+
+
+/**
+ * Compute a0, dadx and dady for a perspective-corrected interpolant,
+ * for a triangle.
+ * We basically multiply the vertex value by 1/w before computing
+ * the plane coefficients (a0, dadx, dady).
+ * Later, when we compute the value at a particular fragment position we'll
+ * divide the interpolated value by the interpolated W at that fragment.
+ */
+static void 
+emit_perspective_coef( LLVMBuilderRef b,
+		       struct lp_setup_args *args,
+		       unsigned slot,
+		       unsigned vert_attr)
+{
+   /* premultiply by 1/w  (v[0][3] is always 1/w):
+    */
+   LLVMValueRef idx = LLVMConstInt(LLVMInt32Type(), vert_attr, 0);
+
+   LLVMValueRef v0a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v0, &idx, 1, ""), "v0a");
+   LLVMValueRef v1a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v1, &idx, 1, ""), "v1a");
+   LLVMValueRef v2a = LLVMBuildLoad(b, LLVMBuildGEP(b, args->v2, &idx, 1, ""), "v2a");
+
+   LLVMValueRef v0_oow = vec4f_from_scalar(b, vert_attrib(b, args->v0, 0, 3, ""), "v0_oow");
+   LLVMValueRef v1_oow = vec4f_from_scalar(b, vert_attrib(b, args->v1, 0, 3, ""), "v1_oow");
+   LLVMValueRef v2_oow = vec4f_from_scalar(b, vert_attrib(b, args->v2, 0, 3, ""), "v2_oow");
+
+   LLVMValueRef v0_oow_v0a = LLVMBuildFMul(b, v0a, v0_oow, "v0_oow_v0a");
+   LLVMValueRef v1_oow_v1a = LLVMBuildFMul(b, v1a, v1_oow, "v1_oow_v1a");
+   LLVMValueRef v2_oow_v2a = LLVMBuildFMul(b, v2a, v2_oow, "v2_oow_v2a");
+
+   emit_coef4(b, args, slot, v0_oow_v0a, v1_oow_v1a, v2_oow_v2a);
+}
+
+
+static void
+emit_position_coef( LLVMBuilderRef builder,
+		    struct lp_setup_args *args,
+		    int slot, int attrib )
+{
+   emit_linear_coef(builder, args, slot, attrib);
+}
+
+
+
+
+/**
+ * Compute the inputs-> dadx, dady, a0 values.
+ */
+static void 
+emit_tri_coef( LLVMBuilderRef builder,
+	       const struct lp_setup_variant_key *key,
+	       struct lp_setup_args *args )
+{
+   unsigned slot;
+
+   /* The internal position input is in slot zero:
+    */
+   emit_position_coef(builder, args, 0, 0);
+
+   /* setup interpolation for all the remaining attributes:
+    */
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned vert_attr = key->inputs[slot].src_index;
+
+      switch (key->inputs[slot].interp) {
+      case LP_INTERP_CONSTANT:
+	 if (key->flatshade_first) {
+	    emit_constant_coef4(builder, args, slot+1, args->v0, vert_attr);
+	 }
+	 else {
+	    emit_constant_coef4(builder, args, slot+1, args->v2, vert_attr);
+	 }
+	 break;
+
+      case LP_INTERP_LINEAR:
+	 emit_linear_coef(builder, args, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_PERSPECTIVE:
+	 emit_perspective_coef(builder, args, slot+1, vert_attr);
+         break;
+
+      case LP_INTERP_POSITION:
+         /*
+          * The generated pixel interpolators will pick up the coeffs from
+          * slot 0.
+          */
+         break;
+
+      case LP_INTERP_FACING:
+         emit_facing_coef(builder, args, slot+1);
+         break;
+
+      default:
+         assert(0);
+      }
+   }
+}
+
+
+/* XXX: This is generic code, share with fs/vs codegen:
+ */
+static lp_jit_setup_triangle
+finalize_function(struct llvmpipe_screen *screen,
+		  LLVMBuilderRef builder,
+		  LLVMValueRef function)
+{
+   void *f;
+
+   /* Verify the LLVM IR.  If invalid, dump and abort */
+#ifdef DEBUG
+   if (LLVMVerifyFunction(function, LLVMPrintMessageAction)) {
+      if (1)
+         lp_debug_dump_value(function);
+      abort();
+   }
+#endif
+
+   /* Apply optimizations to LLVM IR */
+   LLVMRunFunctionPassManager(screen->pass, function);
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR)
+   {
+      /* Print the LLVM IR to stderr */
+      lp_debug_dump_value(function);
+      debug_printf("\n");
+   }
+
+   /*
+    * Translate the LLVM IR into machine code.
+    */
+   f = LLVMGetPointerToGlobal(screen->engine, function);
+
+   if (gallivm_debug & GALLIVM_DEBUG_ASM)
+   {
+      lp_disassemble(f);
+   }
+
+   lp_func_delete_body(function);
+
+   return f;
+}
+
+/* XXX: Generic code:
+ */
+static void
+lp_emit_emms(LLVMBuilderRef builder)
+{
+#ifdef PIPE_ARCH_X86
+   /* Avoid corrupting the FPU stack on 32bit OSes. */
+   lp_build_intrinsic(builder, "llvm.x86.mmx.emms", LLVMVoidType(), NULL, 0);
+#endif
+}
+
+
+/* XXX: generic code:
+ */
+static void
+set_noalias(LLVMBuilderRef builder,
+	    LLVMValueRef function,
+	    const LLVMTypeRef *arg_types,
+	    int nr_args)
+{
+   int i;
+   for(i = 0; i < Elements(arg_types); ++i)
+      if(LLVMGetTypeKind(arg_types[i]) == LLVMPointerTypeKind)
+         LLVMAddAttribute(LLVMGetParam(function, i),
+			  LLVMNoAliasAttribute);
+}
+
+static void
+init_args(LLVMBuilderRef b, 
+	  struct lp_setup_args *args,
+	  const struct lp_setup_variant *variant)
+{
+   LLVMValueRef v0_x = vert_attrib(b, args->v0, 0, 0, "v0_x");
+   LLVMValueRef v0_y = vert_attrib(b, args->v0, 0, 1, "v0_y");
+
+   LLVMValueRef v1_x = vert_attrib(b, args->v1, 0, 0, "v1_x");
+   LLVMValueRef v1_y = vert_attrib(b, args->v1, 0, 1, "v1_y");
+
+   LLVMValueRef v2_x = vert_attrib(b, args->v2, 0, 0, "v2_x");
+   LLVMValueRef v2_y = vert_attrib(b, args->v2, 0, 1, "v2_y");
+
+   LLVMValueRef pixel_center = LLVMConstReal(LLVMFloatType(),
+					     variant->key.pixel_center_half ? 0.5 : 0);
+
+   LLVMValueRef x0_center = LLVMBuildFSub(b, v0_x, pixel_center, "x0_center" );
+   LLVMValueRef y0_center = LLVMBuildFSub(b, v0_y, pixel_center, "y0_center" );
+   
+   LLVMValueRef dx01 = LLVMBuildFSub(b, v0_x, v1_x, "dx01");
+   LLVMValueRef dy01 = LLVMBuildFSub(b, v0_y, v1_y, "dy01");
+   LLVMValueRef dx20 = LLVMBuildFSub(b, v2_x, v0_x, "dx20");
+   LLVMValueRef dy20 = LLVMBuildFSub(b, v2_y, v0_y, "dy20");
+
+   LLVMValueRef one  = LLVMConstReal(LLVMFloatType(), 1.0);
+   LLVMValueRef e    = LLVMBuildFMul(b, dx01, dy20, "e");
+   LLVMValueRef f    = LLVMBuildFMul(b, dx20, dy01, "f");
+   LLVMValueRef ooa  = LLVMBuildFDiv(b, one, LLVMBuildFSub(b, e, f, ""), "ooa");
+
+   LLVMValueRef dy20_ooa = LLVMBuildFMul(b, dy20, ooa, "dy20_ooa");
+   LLVMValueRef dy01_ooa = LLVMBuildFMul(b, dy01, ooa, "dy01_ooa");
+   LLVMValueRef dx20_ooa = LLVMBuildFMul(b, dx20, ooa, "dx20_ooa");
+   LLVMValueRef dx01_ooa = LLVMBuildFMul(b, dx01, ooa, "dx01_ooa");
+
+   args->dy20_ooa  = vec4f_from_scalar(b, dy20_ooa, "dy20_ooa_4f");
+   args->dy01_ooa  = vec4f_from_scalar(b, dy01_ooa, "dy01_ooa_4f");
+
+   args->dx20_ooa  = vec4f_from_scalar(b, dx20_ooa, "dx20_ooa_4f");
+   args->dx01_ooa  = vec4f_from_scalar(b, dx01_ooa, "dx01_ooa_4f");
+
+   args->x0_center = vec4f_from_scalar(b, x0_center, "x0_center_4f");
+   args->y0_center = vec4f_from_scalar(b, y0_center, "y0_center_4f");
+}
+
+/**
+ * Generate the runtime callable function for the coefficient calculation.
+ *
+ */
+static struct lp_setup_variant *
+generate_setup_variant(struct llvmpipe_screen *screen,
+		       struct lp_setup_variant_key *key)
+{
+   struct lp_setup_variant *variant = NULL;
+   struct lp_setup_args args;
+   char func_name[256];
+   LLVMTypeRef vec4f_type;
+   LLVMTypeRef func_type;
+   LLVMTypeRef arg_types[7];
+   LLVMBasicBlockRef block;
+   LLVMBuilderRef builder;
+   int64_t t0, t1;
+
+   if (0)
+      goto fail;
+
+   variant = CALLOC_STRUCT(lp_setup_variant);
+   if (variant == NULL)
+      goto fail;
+
+   if (LP_DEBUG & DEBUG_COUNTERS) {
+      t0 = os_time_get();
+   }
+
+   memcpy(&variant->key, key, key->size);
+   variant->list_item_global.base = variant;
+
+   util_snprintf(func_name, sizeof(func_name), "fs%u_setup%u",
+		 0,
+		 variant->no);
+
+   /* Currently always deal with full 4-wide vertex attributes from
+    * the vertices.
+    */
+
+   vec4f_type = LLVMVectorType(LLVMFloatType(), 4);
+
+   arg_types[0] = LLVMPointerType(vec4f_type, 0);        /* v0 */
+   arg_types[1] = LLVMPointerType(vec4f_type, 0);        /* v1 */
+   arg_types[2] = LLVMPointerType(vec4f_type, 0);        /* v2 */
+   arg_types[3] = LLVMInt32Type();			/* facing */
+   arg_types[4] = LLVMPointerType(vec4f_type, 0);	/* a0, aligned */
+   arg_types[5] = LLVMPointerType(vec4f_type, 0);	/* dadx, aligned */
+   arg_types[6] = LLVMPointerType(vec4f_type, 0);	/* dady, aligned */
+
+   func_type = LLVMFunctionType(LLVMVoidType(), arg_types, Elements(arg_types), 0);
+
+   variant->function = LLVMAddFunction(screen->module, func_name, func_type);
+   if (!variant->function)
+      goto fail;
+
+   LLVMSetFunctionCallConv(variant->function, LLVMCCallConv);
+
+   args.v0       = LLVMGetParam(variant->function, 0);
+   args.v1       = LLVMGetParam(variant->function, 1);
+   args.v2       = LLVMGetParam(variant->function, 2);
+   args.facing   = LLVMGetParam(variant->function, 3);
+   args.a0       = LLVMGetParam(variant->function, 4);
+   args.dadx     = LLVMGetParam(variant->function, 5);
+   args.dady     = LLVMGetParam(variant->function, 6);
+
+   lp_build_name(args.v0, "in_v0");
+   lp_build_name(args.v1, "in_v1");
+   lp_build_name(args.v2, "in_v2");
+   lp_build_name(args.facing, "in_facing");
+   lp_build_name(args.a0, "out_a0");
+   lp_build_name(args.dadx, "out_dadx");
+   lp_build_name(args.dady, "out_dady");
+
+   /*
+    * Function body
+    */
+   block = LLVMAppendBasicBlock(variant->function, "entry");
+   builder = LLVMCreateBuilder();
+   LLVMPositionBuilderAtEnd(builder, block);
+
+   set_noalias(builder, variant->function, arg_types, Elements(arg_types));
+   init_args(builder, &args, variant);
+   emit_tri_coef(builder, &variant->key, &args);
+
+   lp_emit_emms(builder);
+   LLVMBuildRetVoid(builder);
+   LLVMDisposeBuilder(builder);
+
+   variant->jit_function = finalize_function(screen, builder,
+					     variant->function);
+   if (!variant->jit_function)
+      goto fail;
+
+   /*
+    * Update timing information:
+    */
+   if (LP_DEBUG & DEBUG_COUNTERS) {
+      t1 = os_time_get();
+      LP_COUNT_ADD(llvm_compile_time, t1 - t0);
+      LP_COUNT_ADD(nr_llvm_compiles, 1);
+   }
+   
+   return variant;
+
+fail:
+   if (variant) {
+      if (variant->function) {
+	 if (variant->jit_function)
+	    LLVMFreeMachineCodeForFunction(screen->engine,
+					   variant->function);
+	 LLVMDeleteFunction(variant->function);
+      }
+      FREE(variant);
+   }
+   
+   return NULL;
+}
+
+
+
+static void
+lp_make_setup_variant_key(struct llvmpipe_context *lp,
+			  struct lp_setup_variant_key *key)
+{
+   struct lp_fragment_shader *fs = lp->fs;
+   unsigned i;
+
+   assert(sizeof key->inputs[0] == sizeof(ushort));
+   
+   key->num_inputs = fs->info.base.num_inputs;
+   key->flatshade_first = lp->rasterizer->flatshade_first;
+   key->pixel_center_half = lp->rasterizer->gl_rasterization_rules;
+   key->size = Offset(struct lp_setup_variant_key,
+		      inputs[key->num_inputs]);
+   key->pad = 0;
+
+   memcpy(key->inputs, fs->inputs, key->num_inputs * sizeof key->inputs[0]);
+   for (i = 0; i < key->num_inputs; i++) {
+      if (key->inputs[i].interp == LP_INTERP_COLOR) {
+	 if (lp->rasterizer->flatshade)
+	    key->inputs[i].interp = LP_INTERP_CONSTANT;
+	 else
+	    key->inputs[i].interp = LP_INTERP_LINEAR;
+      }
+   }
+
+}
+
+
+static void
+remove_setup_variant(struct llvmpipe_context *lp,
+		     struct lp_setup_variant *variant)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+
+   if (gallivm_debug & GALLIVM_DEBUG_IR) {
+      debug_printf("llvmpipe: del setup_variant #%u total %u\n",
+		   variant->no, lp->nr_setup_variants);
+   }
+
+   if (variant->function) {
+      if (variant->jit_function)
+	 LLVMFreeMachineCodeForFunction(screen->engine,
+					variant->function);
+      LLVMDeleteFunction(variant->function);
+   }
+
+   remove_from_list(&variant->list_item_global);
+   lp->nr_setup_variants--;
+   FREE(variant);
+}
+
+
+
+/* When the number of setup variants exceeds a threshold, cull a
+ * fraction (currently a quarter) of them.
+ */
+static void
+cull_setup_variants(struct llvmpipe_context *lp)
+{
+   struct pipe_context *pipe = &lp->pipe;
+   int i;
+
+   /*
+    * XXX: we need to flush the context until we have some sort of reference
+    * counting in fragment shaders as they may still be binned
+    * Flushing alone might not be sufficient we need to wait on it too.
+    */
+   llvmpipe_finish(pipe, __FUNCTION__);
+
+   for (i = 0; i < LP_MAX_SETUP_VARIANTS / 4; i++) {
+      struct lp_setup_variant_list_item *item = last_elem(&lp->setup_variants_list);
+      remove_setup_variant(lp, item->base);
+   }
+}
+
+
+/**
+ * Update fragment/vertex shader linkage state.  This is called just
+ * prior to drawing something when some fragment-related state has
+ * changed.
+ */
+void 
+llvmpipe_update_setup(struct llvmpipe_context *lp)
+{
+   struct llvmpipe_screen *screen = llvmpipe_screen(lp->pipe.screen);
+
+   struct lp_setup_variant_key *key = &lp->setup_variant.key;
+   struct lp_setup_variant *variant = NULL;
+   struct lp_setup_variant_list_item *li;
+
+   lp_make_setup_variant_key(lp, key);
+
+   foreach(li, &lp->setup_variants_list) {
+      if(li->base->key.size == key->size &&
+	 memcmp(&li->base->key, key, key->size) == 0) {
+         variant = li->base;
+         break;
+      }
+   }
+
+   if (variant) {
+      move_to_head(&lp->setup_variants_list, &variant->list_item_global);
+   }
+   else {
+      if (lp->nr_setup_variants >= LP_MAX_SETUP_VARIANTS) {
+	 cull_setup_variants(lp);
+      }
+
+      variant = generate_setup_variant(screen, key);
+      insert_at_head(&lp->setup_variants_list, &variant->list_item_global);
+      lp->nr_setup_variants++;
+   }
+
+   lp_setup_set_setup_variant(lp->setup,
+			      variant);
+}
+
+void
+lp_delete_setup_variants(struct llvmpipe_context *lp)
+{
+   struct lp_setup_variant_list_item *li;
+   li = first_elem(&lp->setup_variants_list);
+   while(!at_end(&lp->setup_variants_list, li)) {
+      struct lp_setup_variant_list_item *next = next_elem(li);
+      remove_setup_variant(lp, li->base);
+      li = next;
+   }
+}
+
+void
+lp_dump_setup_coef( const struct lp_setup_variant_key *key,
+		    const float (*sa0)[4],
+		    const float (*sdadx)[4],
+		    const float (*sdady)[4])
+{
+   int i, slot;
+
+   for (i = 0; i < NUM_CHANNELS; i++) {
+      float a0   = sa0  [0][i];
+      float dadx = sdadx[0][i];
+      float dady = sdady[0][i];
+
+      debug_printf("POS.%c: a0 = %f, dadx = %f, dady = %f\n",
+		   "xyzw"[i],
+		   a0, dadx, dady);
+   }
+
+   for (slot = 0; slot < key->num_inputs; slot++) {
+      unsigned usage_mask = key->inputs[slot].usage_mask;
+      for (i = 0; i < NUM_CHANNELS; i++) {
+	 if (usage_mask & (1 << i)) {
+	    float a0   = sa0  [1 + slot][i];
+	    float dadx = sdadx[1 + slot][i];
+	    float dady = sdady[1 + slot][i];
+
+	    debug_printf("IN[%u].%c: a0 = %f, dadx = %f, dady = %f\n",
+			 slot,
+			 "xyzw"[i],
+			 a0, dadx, dady);
+	 }
+      }
+   }
+}
diff --git a/src/gallium/drivers/llvmpipe/lp_state_setup.h b/src/gallium/drivers/llvmpipe/lp_state_setup.h
new file mode 100644
index 00000000000..b0c81baa75f
--- /dev/null
+++ b/src/gallium/drivers/llvmpipe/lp_state_setup.h
@@ -0,0 +1,80 @@
+#ifndef LP_STATE_SETUP_H
+#define LP_STATE_SETUP_H
+
+#include "lp_bld_interp.h"
+
+
+struct llvmpipe_context;
+struct lp_setup_variant;
+
+struct lp_setup_variant_list_item
+{
+   struct lp_setup_variant *base;
+   struct lp_setup_variant_list_item *next, *prev;
+};
+
+
+struct lp_setup_variant_key {   
+   unsigned num_inputs:8;
+   unsigned flatshade_first:1;
+   unsigned pixel_center_half:1;
+   unsigned pad:7;
+   unsigned size:16;
+   struct lp_shader_input inputs[PIPE_MAX_SHADER_INPUTS];
+};
+
+
+typedef void (*lp_jit_setup_triangle)( const float (*v0)[4],
+				       const float (*v1)[4],
+				       const float (*v2)[4],
+				       boolean front_facing,
+				       float (*a0)[4],
+				       float (*dadx)[4],
+				       float (*dady)[4] );
+
+
+
+
+/* At this stage, for a given variant key, we create a
+ * draw_vertex_info struct telling the draw module how to format the
+ * vertices, and an llvm-generated function which calculates the
+ * attribute interpolants (a0, dadx, dady) from three of those
+ * vertices.
+ */
+struct lp_setup_variant {
+   struct lp_setup_variant_key key;
+   
+   struct lp_setup_variant_list_item list_item_global;
+
+   /* XXX: this is a pointer to the LLVM IR.  Once jit_function is
+    * generated, we never need to use the IR again - need to find a
+    * way to release this data without destroying the generated
+    * assembly.
+    */
+   LLVMValueRef function;
+
+   /* The actual generated setup function:
+    */
+   lp_jit_setup_triangle jit_function;
+
+   unsigned no;
+};
+
+void lp_setup_tri_fallback( const float (*v0)[4],
+			    const float (*v1)[4],
+			    const float (*v2)[4],
+			    boolean front_facing,
+			    float (*a0)[4],
+			    float (*dadx)[4],
+			    float (*dady)[4],
+			    const struct lp_setup_variant_key *key );
+
+void lp_delete_setup_variants(struct llvmpipe_context *lp);
+
+void
+lp_dump_setup_coef( const struct lp_setup_variant_key *key,
+		    const float (*sa0)[4],
+		    const float (*sdadx)[4],
+		    const float (*sdady)[4]);
+
+#endif
diff --git a/src/gallium/drivers/llvmpipe/lp_test_round.c b/src/gallium/drivers/llvmpipe/lp_test_round.c
index 57b0ee57767..816518e5081 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_round.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_round.c
@@ -75,10 +75,7 @@ add_test(LLVMModuleRef module, const char *name, lp_func_t lp_func)
    LLVMValueRef ret;
    struct lp_build_context bld;
 
-   bld.builder = builder;
-   bld.type.floating = 1;
-   bld.type.width = 32;
-   bld.type.length = 4;
+   lp_build_context_init(&bld, builder, lp_float32_vec4_type());
 
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
 
@@ -100,9 +97,10 @@ printv(char* string, v4sf value)
            f[0], f[1], f[2], f[3]);
 }
 
-static void
+static boolean
 compare(v4sf x, v4sf y)
 {
+   boolean success = TRUE;
    float *xp = (float *) &x;
    float *yp = (float *) &y;
    if (xp[0] != yp[0] ||
@@ -110,7 +108,9 @@ compare(v4sf x, v4sf y)
        xp[2] != yp[2] ||
        xp[3] != yp[3]) {
       printf(" Incorrect result! ^^^^^^^^^^^^^^^^^^^^^^^^^^^^ \n");
+      success = FALSE;
    }
+   return success;
 }
 
 
@@ -171,9 +171,12 @@ test_round(unsigned verbose, FILE *fp)
       LLVMDumpModule(module);
 
    for (i = 0; i < 3; i++) {
+      /* NOTE: There are several acceptable rules for x.5 rounding: ceiling,
+       * nearest even, etc. So we avoid testing such corner cases here.
+       */
       v4sf xvals[3] = {
          {-10.0, -1, 0, 12.0},
-         {-1.5, -0.25, 1.25, 2.5},
+         {-1.49, -0.25, 1.25, 2.51},
          {-0.99, -0.01, 0.01, 0.99}
       };
       v4sf x = xvals[i];
@@ -191,7 +194,7 @@ test_round(unsigned verbose, FILE *fp)
       y = round_func(x);
       printv("C round(x)   ", ref);
       printv("LLVM round(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = trunc(xp[0]);
       refp[1] = trunc(xp[1]);
@@ -200,7 +203,7 @@ test_round(unsigned verbose, FILE *fp)
       y = trunc_func(x);
       printv("C trunc(x)   ", ref);
       printv("LLVM trunc(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = floor(xp[0]);
       refp[1] = floor(xp[1]);
@@ -209,7 +212,7 @@ test_round(unsigned verbose, FILE *fp)
       y = floor_func(x);
       printv("C floor(x)   ", ref);
       printv("LLVM floor(x)", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
 
       refp[0] = ceil(xp[0]);
       refp[1] = ceil(xp[1]);
@@ -218,7 +221,7 @@ test_round(unsigned verbose, FILE *fp)
       y = ceil_func(x);
       printv("C ceil(x)    ", ref);
       printv("LLVM ceil(x) ", y);
-      compare(ref, y);
+      success = success && compare(ref, y);
    }
 
    LLVMFreeMachineCodeForFunction(engine, test_round);
@@ -247,11 +250,7 @@ test_round(unsigned verbose, FILE *fp)
 boolean
 test_all(unsigned verbose, FILE *fp)
 {
-   boolean success = TRUE;
-
-   test_round(verbose, fp);
-
-   return success;
+   return test_round(verbose, fp);
 }
 
 
diff --git a/src/gallium/drivers/llvmpipe/lp_test_sincos.c b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
index 7ab357f162e..79939b1a393 100644
--- a/src/gallium/drivers/llvmpipe/lp_test_sincos.c
+++ b/src/gallium/drivers/llvmpipe/lp_test_sincos.c
@@ -72,10 +72,7 @@ add_sincos_test(LLVMModuleRef module, boolean sin)
    LLVMValueRef ret;
    struct lp_build_context bld;
 
-   bld.builder = builder;
-   bld.type.floating = 1;
-   bld.type.width = 32;
-   bld.type.length = 4;
+   lp_build_context_init(&bld, builder, lp_float32_vec4_type());
 
    LLVMSetFunctionCallConv(func, LLVMCCallConv);
 
diff --git a/src/gallium/drivers/llvmpipe/lp_tile_soa.py b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
index 2ba39052aba..e49f9c62fe7 100644
--- a/src/gallium/drivers/llvmpipe/lp_tile_soa.py
+++ b/src/gallium/drivers/llvmpipe/lp_tile_soa.py
@@ -289,172 +289,141 @@ def generate_format_write(format, src_channel, src_native_type, src_suffix):
     print
     
 
-def generate_ssse3():
+def generate_sse2():
     print '''
 #if defined(PIPE_ARCH_SSE)
 
 #include "util/u_sse.h"
 
-static void
-lp_tile_b8g8r8a8_unorm_swizzle_4ub_ssse3(uint8_t *dst,
-                                         const uint8_t *src, unsigned src_stride,
-                                         unsigned x0, unsigned y0)
+static ALWAYS_INLINE void 
+swz4( const __m128i * restrict x, 
+      const __m128i * restrict y, 
+      const __m128i * restrict z, 
+      const __m128i * restrict w, 
+      __m128i * restrict a, 
+      __m128i * restrict b, 
+      __m128i * restrict c, 
+      __m128i * restrict d)
 {
+   __m128i i, j, k, l;
+   __m128i m, n, o, p;
+   __m128i e, f, g, h;
+
+   m = _mm_unpacklo_epi8(*x,*y);
+   n = _mm_unpackhi_epi8(*x,*y);
+   o = _mm_unpacklo_epi8(*z,*w);
+   p = _mm_unpackhi_epi8(*z,*w);
+
+   i = _mm_unpacklo_epi16(m,n);
+   j = _mm_unpackhi_epi16(m,n);
+   k = _mm_unpacklo_epi16(o,p);
+   l = _mm_unpackhi_epi16(o,p);
+
+   e = _mm_unpacklo_epi8(i,j);
+   f = _mm_unpackhi_epi8(i,j);
+   g = _mm_unpacklo_epi8(k,l);
+   h = _mm_unpackhi_epi8(k,l);
+
+   *a = _mm_unpacklo_epi64(e,g);
+   *b = _mm_unpackhi_epi64(e,g);
+   *c = _mm_unpacklo_epi64(f,h);
+   *d = _mm_unpackhi_epi64(f,h);
+}
+
+static ALWAYS_INLINE void
+unswz4( const __m128i * restrict a, 
+        const __m128i * restrict b, 
+        const __m128i * restrict c, 
+        const __m128i * restrict d, 
+        __m128i * restrict x, 
+        __m128i * restrict y, 
+        __m128i * restrict z, 
+        __m128i * restrict w)
+{
+   __m128i i, j, k, l;
+   __m128i m, n, o, p;
+
+   i = _mm_unpacklo_epi8(*a,*b);
+   j = _mm_unpackhi_epi8(*a,*b);
+   k = _mm_unpacklo_epi8(*c,*d);
+   l = _mm_unpackhi_epi8(*c,*d);
+
+   m = _mm_unpacklo_epi16(i,k);
+   n = _mm_unpackhi_epi16(i,k);
+   o = _mm_unpacklo_epi16(j,l);
+   p = _mm_unpackhi_epi16(j,l);
+
+   *x = _mm_unpacklo_epi64(m,n);
+   *y = _mm_unpackhi_epi64(m,n);
+   *z = _mm_unpacklo_epi64(o,p);
+   *w = _mm_unpackhi_epi64(o,p);
+}
 
+static void
+lp_tile_b8g8r8a8_unorm_swizzle_4ub_sse2(uint8_t * restrict dst,
+                                        const uint8_t * restrict src, unsigned src_stride,
+                                        unsigned x0, unsigned y0)
+{
+   __m128i *dst128 = (__m128i *) dst;
    unsigned x, y;
-   __m128i *pdst = (__m128i*) dst;
-   const uint8_t *ysrc0 = src + y0*src_stride + x0*sizeof(uint32_t);
-   unsigned int tile_stridex = src_stride*(TILE_VECTOR_HEIGHT - 1) - sizeof(uint32_t)*TILE_VECTOR_WIDTH;
-   unsigned int tile_stridey = src_stride*TILE_VECTOR_HEIGHT;
-
-   const __m128i shuffle00 = _mm_setr_epi8(0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle01 = _mm_setr_epi8(0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle02 = _mm_setr_epi8(0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle03 = _mm_setr_epi8(0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
-   const __m128i shuffle10 = _mm_setr_epi8(0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle11 = _mm_setr_epi8(0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle12 = _mm_setr_epi8(0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-   const __m128i shuffle13 = _mm_setr_epi8(0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff);
-
-   const __m128i shuffle20 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e,0xff,0xff);
-   const __m128i shuffle21 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d,0xff,0xff);
-   const __m128i shuffle22 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c,0xff,0xff);
-   const __m128i shuffle23 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f,0xff,0xff);
-
-   const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x02,0x06,0xff,0xff,0x0a,0x0e);
-   const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x01,0x05,0xff,0xff,0x09,0x0d);
-   const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x04,0xff,0xff,0x08,0x0c);
-   const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x03,0x07,0xff,0xff,0x0b,0x0f);
-
-   for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
-      __m128i line0 = *(__m128i*)ysrc0;
-      const uint8_t *ysrc = ysrc0 + src_stride;
-      ysrc0 += tile_stridey;
-
-      for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
-         __m128i r, g, b, a, line1;
-         line1 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc += src_stride;
-         r = _mm_shuffle_epi8(line0, shuffle00);
-         g = _mm_shuffle_epi8(line0, shuffle01);
-         b = _mm_shuffle_epi8(line0, shuffle02);
-         a = _mm_shuffle_epi8(line0, shuffle03);
-
-         line0 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc += src_stride;
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle10));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle11));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle12));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle13));
-
-         line1 = *(__m128i*)ysrc;
-         PIPE_READ_WRITE_BARRIER();
-         ysrc -= tile_stridex;
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line0, shuffle20));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line0, shuffle21));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line0, shuffle22));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line0, shuffle23));
-
-         if (x + 1 < TILE_SIZE) {
-            line0 = *(__m128i*)ysrc;
-            ysrc += src_stride;
-         }
-
-         PIPE_READ_WRITE_BARRIER();
-         r = _mm_or_si128(r, _mm_shuffle_epi8(line1, shuffle30));
-         g = _mm_or_si128(g, _mm_shuffle_epi8(line1, shuffle31));
-         b = _mm_or_si128(b, _mm_shuffle_epi8(line1, shuffle32));
-         a = _mm_or_si128(a, _mm_shuffle_epi8(line1, shuffle33));
-
-         *pdst++ = r;
-         *pdst++ = g;
-         *pdst++ = b;
-         *pdst++ = a;
+   
+   src += y0 * src_stride;
+   src += x0 * sizeof(uint32_t);
+
+   for (y = 0; y < TILE_SIZE; y += 4) {
+      const uint8_t *src_row = src;
+
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         swz4((const __m128i *) (src_row + 0 * src_stride),
+              (const __m128i *) (src_row + 1 * src_stride),
+              (const __m128i *) (src_row + 2 * src_stride),
+              (const __m128i *) (src_row + 3 * src_stride),
+              dst128 + 2,     /* b */
+              dst128 + 1,     /* g */
+              dst128 + 0,     /* r */
+              dst128 + 3);    /* a */
+
+         dst128 += 4;
+         src_row += sizeof(__m128i);
       }
-   }
 
+      src += 4 * src_stride;
+   }
 }
 
 static void
-lp_tile_b8g8r8a8_unorm_unswizzle_4ub_ssse3(const uint8_t *src,
-                                          uint8_t *dst, unsigned dst_stride,
+lp_tile_b8g8r8a8_unorm_unswizzle_4ub_sse2(const uint8_t * restrict src,
+                                          uint8_t * restrict dst, unsigned dst_stride,
                                           unsigned x0, unsigned y0)
 {
    unsigned int x, y;
-   const __m128i *psrc = (__m128i*) src;
-   const __m128i *end = (__m128i*) (src + (y0 + TILE_SIZE - 1)*dst_stride + (x0 + TILE_SIZE - 1)*sizeof(uint32_t));
-   uint8_t *pdst = dst + y0 * dst_stride + x0 * sizeof(uint32_t);
-   __m128i c0 = *psrc++;
-   __m128i c1;
-
-   const __m128i shuffle00 = _mm_setr_epi8(0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff);
-   const __m128i shuffle01 = _mm_setr_epi8(0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff);
-   const __m128i shuffle02 = _mm_setr_epi8(0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff);
-   const __m128i shuffle03 = _mm_setr_epi8(0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff);
-
-   const __m128i shuffle10 = _mm_setr_epi8(0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff);
-   const __m128i shuffle11 = _mm_setr_epi8(0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff);
-   const __m128i shuffle12 = _mm_setr_epi8(0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff);
-   const __m128i shuffle13 = _mm_setr_epi8(0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff);
-
-   const __m128i shuffle20 = _mm_setr_epi8(0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05,0xff,0xff,0xff);
-   const __m128i shuffle21 = _mm_setr_epi8(0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07,0xff,0xff,0xff);
-   const __m128i shuffle22 = _mm_setr_epi8(0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d,0xff,0xff,0xff);
-   const __m128i shuffle23 = _mm_setr_epi8(0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f,0xff,0xff,0xff);
-
-   const __m128i shuffle30 = _mm_setr_epi8(0xff,0xff,0xff,0x00,0xff,0xff,0xff,0x01,0xff,0xff,0xff,0x04,0xff,0xff,0xff,0x05);
-   const __m128i shuffle31 = _mm_setr_epi8(0xff,0xff,0xff,0x02,0xff,0xff,0xff,0x03,0xff,0xff,0xff,0x06,0xff,0xff,0xff,0x07);
-   const __m128i shuffle32 = _mm_setr_epi8(0xff,0xff,0xff,0x08,0xff,0xff,0xff,0x09,0xff,0xff,0xff,0x0c,0xff,0xff,0xff,0x0d);
-   const __m128i shuffle33 = _mm_setr_epi8(0xff,0xff,0xff,0x0a,0xff,0xff,0xff,0x0b,0xff,0xff,0xff,0x0e,0xff,0xff,0xff,0x0f);
-
-   for (y = 0; y < TILE_SIZE; y += TILE_VECTOR_HEIGHT) {
-      __m128i *tile = (__m128i*) pdst;
-      pdst += dst_stride * TILE_VECTOR_HEIGHT;
-      for (x = 0; x < TILE_SIZE; x += TILE_VECTOR_WIDTH) {
-         uint8_t *linep = (uint8_t*) (tile++);
-         __m128i line0, line1, line2, line3;
-
-         c1 = *psrc++; /* r */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_shuffle_epi8(c0, shuffle00);
-         line1 = _mm_shuffle_epi8(c0, shuffle01);
-         line2 = _mm_shuffle_epi8(c0, shuffle02);
-         line3 = _mm_shuffle_epi8(c0, shuffle03);
-
-         c0 = *psrc++; /* g */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle10));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle11));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle12));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle13));
-
-         c1 = *psrc++; /* b */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c0, shuffle20));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c0, shuffle21));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c0, shuffle22));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c0, shuffle23));
-
-         if (psrc != end)
-                 c0 = *psrc++; /* a */
-         PIPE_READ_WRITE_BARRIER();
-         line0 = _mm_or_si128(line0, _mm_shuffle_epi8(c1, shuffle30));
-         line1 = _mm_or_si128(line1, _mm_shuffle_epi8(c1, shuffle31));
-         line2 = _mm_or_si128(line2, _mm_shuffle_epi8(c1, shuffle32));
-         line3 = _mm_or_si128(line3, _mm_shuffle_epi8(c1, shuffle33));
-
-         *(__m128i*) (linep) = line0;
-         *(__m128i*) (((char*)linep) + dst_stride) = line1;
-         *(__m128i*) (((char*)linep) + 2 * dst_stride) = line2;
-         *(__m128i*) (((char*)linep) + 3 * dst_stride) = line3;
+   const __m128i *src128 = (const __m128i *) src;
+   
+   dst += y0 * dst_stride;
+   dst += x0 * sizeof(uint32_t);
+   
+   for (y = 0; y < TILE_SIZE; y += 4) {
+      const uint8_t *dst_row = dst;
+
+      for (x = 0; x < TILE_SIZE; x += 4) {
+         unswz4( &src128[2],     /* b */
+                 &src128[1],     /* g */
+                 &src128[0],     /* r */
+                 &src128[3],     /* a */
+                 (__m128i *) (dst_row + 0 * dst_stride),
+                 (__m128i *) (dst_row + 1 * dst_stride),
+                 (__m128i *) (dst_row + 2 * dst_stride),
+                 (__m128i *) (dst_row + 3 * dst_stride));
+
+         src128 += 4;
+         dst_row += sizeof(__m128i);;
       }
+
+      dst += 4 * dst_stride;
    }
 }
 
-#endif /* PIPE_ARCH_SSSE3 */
+#endif /* PIPE_ARCH_SSE */
 '''
 
 
@@ -479,7 +448,7 @@ def generate_swizzle(formats, dst_channel, dst_native_type, dst_suffix):
             func_name = 'lp_tile_%s_swizzle_%s' % (format.short_name(), dst_suffix)
             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
                 print '#ifdef PIPE_ARCH_SSE'
-                print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+                print '      func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
                 print '#else'
                 print '      func = %s;' % (func_name,)
                 print '#endif'
@@ -517,7 +486,7 @@ def generate_unswizzle(formats, src_channel, src_native_type, src_suffix):
             func_name = 'lp_tile_%s_unswizzle_%s' % (format.short_name(), src_suffix)
             if format.name == 'PIPE_FORMAT_B8G8R8A8_UNORM':
                 print '#ifdef PIPE_ARCH_SSE'
-                print '      func = util_cpu_caps.has_ssse3 ? %s_ssse3 : %s;' % (func_name, func_name)
+                print '      func = util_cpu_caps.has_sse2 ? %s_sse2 : %s;' % (func_name, func_name)
                 print '#else'
                 print '      func = %s;' % (func_name,)
                 print '#endif'
@@ -577,7 +546,7 @@ def main():
     print '};'
     print
 
-    generate_ssse3()
+    generate_sse2()
 
     channel = Channel(UNSIGNED, True, 8)
     native_type = 'uint8_t'